In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.datasets import load_breast_cancer

# --- 1. Setup: Load Dataset ---
# Fetch the dataset as pandas objects so the label distribution can be
# inspected with value_counts() before any modelling happens.
print("1. Loading the Breast Cancer Wisconsin dataset...")
data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target

# Report how many samples each class has. Class 1 (benign) outnumbers
# class 0 (malignant), so class 0 is the minority class in later sections.
print("\nChecking for class imbalance:", y.value_counts(), sep="\n")

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
print("\nDataset split successfully.")

# --- 2. Preprocessing: Scale the features ---
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to both partitions, so nothing from the test set
# influences the transformation.
print("\n2. Scaling features...")
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Features scaled successfully.")

# Shared helper for sections 3 and 4: every experiment below fits a set of
# estimators on the same training data and prints the same test-set metrics.
def evaluate_models(models, X_train, y_train, X_test, y_test):
    """Fit each estimator on the training split and print test-set metrics.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator (or pipeline) exposing
        ``fit`` and ``predict``. Each estimator is fitted in place.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used for ``predict`` and for scoring.

    Returns
    -------
    None
        Results are printed: a header with the display name, the accuracy to
        four decimals, and the full classification report.
    """
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"\n--- {name} Results ---")
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        # Embed the report in the f-string: passing it as a second print()
        # argument inserts a separator space after the newline, which shifts
        # the report's header row one column out of alignment.
        print(f"Classification Report:\n{classification_report(y_test, y_pred)}")


# --- 3. Modeling and Evaluation (Without Imbalance Handling) ---
print("\n3. Training models without imbalance handling...")
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM (C-Support Vector Classification)": SVC(random_state=42),
    "k-NN (k-Nearest Neighbors)": KNeighborsClassifier()
}
evaluate_models(models, X_train_scaled, y_train, X_test_scaled, y_test)

# --- 4. Addressing Class Imbalance ---
print("\n--- 4. Addressing Class Imbalance ---")

# Method 1: Use class_weight='balanced' for the models that accept it
print("\nUsing class_weight='balanced' parameter:")
balanced_models = {
    "Logistic Regression (Balanced)": LogisticRegression(class_weight='balanced', random_state=42),
    "Random Forest (Balanced)": RandomForestClassifier(class_weight='balanced', random_state=42),
    "SVM (Balanced)": SVC(class_weight='balanced', random_state=42)
}
evaluate_models(balanced_models, X_train_scaled, y_train, X_test_scaled, y_test)

# Method 2: Use SMOTE (Synthetic Minority Oversampling Technique) for resampling
# Requires installing the imbalanced-learn library: `pip install imbalanced-learn`
# Only the imports are guarded: an ImportError raised later, while fitting,
# must surface as a real error instead of being reported as a missing package.
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
except ImportError:
    print("\nSkipping SMOTE implementation. Please install `imblearn` library to run this part.")
    print("Run: pip install imbalanced-learn")
else:
    print("\nUsing SMOTE (Oversampling) with models:")

    # Each pipeline oversamples first and then trains its classifier. Every
    # pipeline gets its own SMOTE instance and its own classifier instance.
    smote_models = {
        f"{label} (SMOTE)": ImbPipeline([('smote', SMOTE(random_state=42)), ('classifier', classifier)])
        for label, classifier in [
            ("Logistic Regression", LogisticRegression(random_state=42)),
            ("Decision Tree", DecisionTreeClassifier(random_state=42)),
            ("Random Forest", RandomForestClassifier(random_state=42)),
        ]
    }
    evaluate_models(smote_models, X_train_scaled, y_train, X_test_scaled, y_test)

print("\n--- Process Complete ---")

# --- 5. Analysis and Next Steps ---
# Compare the results, especially the precision, recall, and f1-score for the minority class (class 0).
# The class imbalance handling techniques should generally improve the recall for the minority class.
# For example, look at the Logistic Regression results with and without 'class_weight'.


1. Loading the Breast Cancer Wisconsin dataset...

Checking for class imbalance:
target
1    357
0    212
Name: count, dtype: int64

Dataset split successfully.

2. Scaling features...
Features scaled successfully.

3. Training models without imbalance handling...

--- Logistic Regression Results ---
Accuracy: 0.9883
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        64
           1       0.99      0.99      0.99       107

    accuracy                           0.99       171
   macro avg       0.99      0.99      0.99       171
weighted avg       0.99      0.99      0.99       171


--- Decision Tree Results ---
Accuracy: 0.9181
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.89      0.89        64
           1       0.93      0.93      0.93       107

    accuracy                           0.92       171
   macro avg       0.91      0.91      0