# Breast Cancer Classification with Logistic Regression & Gradient Boosting
This notebook performs the following:
- Basic EDA (mean, median, heatmap)
- Model training: Logistic Regression & GBM
- Evaluation: Accuracy, ROC AUC, KS Statistic
- Visualizations: Histogram and KS Curve

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from scipy.stats import ks_2samp

In [None]:
# Load dataset
# `data` is the scikit-learn Bunch; `df` holds one column per feature plus the
# label column `target`, built in a single expression.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
df.head()

In [None]:
# Mean and Median
# Print each statistic for every column of `df` (features and target),
# rounded to two decimals, under its own banner.
summary_stats = (
    ("=== Mean of Features ===", df.mean()),
    ("\n=== Median of Features ===", df.median()),
)
for _banner, _stat in summary_stats:
    print(_banner)
    print(_stat.round(2))

In [None]:
# Target Distribution
# Bar chart of how many samples fall in each class of `target`.
class_ax = sns.countplot(data=df, x='target')
class_ax.set_title('Target Class Distribution')
# Relabel the two bars: position 0 -> 'Malignant', position 1 -> 'Benign'.
class_ax.set_xticks([0, 1])
class_ax.set_xticklabels(['Malignant', 'Benign'])
plt.show()

In [None]:
# Mean vs Median
# Overlay the per-feature mean and median (label column excluded) so skewed
# features, where the two diverge, stand out.
mean_vals = df.mean()
median_vals = df.median()
feature_labels = mean_vals.drop('target').index

fig, ax = plt.subplots(figsize=(12, 4))
mean_vals.drop('target').plot(ax=ax, label='Mean', linestyle='--')
median_vals.drop('target').plot(ax=ax, label='Median', alpha=0.7)
ax.set_title('Mean vs Median of Features')
# pandas' line plot labels only the automatically selected tick positions, so
# most feature names would be missing. Force one labelled tick per feature.
ax.set_xticks(range(len(feature_labels)))
ax.set_xticklabels(feature_labels, rotation=90)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Correlation Heatmap
# Pairwise correlations of every column in `df` (features and target).
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
# Pin the colour scale to the full [-1, 1] correlation range so the neutral
# midpoint of the diverging 'coolwarm' map means "no correlation" instead of
# whatever value happens to lie midway between the data min and max.
sns.heatmap(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5, ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Train-test split
# Hold out 20% of the rows for evaluation. `stratify=y` keeps the class ratio
# of the test set equal to that of the full data, so the metrics computed on
# the hold-out set do not depend on how the random split happened to fall.
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
assert len(X_train) + len(X_test) == len(X), "split lost or duplicated rows"

# Feature Scaling
# Fit the scaler on the training rows only, then apply the same transform to
# the test rows, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic Regression
# Fit on the standardized training features, then score the standardized
# test features: column 1 of predict_proba is kept as the score, and
# predict() supplies the hard labels.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)
log_proba = log_reg.predict_proba(X_test_scaled)[:, 1]
log_preds = log_reg.predict(X_test_scaled)

In [None]:
# Gradient Boosting
# Trained on the unscaled features (X_train / X_test), unlike the logistic
# regression cell. A fixed random_state makes the fitted ensemble, and every
# metric derived from it, reproducible across kernel restarts.
gbm = GradientBoostingClassifier(random_state=42)
gbm.fit(X_train, y_train)
gbm_preds = gbm.predict(X_test)
# Column 1 of predict_proba is kept as the score used by the later cells.
gbm_proba = gbm.predict_proba(X_test)[:, 1]

In [None]:
# KS Function
def ks_score(y_true, y_pred_proba):
    """Return the two-sample Kolmogorov-Smirnov statistic between class scores.

    The scores of samples labelled 1 are compared against the scores of
    samples labelled 0 with ``scipy.stats.ks_2samp``; the returned value is
    that test's ``statistic`` attribute.

    Parameters
    ----------
    y_true : array-like
        Binary labels (0 / 1), one per sample.
    y_pred_proba : array-like
        Predicted score per sample, in the same order as ``y_true``.

    Returns
    -------
    float
        The KS statistic.
    """
    # Coerce to plain arrays so masking is purely positional. Without this,
    # plain lists cannot be boolean-masked, and two pandas Series with
    # different indexes fail (or misalign) on label-based alignment.
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred_proba)
    return ks_2samp(scores[labels == 1], scores[labels == 0]).statistic

In [None]:
# Model Evaluation
def _print_model_report(banner, preds, proba):
    """Print the banner, accuracy, ROC AUC, KS statistic and class report.

    All metrics are computed against the notebook-level ``y_test``.
    """
    print(banner)
    print("Accuracy:", accuracy_score(y_test, preds))
    print("ROC AUC:", roc_auc_score(y_test, proba))
    print("KS Statistic:", ks_score(y_test, proba))
    print(classification_report(y_test, preds))


_print_model_report("=== Logistic Regression ===", log_preds, log_proba)
_print_model_report("=== Gradient Boosting ===", gbm_preds, gbm_proba)

In [None]:
# Predicted Probabilities Histogram
# Density histograms (with KDE overlay) of both models' test-set scores,
# drawn on the same axes for comparison.
fig, ax = plt.subplots(figsize=(8, 4))
shared_hist_kwargs = dict(kde=True, stat='density', bins=25, ax=ax)
sns.histplot(log_proba, color='blue', label='Logistic Regression', **shared_hist_kwargs)
sns.histplot(gbm_proba, color='green', label='Gradient Boosting', alpha=0.6, **shared_hist_kwargs)
ax.set_title('Predicted Probabilities Histogram')
ax.set_xlabel('Predicted Probability')
ax.legend()
plt.show()

In [None]:
# KS Curve
def plot_ks_curve(y_true, y_proba, model_name):
    """Plot the cumulative capture curves of both classes and mark the KS gap.

    Samples are ranked by descending score. For every rank the plot shows the
    fraction of positives (label 1) and of negatives (label 0) captured so
    far, against the fraction of the population seen so far. The largest
    vertical gap between the two curves is drawn as a dashed line and
    reported in the legend.

    Parameters
    ----------
    y_true : array-like
        Binary labels (0 / 1), one per sample.
    y_proba : array-like
        Predicted score per sample, in the same order as ``y_true``.
    model_name : str
        Name appended to the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Positional arrays avoid index alignment between Series and ndarray inputs.
    curve = pd.DataFrame({
        'y': np.asarray(y_true),
        'proba': np.asarray(y_proba),
    })
    # Stable sort keeps tied scores in input order, so the curve is deterministic.
    curve = curve.sort_values(by='proba', ascending=False, kind='mergesort')

    n_rows = len(curve)
    # After the k-th ranked row, k/n of the population has been seen. The
    # cumulative class sums below already include row k, so the x axis must be
    # 1-based to line up with them.
    curve['cum_pct_total'] = np.arange(1, n_rows + 1) / n_rows
    curve['cum_pct_pos'] = curve['y'].cumsum() / curve['y'].sum()
    curve['cum_pct_neg'] = ((1 - curve['y']).cumsum()) / (1 - curve['y']).sum()
    curve['ks'] = np.abs(curve['cum_pct_pos'] - curve['cum_pct_neg'])

    # Prepend the origin so both curves start at (0, 0).
    x_vals = np.concatenate(([0.0], curve['cum_pct_total'].to_numpy()))
    pos_vals = np.concatenate(([0.0], curve['cum_pct_pos'].to_numpy(dtype=float)))
    neg_vals = np.concatenate(([0.0], curve['cum_pct_neg'].to_numpy(dtype=float)))

    plt.figure(figsize=(7, 4))
    plt.plot(x_vals, pos_vals, label='Positive CDF')
    plt.plot(x_vals, neg_vals, label='Negative CDF')

    # Mark the largest gap between the curves. Skipped when the gap is
    # undefined (empty input, or one class absent so its curve is all NaN).
    ks_values = curve['ks'].to_numpy(dtype=float)
    if ks_values.size and np.isfinite(ks_values).any():
        peak = int(np.nanargmax(ks_values))
        plt.vlines(
            curve['cum_pct_total'].iloc[peak],
            ymin=min(curve['cum_pct_pos'].iloc[peak], curve['cum_pct_neg'].iloc[peak]),
            ymax=max(curve['cum_pct_pos'].iloc[peak], curve['cum_pct_neg'].iloc[peak]),
            colors='red',
            linestyles='--',
            label=f"KS = {ks_values[peak]:.3f}",
        )

    plt.title(f"KS Curve - {model_name}")
    plt.xlabel('Cumulative % of Population')
    plt.ylabel('Cumulative % of Class')
    plt.legend()
    plt.grid(True)
    plt.show()

# Draw one KS curve per model from its test-set scores.
for _model_name, _model_proba in (
    ('Logistic Regression', log_proba),
    ('Gradient Boosting', gbm_proba),
):
    plot_ks_curve(y_test.values, _model_proba, _model_name)