In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neighbors import KernelDensity

In [3]:
data = load_breast_cancer()
X, y = data.data, data.target

In [4]:
# Split and scale data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Custom ROSE implementation
def apply_ROSE(X, y, minority_class=0, random_state=None):
    X_min = X[y == minority_class]
    if len(X_min) == 0:
        return X, y
    
    # Calculate samples to generate
    n_majority = sum(y != minority_class)
    n_to_generate = n_majority - len(X_min)
    
    if n_to_generate <= 0:
        return X, y
    
    # Generate synthetic samples using KDE
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5)
    kde.fit(X_min)
    synthetic_samples = kde.sample(n_to_generate, random_state=random_state)
    
    return (
        np.vstack([X, synthetic_samples]),
        np.hstack([y, np.full(n_to_generate, minority_class)])
)

In [6]:
# Define models and resamplers
models = {
    'KNN': KNeighborsClassifier(),
    'NB': GaussianNB(),
    'DT': DecisionTreeClassifier(random_state=42)
}

resamplers = {
    'ROS': RandomOverSampler(random_state=42),
    'RUS': RandomUnderSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42),
    'ROSE': None  # Handled separately
}


In [7]:
# Evaluate models
results = []

for method in resamplers:
    if method == 'ROSE':
        X_res, y_res = apply_ROSE(X_train_scaled, y_train)
    else:
        X_res, y_res = resamplers[method].fit_resample(X_train_scaled, y_train)
    
    for model_name in models:
        model = models[model_name].fit(X_res, y_res)
        y_pred = model.predict(X_test_scaled)
        y_proba = model.predict_proba(X_test_scaled)[:, 0]
        
        # Calculate metrics
        precision = precision_score(y_test, y_pred, pos_label=0)
        recall = recall_score(y_test, y_pred, pos_label=0)
        auc = roc_auc_score(y_test, y_proba)
        
        # Calculate G-Mean
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        specificity = tn / (tn + fp)
        gmean = np.sqrt(recall * specificity)
        
        results.append({
            'Method': method,
            'Model': model_name,
            'Precision': round(precision, 2),
            'Recall': round(recall, 2),
            'G-Mean': round(gmean, 2),
            'AUC': round(auc, 2)
        })

In [9]:
 
results_df = pd.DataFrame(results)
print(results_df.sort_values(by='AUC', ascending=False))


   Method Model  Precision  Recall  G-Mean   AUC
2     ROS    DT       0.90    0.86    0.86  0.10
11   ROSE    DT       0.82    0.95    0.95  0.09
8   SMOTE    DT       0.91    0.93    0.93  0.06
5     RUS    DT       0.95    0.93    0.93  0.05
0     ROS   KNN       0.89    0.95    0.95  0.02
6   SMOTE   KNN       0.87    0.93    0.93  0.02
9    ROSE   KNN       0.93    0.93    0.93  0.02
1     ROS    NB       0.90    0.90    0.90  0.01
3     RUS   KNN       0.89    0.98    0.98  0.01
4     RUS    NB       0.90    0.90    0.90  0.01
7   SMOTE    NB       0.93    0.90    0.90  0.01
10   ROSE    NB       0.90    0.90    0.90  0.01
