In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.mixture import GaussianMixture
from imblearn.over_sampling import ADASYN
from collections import Counter
from scipy.spatial.distance import cdist
import seaborn as sa
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [2]:
# Step 2: Load the Dataset
file_path = "breast-cancer.csv"  # Adjust file path if needed
df = pd.read_csv(file_path)

In [3]:
# Step 3: Inspect the Dataset
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
                  Class    Age Menopause Tumor_Size Inv_Nodes Node_Caps  \
0  no-recurrence-events  30-39   premeno      30-34       0-2        no   
1  no-recurrence-events  40-49   premeno      20-24       0-2        no   
2  no-recurrence-events  40-49   premeno      20-24       0-2        no   
3  no-recurrence-events  60-69      ge40      15-19       0-2        no   
4  no-recurrence-events  40-49   premeno        0-4       0-2        no   

   Deg_Malig Breast Breast_Quad Irradiat  
0          3   left    left_low       no  
1          2  right    right_up       no  
2          2   left    left_low       no  
3          2  right     left_up       no  
4          2  right   right_low       no  


In [4]:
print("\nColumn Names:")


Column Names:


In [5]:
print(df.columns)

Index(['Class', 'Age', 'Menopause', 'Tumor_Size', 'Inv_Nodes', 'Node_Caps',
       'Deg_Malig', 'Breast', 'Breast_Quad', 'Irradiat'],
      dtype='object')


In [6]:
# Step 4: Identify the Target Column ("Class")
if 'Class' in df.columns:
    target_column = 'Class'
else:
    raise ValueError("Class column not found! Please check the dataset.")

In [7]:
# Step 5: Encode categorical variables
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

In [8]:
# Step 6: Split Data into Training & Testing Sets
X = df.drop(columns=['Class'])
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Step 7: Data Preprocessing (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# Step 8: Check Class Imbalance
print("\nClass Distribution Before ADASYN:")
print(Counter(y_train))


Class Distribution Before ADASYN:
Counter({0: 164, 1: 64})


In [11]:
# Step 9: Apply Gaussian Mixture Model (GMM)
k = 5  # Number of clusters
gmm = GaussianMixture(n_components=k, random_state=42)
clusters = gmm.fit_predict(X_train)

In [12]:
print("\nClusters:", clusters)


Clusters: [2 2 3 4 2 0 3 2 2 4 2 3 0 3 2 0 4 3 1 2 0 3 2 2 3 3 2 3 1 3 4 3 2 2 2 3 3
 3 2 0 3 2 2 4 2 1 0 0 2 0 0 3 3 0 3 4 2 1 1 3 2 2 4 3 2 3 4 2 1 0 0 2 3 2
 3 2 4 3 3 1 0 1 3 3 3 1 2 2 1 3 4 2 2 3 2 3 4 3 0 2 4 2 0 2 2 3 0 0 2 3 1
 0 2 0 3 3 2 3 3 2 2 3 2 2 3 2 3 2 3 0 0 0 2 1 4 2 3 2 3 3 3 2 3 2 0 2 1 2
 2 0 2 2 2 2 0 0 2 2 2 0 0 3 3 2 3 3 2 4 2 2 4 3 2 1 3 3 2 3 3 3 3 3 0 2 2
 2 0 3 3 3 2 2 0 3 4 3 3 3 3 3 3 4 2 3 3 3 3 2 2 2 3 2 2 2 0 0 3 2 3 3 3 3
 3 1 2 2 4 2]


In [13]:
# Step 10: Filter Clusters with More Minority Instances
df_train_clustered = X_train.copy()
df_train_clustered['cluster'] = clusters
df_train_clustered[target_column] = y_train

In [14]:
filtered_clusters = []
for c in np.unique(clusters):
    cluster_data = df_train_clustered[df_train_clustered['cluster'] == c]
    minority_count = cluster_data[target_column].sum()
    majority_count = len(cluster_data) - minority_count
    imbalance_ratio = (majority_count + 1) / (minority_count + 1)
    if imbalance_ratio < 1.5:
        filtered_clusters.append(c)

In [15]:
print("\nFiltered Clusters:", filtered_clusters)


Filtered Clusters: [0, 4]


In [16]:
# Step 11: Compute Sampling Weight for Each Cluster
def compute_sampling_weight(cluster_data, target_column, minority_count, X_train):
    feature_data = cluster_data.drop(columns=['cluster', target_column]).values
    
    # Compute pairwise Euclidean distances
    pairwise_distances = cdist(feature_data, feature_data, metric='euclidean')
    
    # Take the mean of the upper triangle (excluding diagonal)
    avg_minority_dist = np.mean(pairwise_distances[np.triu_indices(len(feature_data), k=1)])
    
    print(f"Cluster {cluster_data['cluster'].iloc[0]} - Avg Euclidean Distance:", avg_minority_dist)
    
    density_factor = minority_count / (avg_minority_dist ** X_train.shape[1])
    sparsity_factor = 1 / density_factor
    
    return sparsity_factor

In [17]:
sparsity_sum = sum(compute_sampling_weight(
    df_train_clustered[df_train_clustered['cluster'] == f], 
    target_column, 
    len(df_train_clustered[df_train_clustered['cluster'] == f]), 
    X_train
) for f in filtered_clusters)

Cluster 0 - Avg Euclidean Distance: 4.125837518638979
Cluster 4 - Avg Euclidean Distance: 3.5113134343470382


In [18]:
sampling_weights = {
    f: compute_sampling_weight(df_train_clustered[df_train_clustered['cluster'] == f], target_column, len(df_train_clustered[df_train_clustered['cluster'] == f]), X_train) / sparsity_sum
    for f in filtered_clusters
}

Cluster 0 - Avg Euclidean Distance: 4.125837518638979
Cluster 4 - Avg Euclidean Distance: 3.5113134343470382


In [19]:
print("\nSampling Weights:", sampling_weights)


Sampling Weights: {0: 0.6995958370205877, 4: 0.30040416297941225}


In [20]:
# Step 12: Apply ADASYN for Oversampling
X_resampled = pd.DataFrame()
y_resampled = pd.Series(dtype=int)

In [21]:
for cluster_id in filtered_clusters:
    cluster_data = df_train_clustered[df_train_clustered['cluster'] == cluster_id]
    X_cluster = cluster_data.drop(columns=['cluster', target_column])
    y_cluster = cluster_data[target_column]
    
    if len(y_cluster) < 2:  # Skip clusters with fewer than 2 samples
        continue
    
    num_samples = int(len(y_train) * sampling_weights[cluster_id])
    
    adasyn = ADASYN(n_neighbors=min(len(y_cluster) - 1, 4), random_state=42)
    X_cluster_resampled, y_cluster_resampled = adasyn.fit_resample(X_cluster, y_cluster)
    
    X_resampled = pd.concat([X_resampled, X_cluster_resampled])
    y_resampled = pd.concat([y_resampled, y_cluster_resampled])

In [22]:
# Step 13: Check Class Distribution After ADASYN
print("\nClass Distribution After ADASYN:")
print(Counter(y_resampled))


Class Distribution After ADASYN:
Counter({0: 28, 1: 24})


In [23]:
# Step 14: Train a Classification Model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)

RandomForestClassifier(random_state=42)

In [24]:
# Step 15: Make Predictions
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

In [25]:
# Step 16: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

In [26]:
print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")


Accuracy: 0.7069
ROC AUC Score: 0.6088


In [27]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=42)
clf.fit(X_resampled, y_resampled)

# Step 15: Make Predictions
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Step 16: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")


Accuracy: 0.7414
ROC AUC Score: 0.7104


In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score, precision_recall_curve, auc
)
from imblearn.metrics import geometric_mean_score

# Step 13: Train a Classification Model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)  # Train on resampled data

# Step 14: Make Predictions
y_pred = clf.predict(X_test)  # Predicted labels
y_proba = clf.predict_proba(X_test)[:, 1]  # Probability of positive class

# Step 15: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

# Compute Metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
g_mean = geometric_mean_score(y_test, y_pred)  # Geometric Mean
f1 = f1_score(y_test, y_pred)  # F1-score

# Compute Precision-Recall Curve and PR AUC
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)  # Area Under Precision-Recall Curve

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Geometric Mean: {g_mean:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"PR AUC Score: {pr_auc:.4f}")


Accuracy: 0.7069
ROC AUC Score: 0.6088
Geometric Mean: 0.4813
F1 Score: 0.3704
PR AUC Score: 0.5675
