In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from imblearn.over_sampling import ADASYN
from collections import Counter
from scipy.spatial.distance import cdist
import seaborn as sa
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [2]:
# Step 2: Load the Dataset
file_path = "breast-cancer.csv"  # Adjust file path if needed
df = pd.read_csv(file_path)

In [3]:
# Step 3: Inspect the Dataset
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
                  Class    Age Menopause Tumor_Size Inv_Nodes Node_Caps  \
0  no-recurrence-events  30-39   premeno      30-34       0-2        no   
1  no-recurrence-events  40-49   premeno      20-24       0-2        no   
2  no-recurrence-events  40-49   premeno      20-24       0-2        no   
3  no-recurrence-events  60-69      ge40      15-19       0-2        no   
4  no-recurrence-events  40-49   premeno        0-4       0-2        no   

   Deg_Malig Breast Breast_Quad Irradiat  
0          3   left    left_low       no  
1          2  right    right_up       no  
2          2   left    left_low       no  
3          2  right     left_up       no  
4          2  right   right_low       no  


In [4]:
print("\nColumn Names:")
print(df.columns)


Column Names:
Index(['Class', 'Age', 'Menopause', 'Tumor_Size', 'Inv_Nodes', 'Node_Caps',
       'Deg_Malig', 'Breast', 'Breast_Quad', 'Irradiat'],
      dtype='object')


In [5]:
# Step 4: Identify the Target Column ("Class")
if 'Class' in df.columns:
    target_column = 'Class'
else:
    raise ValueError("Class column not found! Please check the dataset.")

In [6]:
# Step 5: Encode categorical variables
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

In [7]:
# Step 6: Split Data into Training & Testing Sets
X = df.drop(columns=['Class'])
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Step 7: Data Preprocessing (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Step 8: Check Class Imbalance
print("\nClass Distribution Before ADASYN:")
print(Counter(y_train))


Class Distribution Before ADASYN:
Counter({0: 164, 1: 64})


In [10]:
# Step 9: Apply K-Means Clustering
k = 10  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_train)

In [11]:
print("\nClusters:", clusters)


Clusters: [7 9 4 8 7 0 1 0 9 6 0 6 0 4 0 0 8 4 3 0 5 4 2 2 6 4 0 6 3 6 8 1 7 9 3 4 4
 4 7 0 6 0 0 4 9 3 9 7 9 8 7 6 4 9 3 8 0 3 3 4 0 9 1 6 0 6 8 2 3 5 5 0 6 7
 4 2 8 8 4 3 3 3 6 6 6 3 9 2 3 4 4 7 7 4 7 6 8 6 2 0 8 0 9 9 9 4 9 5 7 4 3
 5 0 2 4 6 0 3 6 3 0 6 3 9 3 7 1 2 4 0 9 5 9 3 8 7 3 2 6 4 6 9 4 2 5 9 3 9
 9 3 9 9 9 3 5 5 0 9 3 7 5 6 4 9 4 1 7 8 7 0 4 3 0 3 1 4 0 4 6 6 4 4 5 0 2
 3 5 7 4 4 2 9 0 4 8 6 6 8 1 4 1 4 9 4 4 6 6 7 9 2 4 9 9 0 6 5 6 9 4 6 4 6
 6 3 9 0 8 7]


In [12]:
# Step 10: Filter Clusters with More Minority Instances
df_train_clustered = X_train.copy()
df_train_clustered['cluster'] = clusters
df_train_clustered[target_column] = y_train

In [13]:
filtered_clusters = []
for c in np.unique(clusters):
    cluster_data = df_train_clustered[df_train_clustered['cluster'] == c]
    minority_count = cluster_data[target_column].sum()
    majority_count = len(cluster_data) - minority_count
    imbalance_ratio = (majority_count + 1) / (minority_count + 1)
    if imbalance_ratio < 1.5:
        filtered_clusters.append(c)

In [14]:
print("\nFiltered Clusters:", filtered_clusters)


Filtered Clusters: [5, 8]


In [15]:
# Step 11: Compute Sampling Weight for Each Cluster
def compute_sampling_weight(cluster_data, target_column, minority_count, X_train):
    feature_data = cluster_data.drop(columns=['cluster', target_column]).values
    
    # Compute pairwise Euclidean distances
    pairwise_distances = cdist(feature_data, feature_data, metric='euclidean')
    
    # Take the mean of the upper triangle (excluding diagonal)
    avg_minority_dist = np.mean(pairwise_distances[np.triu_indices(len(feature_data), k=1)])
    
    print(f"Cluster {cluster_data['cluster'].iloc[0]} - Avg Euclidean Distance:", avg_minority_dist)
    
    density_factor = minority_count / (avg_minority_dist ** X_train.shape[1])
    sparsity_factor = 1 / density_factor
    
    return sparsity_factor

In [16]:
sparsity_sum = sum(compute_sampling_weight(
    df_train_clustered[df_train_clustered['cluster'] == f], 
    target_column, 
    len(df_train_clustered[df_train_clustered['cluster'] == f]), 
    X_train
) for f in filtered_clusters)

Cluster 5 - Avg Euclidean Distance: 3.5470667989844005
Cluster 8 - Avg Euclidean Distance: 2.851448313223268


In [17]:
sampling_weights = {
    f: compute_sampling_weight(df_train_clustered[df_train_clustered['cluster'] == f], target_column, len(df_train_clustered[df_train_clustered['cluster'] == f]), X_train) / sparsity_sum
    for f in filtered_clusters
}

Cluster 5 - Avg Euclidean Distance: 3.5470667989844005
Cluster 8 - Avg Euclidean Distance: 2.851448313223268


In [18]:
print("\nSampling Weights:", sampling_weights)


Sampling Weights: {5: 0.8916538216995122, 8: 0.10834617830048776}


In [19]:
# Step 12: Apply ADASYN for Oversampling
X_resampled = pd.DataFrame()
y_resampled = pd.Series(dtype=int)

In [20]:
for cluster_id in filtered_clusters:
    cluster_data = df_train_clustered[df_train_clustered['cluster'] == cluster_id]
    X_cluster = cluster_data.drop(columns=['cluster', target_column])
    y_cluster = cluster_data[target_column]
    
    num_samples = int(len(y_train) * sampling_weights[cluster_id])
    
    if len(y_cluster) < 2:  # Skip clusters with fewer than 2 samples
        continue
    
    try:
        adasyn = ADASYN(n_neighbors=min(len(y_cluster) - 1, 3), random_state=42)
        X_cluster_resampled, y_cluster_resampled = adasyn.fit_resample(X_cluster, y_cluster)
        X_resampled = pd.concat([X_resampled, X_cluster_resampled])
        y_resampled = pd.concat([y_resampled, y_cluster_resampled])
    except ValueError:
        print(f"Skipping cluster {cluster_id} due to ADASYN resampling issues.")

Skipping cluster 8 due to ADASYN resampling issues.


In [21]:
# Step 13: Check Class Distribution After ADASYN
print("\nClass Distribution After ADASYN:")
print(Counter(y_resampled))


Class Distribution After ADASYN:
Counter({1: 8, 0: 8})


In [22]:
# Step 14: Train a Classification Model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)

RandomForestClassifier(random_state=42)

In [23]:
# Step 15: Make Predictions
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

In [24]:
# Step 16: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

In [25]:
print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")


Accuracy: 0.6552
ROC AUC Score: 0.7471


In [26]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=42)
clf.fit(X_resampled, y_resampled)

# Step 15: Make Predictions
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Step 16: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")


Accuracy: 0.6379
ROC AUC Score: 0.6139


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score, precision_recall_curve, auc
)
from imblearn.metrics import geometric_mean_score

# Step 13: Train a Classification Model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)  # Train on resampled data

# Step 14: Make Predictions
y_pred = clf.predict(X_test)  # Predicted labels
y_proba = clf.predict_proba(X_test)[:, 1]  # Probability of positive class

# Step 15: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

# Compute Metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
g_mean = geometric_mean_score(y_test, y_pred)  # Geometric Mean
f1 = f1_score(y_test, y_pred)  # F1-score

# Compute Precision-Recall Curve and PR AUC
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)  # Area Under Precision-Recall Curve

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Geometric Mean: {g_mean:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"PR AUC Score: {pr_auc:.4f}")


Accuracy: 0.6552
ROC AUC Score: 0.7471
Geometric Mean: 0.6467
F1 Score: 0.5652
PR AUC Score: 0.6795
