In [1]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE
from collections import Counter
import seaborn as sa
# Final step: Model training & evaluation can be done on X_resampled, y_resampled

In [2]:
# Step 2: Load the Dataset
file_path = "breast-cancer.csv"  # Adjust file path if needed
df = pd.read_csv(file_path)

In [3]:
# Step 3: Inspect the Dataset
print("First 5 rows of the dataset:")
print(df.head())  # Display first few rows

First 5 rows of the dataset:
                  Class    Age Menopause Tumor_Size Inv_Nodes Node_Caps  \
0  no-recurrence-events  30-39   premeno      30-34       0-2        no   
1  no-recurrence-events  40-49   premeno      20-24       0-2        no   
2  no-recurrence-events  40-49   premeno      20-24       0-2        no   
3  no-recurrence-events  60-69      ge40      15-19       0-2        no   
4  no-recurrence-events  40-49   premeno        0-4       0-2        no   

   Deg_Malig Breast Breast_Quad Irradiat  
0          3   left    left_low       no  
1          2  right    right_up       no  
2          2   left    left_low       no  
3          2  right     left_up       no  
4          2  right   right_low       no  


In [4]:
print("\nColumn Names:")
print(df.columns)  # Check column names


Column Names:
Index(['Class', 'Age', 'Menopause', 'Tumor_Size', 'Inv_Nodes', 'Node_Caps',
       'Deg_Malig', 'Breast', 'Breast_Quad', 'Irradiat'],
      dtype='object')


In [5]:
# Step 4: Identify the Target Column ("Class")
if 'Class' in df.columns:
    target_column = 'Class'
else:
    raise ValueError("Class column not found! Please check the dataset.")

In [6]:
# Step 1: Encode categorical variables
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

In [7]:
# Step 6: Split Data into Training & Testing Sets
X = df.drop(columns=['Class'])
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X)
print(y)

     Age  Menopause  Tumor_Size  Inv_Nodes  Node_Caps  Deg_Malig  Breast  \
0      1          2           5          0          1          3       0   
1      2          2           3          0          1          2       1   
2      2          2           3          0          1          2       0   
3      4          0           2          0          1          2       1   
4      2          2           0          0          1          2       1   
..   ...        ...         ...        ...        ...        ...     ...   
281    1          2           5          0          1          2       0   
282    1          2           3          0          1          3       0   
283    4          0           3          0          1          1       1   
284    2          0           5          1          1          3       0   
285    3          0           5          1          1          3       0   

     Breast_Quad  Irradiat  
0              2         0  
1              5         0  


In [8]:
print(X_train)

     Age  Menopause  Tumor_Size  Inv_Nodes  Node_Caps  Deg_Malig  Breast  \
139    2          2           5          0          1          2       0   
25     2          2           4          0          1          2       0   
82     4          0           2          0          1          2       1   
144    4          0           8          2          2          3       0   
66     2          2           4          0          1          1       1   
..   ...        ...         ...        ...        ...        ...     ...   
188    2          2          10          0          1          2       1   
71     2          2           4          0          1          2       1   
106    2          2           3          0          1          2       0   
270    3          0           5          2          2          3       0   
102    3          2           4          0          1          2       1   

     Breast_Quad  Irradiat  
139            4         0  
25             3         0  


In [9]:
print(X_test.head(5))

     Age  Menopause  Tumor_Size  Inv_Nodes  Node_Caps  Deg_Malig  Breast  \
9      2          2           3          0          1          2       1   
267    4          0           3          5          2          3       0   
143    2          2           8          0          1          2       0   
212    2          2           5          0          1          3       1   
227    3          2           5          0          1          3       1   

     Breast_Quad  Irradiat  
9              3         0  
267            2         1  
143            2         1  
212            5         0  
227            3         1  


In [10]:
print(y_train)

139    0
25     0
82     0
144    0
66     0
      ..
188    0
71     0
106    0
270    1
102    0
Name: Class, Length: 228, dtype: int32


In [11]:
print(y_test)

9      0
267    1
143    0
212    1
227    1
155    0
283    1
73     0
196    0
33     0
185    0
124    0
146    0
265    1
5      0
220    1
233    1
251    1
45     0
268    1
170    0
56     0
200    0
79     0
111    0
275    1
274    1
217    1
109    0
42     0
203    1
84     0
269    1
93     0
46     0
204    1
63     0
75     0
240    1
164    0
181    0
30     0
22     0
24     0
175    0
77     0
147    0
221    1
249    1
92     0
207    1
234    1
177    0
211    1
6      0
116    0
193    0
60     0
Name: Class, dtype: int32


In [12]:
# Step 5: Data Preprocessing (Standardization)
# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
# Step 7: Check Class Imbalance
print("\nClass Distribution Before SMOTE:")
print(Counter(y_train))  # Count of each class in training set


Class Distribution Before SMOTE:
Counter({0: 164, 1: 64})


In [14]:
# Step 8: Apply K-Means Clustering
k = 10  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_train)

In [15]:
print("\n Clusters:",clusters)


 Clusters: [7 9 4 8 7 0 1 0 9 6 0 6 0 4 0 0 8 4 3 0 5 4 2 2 6 4 0 6 3 6 8 1 7 9 3 4 4
 4 7 0 6 0 0 4 9 3 9 7 9 8 7 6 4 9 3 8 0 3 3 4 0 9 1 6 0 6 8 2 3 5 5 0 6 7
 4 2 8 8 4 3 3 3 6 6 6 3 9 2 3 4 4 7 7 4 7 6 8 6 2 0 8 0 9 9 9 4 9 5 7 4 3
 5 0 2 4 6 0 3 6 3 0 6 3 9 3 7 1 2 4 0 9 5 9 3 8 7 3 2 6 4 6 9 4 2 5 9 3 9
 9 3 9 9 9 3 5 5 0 9 3 7 5 6 4 9 4 1 7 8 7 0 4 3 0 3 1 4 0 4 6 6 4 4 5 0 2
 3 5 7 4 4 2 9 0 4 8 6 6 8 1 4 1 4 9 4 4 6 6 7 9 2 4 9 9 0 6 5 6 9 4 6 4 6
 6 3 9 0 8 7]


In [16]:
# Step 9: Filter Clusters with More Minority Instances
df_train_clustered = X_train.copy()
df_train_clustered['cluster'] = clusters
print(df_train_clustered)
df_train_clustered[target_column] = y_train

     Age  Menopause  Tumor_Size  Inv_Nodes  Node_Caps  Deg_Malig  Breast  \
139    2          2           5          0          1          2       0   
25     2          2           4          0          1          2       0   
82     4          0           2          0          1          2       1   
144    4          0           8          2          2          3       0   
66     2          2           4          0          1          1       1   
..   ...        ...         ...        ...        ...        ...     ...   
188    2          2          10          0          1          2       1   
71     2          2           4          0          1          2       1   
106    2          2           3          0          1          2       0   
270    3          0           5          2          2          3       0   
102    3          2           4          0          1          2       1   

     Breast_Quad  Irradiat  cluster  
139            4         0        7  
25         

In [17]:
filtered_clusters = []
for c in np.unique(clusters):
    cluster_data = df_train_clustered[df_train_clustered['cluster'] == c]
    print(c)
    print(cluster_data)
    minority_count = cluster_data[target_column].sum() # Count of minority class
    print(minority_count)
    majority_count = len(cluster_data) - minority_count
    print(majority_count)
    imbalance_ratio = (majority_count + 1) / (minority_count + 1)  # Avoid division by zero
    if imbalance_ratio < 1.5:  # Imbalance ratio threshold
        filtered_clusters.append(c)


0
     Age  Menopause  Tumor_Size  Inv_Nodes  Node_Caps  Deg_Malig  Breast  \
246    1          2           2          2          2          3       0   
90     1          2           0          0          1          2       1   
10     2          2           0          0          1          3       0   
165    2          2           3          1          1          2       1   
16     2          2           2          0          1          2       0   
154    1          1           2          0          1          3       1   
112    2          2           3          0          1          2       1   
219    1          2           2          0          1          1       1   
272    2          2           2          0          2          3       1   
148    1          2           3          0          1          3       0   
31     3          2           2          0          1          2       1   
2      2          2           3          0          1          2       0   
179    2  

In [18]:
print("\nFiltered Clusters:", filtered_clusters)


Filtered Clusters: [5, 8]


In [19]:
import numpy as np
from scipy.spatial.distance import cdist

def compute_sampling_weight(cluster_data, target_column, minority_count, X_train):
    # Extract only feature columns (excluding 'cluster' and target column)
    feature_data = cluster_data.drop(columns=['cluster', target_column]).values

    # Compute pairwise Euclidean distances for all points in the cluster
    pairwise_distances = cdist(feature_data, feature_data, metric='euclidean')

    # Take the mean of the lower triangle of the distance matrix (excluding diagonal)
    avg_minority_dist = np.mean(pairwise_distances[np.triu_indices(len(feature_data), k=1)])

    print(avg_minority_dist)

    # Compute density and sparsity factors
    density_factor = minority_count / (avg_minority_dist ** X_train.shape[1])
    sparsity_factor = 1 / density_factor

    return sparsity_factor

In [20]:
# Compute the total sparsity sum
sparsity_sum = sum(compute_sampling_weight(df_train_clustered[df_train_clustered['cluster'] == f], target_column, len(df_train_clustered[df_train_clustered['cluster'] == f]), X_train) for f in filtered_clusters)

3.5470667989844005
2.851448313223268


In [21]:
# Compute sampling weights
sampling_weights = {
    f: compute_sampling_weight(df_train_clustered[df_train_clustered['cluster'] == f], target_column, len(df_train_clustered[df_train_clustered['cluster'] == f]), X_train) / sparsity_sum
    for f in filtered_clusters
}

print("\nSampling Weights:", sampling_weights)


3.5470667989844005
2.851448313223268

Sampling Weights: {5: 0.8916538216995122, 8: 0.10834617830048776}


In [22]:
# Step 11: Apply SMOTE for Oversampling
X_resampled = pd.DataFrame()
y_resampled = pd.Series(dtype=int)

In [23]:
for cluster_id in filtered_clusters:
    cluster_data = df_train_clustered[df_train_clustered['cluster'] == cluster_id]
    X_cluster = cluster_data.drop(columns=['cluster', target_column])
    y_cluster = cluster_data[target_column]
    
    num_samples = int(len(y_train) * sampling_weights[cluster_id])  # Compute number of samples to generate
    
    smote = SMOTE(k_neighbors=min(3, len(y_cluster) - 1), random_state=42)
    X_cluster_resampled, y_cluster_resampled = smote.fit_resample(X_cluster, y_cluster)
    
    X_resampled = pd.concat([X_resampled, X_cluster_resampled])
    y_resampled = pd.concat([y_resampled, y_cluster_resampled])

In [24]:
# Step 12: Check Class Distribution After SMOTE
print("\nClass Distribution After SMOTE:")
print(Counter(y_resampled))


Class Distribution After SMOTE:
Counter({1: 16, 0: 16})


In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Step 13: Train a Classification Model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)  # Train on resampled data

# Step 14: Make Predictions
y_pred = clf.predict(X_test)  # Predicted labels
y_proba = clf.predict_proba(X_test)[:, 1]  # Probability of positive class

# Step 15: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")


Accuracy: 0.7241
ROC AUC Score: 0.6834


In [26]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=42)
clf.fit(X_resampled, y_resampled)

# Step 15: Make Predictions
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Step 16: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")


Accuracy: 0.7414
ROC AUC Score: 0.7362


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score, precision_recall_curve, auc
)
from imblearn.metrics import geometric_mean_score

# Step 13: Train a Classification Model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)  # Train on resampled data

# Step 14: Make Predictions
y_pred = clf.predict(X_test)  # Predicted labels
y_proba = clf.predict_proba(X_test)[:, 1]  # Probability of positive class

# Step 15: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

# Compute Metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
g_mean = geometric_mean_score(y_test, y_pred)  # Geometric Mean
f1 = f1_score(y_test, y_pred)  # F1-score

# Compute Precision-Recall Curve and PR AUC
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)  # Area Under Precision-Recall Curve

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Geometric Mean: {g_mean:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"PR AUC Score: {pr_auc:.4f}")


Accuracy: 0.7241
ROC AUC Score: 0.6834
Geometric Mean: 0.6807
F1 Score: 0.6000
PR AUC Score: 0.5983
