In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from imblearn.over_sampling import ADASYN
from collections import Counter
from scipy.spatial.distance import cdist
import seaborn as sa
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [2]:
# Step 2: Load the Dataset
df = pd.read_csv('crx.csv')

In [3]:
# Step 3: Inspect the Dataset
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
  Unnamed: 0 Unnamed: 1  Unnamed: 2 Unnamed: 3 A1 A2 A3    A4 A5 A6  A7 A8 A9  \
0          b      30.83       0.000          u  g  w  v  1.25  t  t   1  f  g   
1          a      58.67       4.460          u  g  q  h  3.04  t  t   6  f  g   
2          a       24.5       0.500          u  g  q  h  1.50  t  f   0  f  g   
3          b      27.83       1.540          u  g  w  v  3.75  t  t   5  t  g   
4          b      20.17       5.625          u  g  w  v  1.71  t  f   0  f  s   

   A10  A11 Class  
0  202    0     +  
1   43  560     +  
2  280  824     +  
3  100    3     +  
4  120    0     +  


In [4]:
print("\nColumn Names:")
print(df.columns)


Column Names:
Index(['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'A1', 'A2',
       'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'Class'],
      dtype='object')


In [5]:
# Step 4: Identify the Target Column ("Class")
if 'Class' in df.columns:
    target_column = 'Class'
else:
    raise ValueError("Class column not found! Please check the dataset.")

In [6]:
# Step 5: Encode categorical variables
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

In [7]:
# Step 6: Split Data into Training & Testing Sets
X = df.drop(columns=['Class'])
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Step 7: Data Preprocessing (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Step 8: Check Class Imbalance
print("\nClass Distribution Before ADASYN:")
print(Counter(y_train))


Class Distribution Before ADASYN:
Counter({1: 315, 0: 237})


In [10]:
# Step 9: Apply K-Means Clustering
k = 10  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_train)

In [11]:
print("\nClusters:", clusters)


Clusters: [0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 8 0 0 2 0 0 0 8 8 0 0 0 0 0 5 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 8 0 0 8 0 8 0 0 0 0 8 0 0 0 0 0 2 8 0 0 0 0 5 8 8 3 0 0
 3 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 3 0 0 8 8 0 0 0 0 0 0 0 0 3 0 0 0
 0 0 0 0 0 0 0 0 5 0 0 5 0 0 5 0 8 0 0 0 2 8 0 0 0 0 3 0 0 0 0 0 3 0 8 8 0
 0 0 0 0 0 8 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 5 0 0 3 8 0 0 0 0 8 0 0 0 0 5 5
 0 0 0 0 0 0 0 0 0 6 0 0 0 0 5 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9
 0 0 0 0 3 8 0 0 0 0 0 0 0 0 8 0 0 8 0 0 0 0 0 0 0 0 0 0 0 8 0 8 5 2 0 8 0
 8 0 0 8 0 9 0 0 0 0 0 0 8 9 8 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 8 0 0
 0 0 0 0 8 0 0 0 0 0 7 0 0 0 8 3 0 0 8 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 8 0 0
 8 8 0 0 0 8 0 5 8 0 0 0 0 0 0 5 5 0 0 0 8 0 0 0 0 0 0 0 8 0 8 0 0 0 5 0 8
 5 8 0 5 5 0 0 0 0 0 0 8 0 8 0 0 0 0 0 0 0 3 0 8 8 0 8 0 1 0 0 0 8 0 0 8 0
 0 0 0 8 3 0 0 0 0 0 5 0 0 0 0 0 8 0 0 0 0 0 8 0 0 0 0 5 7 0 0 0 0 8 0 0 0
 3 8 8 0 0 7 0 3 0 8 0 0 0 8 0 0 8 0 0 0 0 0 0 3 0 5 0 3 0 0 0 5 5 0 0 0 0
 8 0 0 0 0 0 8

In [12]:
# Step 10: Filter Clusters with More Minority Instances
df_train_clustered = X_train.copy()
df_train_clustered['cluster'] = clusters
df_train_clustered[target_column] = y_train

In [13]:
filtered_clusters = []
for c in np.unique(clusters):
    cluster_data = df_train_clustered[df_train_clustered['cluster'] == c]
    minority_count = cluster_data[target_column].sum()
    majority_count = len(cluster_data) - minority_count
    imbalance_ratio = (majority_count + 1) / (minority_count + 1)
    if imbalance_ratio < 1.5:
        filtered_clusters.append(c)

In [14]:
print("\nFiltered Clusters:", filtered_clusters)


Filtered Clusters: [0]


In [15]:
# Step 11: Compute Sampling Weight for Each Cluster
def compute_sampling_weight(cluster_data, target_column, minority_count, X_train):
    feature_data = cluster_data.drop(columns=['cluster', target_column]).values
    
    # Compute pairwise Euclidean distances
    pairwise_distances = cdist(feature_data, feature_data, metric='euclidean')
    
    # Take the mean of the upper triangle (excluding diagonal)
    avg_minority_dist = np.mean(pairwise_distances[np.triu_indices(len(feature_data), k=1)])
    
    print(f"Cluster {cluster_data['cluster'].iloc[0]} - Avg Euclidean Distance:", avg_minority_dist)
    
    density_factor = minority_count / (avg_minority_dist ** X_train.shape[1])
    sparsity_factor = 1 / density_factor
    
    return sparsity_factor

In [16]:
sparsity_sum = sum(compute_sampling_weight(
    df_train_clustered[df_train_clustered['cluster'] == f], 
    target_column, 
    len(df_train_clustered[df_train_clustered['cluster'] == f]), 
    X_train
) for f in filtered_clusters)

Cluster 0 - Avg Euclidean Distance: 181.47762984596065


In [17]:
sampling_weights = {
    f: compute_sampling_weight(df_train_clustered[df_train_clustered['cluster'] == f], target_column, len(df_train_clustered[df_train_clustered['cluster'] == f]), X_train) / sparsity_sum
    for f in filtered_clusters
}

Cluster 0 - Avg Euclidean Distance: 181.47762984596065


In [18]:
print("\nSampling Weights:", sampling_weights)


Sampling Weights: {0: 1.0}


In [19]:
# Step 12: Apply ADASYN for Oversampling
X_resampled = pd.DataFrame()
y_resampled = pd.Series(dtype=int)

In [20]:
for cluster_id in filtered_clusters:
    cluster_data = df_train_clustered[df_train_clustered['cluster'] == cluster_id]
    X_cluster = cluster_data.drop(columns=['cluster', target_column])
    y_cluster = cluster_data[target_column]
    
    num_samples = int(len(y_train) * sampling_weights[cluster_id])
    
    if len(y_cluster) < 2:  # Skip clusters with fewer than 2 samples
        continue
    
    try:
        adasyn = ADASYN(n_neighbors=min(len(y_cluster) - 1, 3), random_state=42)
        X_cluster_resampled, y_cluster_resampled = adasyn.fit_resample(X_cluster, y_cluster)
        X_resampled = pd.concat([X_resampled, X_cluster_resampled])
        y_resampled = pd.concat([y_resampled, y_cluster_resampled])
    except ValueError:
        print(f"Skipping cluster {cluster_id} due to ADASYN resampling issues.")

In [21]:
# Step 13: Check Class Distribution After ADASYN
print("\nClass Distribution After ADASYN:")
print(Counter(y_resampled))


Class Distribution After ADASYN:
Counter({0: 300, 1: 286})


In [22]:
# Step 14: Train a Classification Model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)

RandomForestClassifier(random_state=42)

In [23]:
# Step 15: Make Predictions
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

In [24]:
# Step 16: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

In [25]:
print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")


Accuracy: 0.8188
ROC AUC Score: 0.8925


In [26]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=42)
clf.fit(X_resampled, y_resampled)

# Step 15: Make Predictions
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Step 16: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")


Accuracy: 0.7899
ROC AUC Score: 0.8284


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score, precision_recall_curve, auc
)
from imblearn.metrics import geometric_mean_score

# Step 13: Train a Classification Model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)  # Train on resampled data

# Step 14: Make Predictions
y_pred = clf.predict(X_test)  # Predicted labels
y_proba = clf.predict_proba(X_test)[:, 1]  # Probability of positive class

# Step 15: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

# Compute Metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
g_mean = geometric_mean_score(y_test, y_pred)  # Geometric Mean
f1 = f1_score(y_test, y_pred)  # F1-score

# Compute Precision-Recall Curve and PR AUC
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)  # Area Under Precision-Recall Curve

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Geometric Mean: {g_mean:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"PR AUC Score: {pr_auc:.4f}")


Accuracy: 0.8188
ROC AUC Score: 0.8925
Geometric Mean: 0.8189
F1 Score: 0.8201
PR AUC Score: 0.8468
