In [1]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE
from collections import Counter
import seaborn as sa
# Final step: Model training & evaluation can be done on X_resampled, y_resampled

In [2]:
# Step 2: Load the Dataset
df = pd.read_csv('crx.csv')

In [3]:
# Step 3: Inspect the Dataset
print("First 5 rows of the dataset:")
print(df.head())  # Display first few rows

First 5 rows of the dataset:
  Unnamed: 0 Unnamed: 1  Unnamed: 2 Unnamed: 3 A1 A2 A3    A4 A5 A6  A7 A8 A9  \
0          b      30.83       0.000          u  g  w  v  1.25  t  t   1  f  g   
1          a      58.67       4.460          u  g  q  h  3.04  t  t   6  f  g   
2          a       24.5       0.500          u  g  q  h  1.50  t  f   0  f  g   
3          b      27.83       1.540          u  g  w  v  3.75  t  t   5  t  g   
4          b      20.17       5.625          u  g  w  v  1.71  t  f   0  f  s   

   A10  A11 Class  
0  202    0     +  
1   43  560     +  
2  280  824     +  
3  100    3     +  
4  120    0     +  


In [4]:
print("\nColumn Names:")
print(df.columns)  # Check column names


Column Names:
Index(['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'A1', 'A2',
       'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'Class'],
      dtype='object')


In [5]:
# Step 4: Identify the Target Column ("Class")
if 'Class' in df.columns:
    target_column = 'Class'
else:
    raise ValueError("Class column not found! Please check the dataset.")

In [6]:
# Step 1: Encode categorical variables
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

In [7]:
# Step 6: Split Data into Training & Testing Sets
X = df.drop(columns=['Class'])
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X)
print(y)

     Unnamed: 0  Unnamed: 1  Unnamed: 2  Unnamed: 3  A1  A2  A3    A4  A5  A6  \
0             2         156       0.000           2   1  13   8  1.25   1   1   
1             1         328       4.460           2   1  11   4  3.04   1   1   
2             1          89       0.500           2   1  11   4  1.50   1   0   
3             2         125       1.540           2   1  13   8  3.75   1   1   
4             2          43       5.625           2   1  13   8  1.71   1   0   
..          ...         ...         ...         ...  ..  ..  ..   ...  ..  ..   
685           2          52      10.085           3   3   5   4  1.25   0   0   
686           1          71       0.750           2   1   2   8  2.00   0   1   
687           1          97      13.500           3   3   6   3  2.00   0   1   
688           2          20       0.205           2   1   1   8  0.04   0   0   
689           2         197       3.375           2   1   2   4  8.29   0   0   

     A7  A8  A9  A10  A11  

In [8]:
print(X_train)

     Unnamed: 0  Unnamed: 1  Unnamed: 2  Unnamed: 3  A1  A2  A3      A4  A5  \
278           2          90      13.500           3   3   6   3   0.000   0   
110           2         137       3.500           2   1  13   8   3.500   1   
82            2         235       0.500           2   1  10   8   0.250   1   
51            2         106       1.000           2   1  11   8   1.750   1   
218           2         308       9.625           2   1   5   8   8.665   1   
..          ...         ...         ...         ...  ..  ..  ..     ...  ..   
71            2         195       4.000           2   1   4   1  12.500   1   
106           2         135       1.165           2   1   9   8   0.500   1   
270           2         217       0.000           0   0   0   0   0.000   0   
435           2          33       0.000           3   3   6   3   0.000   0   
102           2          29       5.000           2   1  11   8   0.375   1   

     A6  A7  A8  A9  A10  A11  
278   0   0   0   0

In [9]:
print(X_test.head(5))

     Unnamed: 0  Unnamed: 1  Unnamed: 2  Unnamed: 3  A1  A2  A3   A4  A5  A6  \
286           1         349         1.5           2   1   6   3  0.0   0   1   
511           1         271         4.0           2   1   8   5  0.0   1   0   
257           2          41         0.0           2   1   4   8  0.5   0   0   
336           2         277         6.5           2   1   2   8  1.0   0   0   
318           2          34         0.0           3   3  10   1  0.0   0   0   

     A7  A8  A9  A10  A11  
286   2   1   0   40  105  
511   0   0   0    1  960  
257   0   0   0   17    0  
336   0   1   0    0  228  
318   0   1   2  136    1  


In [10]:
print(y_train)

278    1
110    1
82     1
51     0
218    0
      ..
71     1
106    1
270    0
435    1
102    1
Name: Class, Length: 552, dtype: int32


In [11]:
print(y_test)

286    1
511    0
257    1
336    1
318    0
      ..
333    1
507    0
24     0
158    0
518    0
Name: Class, Length: 138, dtype: int32


In [12]:
# Step 5: Data Preprocessing (Standardization)
# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
# Step 7: Check Class Imbalance
print("\nClass Distribution Before SMOTE:")
print(Counter(y_train))  # Count of each class in training set


Class Distribution Before SMOTE:
Counter({1: 315, 0: 237})


In [14]:
# Step 8: Apply K-Means Clustering
k = 5  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_train)

In [15]:
print("\n Clusters:",clusters)


 Clusters: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 3 0 0 3 0 0
 3 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0
 0 0 0 0 0 0 0 0 3 0 0 0 0 0 3 0 0 0 0 0 2 0 0 0 0 0 3 0 0 0 0 0 3 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 2 0 0 0 0 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2
 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 2 0 0 0
 0 0 0 0 0 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 3 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 2 0 0 0 0 0 0 0 0
 3 0 0 0 0 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 3 0 3 0 0 0 0 3 0 0 0 0
 0 0 0 0 0 0 

In [16]:
# Step 9: Filter Clusters with More Minority Instances
df_train_clustered = X_train.copy()
df_train_clustered['cluster'] = clusters
print(df_train_clustered)
df_train_clustered[target_column] = y_train

     Unnamed: 0  Unnamed: 1  Unnamed: 2  Unnamed: 3  A1  A2  A3      A4  A5  \
278           2          90      13.500           3   3   6   3   0.000   0   
110           2         137       3.500           2   1  13   8   3.500   1   
82            2         235       0.500           2   1  10   8   0.250   1   
51            2         106       1.000           2   1  11   8   1.750   1   
218           2         308       9.625           2   1   5   8   8.665   1   
..          ...         ...         ...         ...  ..  ..  ..     ...  ..   
71            2         195       4.000           2   1   4   1  12.500   1   
106           2         135       1.165           2   1   9   8   0.500   1   
270           2         217       0.000           0   0   0   0   0.000   0   
435           2          33       0.000           3   3   6   3   0.000   0   
102           2          29       5.000           2   1  11   8   0.375   1   

     A6  A7  A8  A9  A10  A11  cluster  
278   0   

In [17]:
filtered_clusters = []
for c in np.unique(clusters):
    cluster_data = df_train_clustered[df_train_clustered['cluster'] == c]
    print(c)
    print(cluster_data)
    minority_count = cluster_data[target_column].sum() # Count of minority class
    print(minority_count)
    majority_count = len(cluster_data) - minority_count
    print(majority_count)
    imbalance_ratio = (majority_count + 1) / (minority_count + 1)  # Avoid division by zero
    if imbalance_ratio < 1.5:  # Imbalance ratio threshold
        filtered_clusters.append(c)


0
     Unnamed: 0  Unnamed: 1  Unnamed: 2  Unnamed: 3  A1  A2  A3      A4  A5  \
278           2          90      13.500           3   3   6   3   0.000   0   
110           2         137       3.500           2   1  13   8   3.500   1   
82            2         235       0.500           2   1  10   8   0.250   1   
51            2         106       1.000           2   1  11   8   1.750   1   
218           2         308       9.625           2   1   5   8   8.665   1   
..          ...         ...         ...         ...  ..  ..  ..     ...  ..   
71            2         195       4.000           2   1   4   1  12.500   1   
106           2         135       1.165           2   1   9   8   0.500   1   
270           2         217       0.000           0   0   0   0   0.000   0   
435           2          33       0.000           3   3   6   3   0.000   0   
102           2          29       5.000           2   1  11   8   0.375   1   

     A6  A7  A8  A9  A10  A11  cluster  Class  
2

In [18]:
print("\nFiltered Clusters:", filtered_clusters)


Filtered Clusters: [0]


In [19]:
import numpy as np
from scipy.spatial.distance import cdist

def compute_sampling_weight(cluster_data, target_column, minority_count, X_train):
    # Extract only feature columns (excluding 'cluster' and target column)
    feature_data = cluster_data.drop(columns=['cluster', target_column]).values

    # Compute pairwise Euclidean distances for all points in the cluster
    pairwise_distances = cdist(feature_data, feature_data, metric='euclidean')

    # Take the mean of the lower triangle of the distance matrix (excluding diagonal)
    avg_minority_dist = np.mean(pairwise_distances[np.triu_indices(len(feature_data), k=1)])

    print(avg_minority_dist)

    # Compute density and sparsity factors
    density_factor = minority_count / (avg_minority_dist ** X_train.shape[1])
    sparsity_factor = 1 / density_factor

    return sparsity_factor

In [20]:
# Compute the total sparsity sum
sparsity_sum = sum(compute_sampling_weight(df_train_clustered[df_train_clustered['cluster'] == f], target_column, len(df_train_clustered[df_train_clustered['cluster'] == f]), X_train) for f in filtered_clusters)

424.4851716291356


In [21]:
# Compute sampling weights
sampling_weights = {
    f: compute_sampling_weight(df_train_clustered[df_train_clustered['cluster'] == f], target_column, len(df_train_clustered[df_train_clustered['cluster'] == f]), X_train) / sparsity_sum
    for f in filtered_clusters
}

print("\nSampling Weights:", sampling_weights)


424.4851716291356

Sampling Weights: {0: 1.0}


In [22]:
# Step 11: Apply SMOTE for Oversampling
X_resampled = pd.DataFrame()
y_resampled = pd.Series(dtype=int)

In [23]:
for cluster_id in filtered_clusters:
    cluster_data = df_train_clustered[df_train_clustered['cluster'] == cluster_id]
    X_cluster = cluster_data.drop(columns=['cluster', target_column])
    y_cluster = cluster_data[target_column]
    
    num_samples = int(len(y_train) * sampling_weights[cluster_id])  # Compute number of samples to generate
    
    smote = SMOTE(k_neighbors=min(3, len(y_cluster) - 1), random_state=42)
    X_cluster_resampled, y_cluster_resampled = smote.fit_resample(X_cluster, y_cluster)
    
    X_resampled = pd.concat([X_resampled, X_cluster_resampled])
    y_resampled = pd.concat([y_resampled, y_cluster_resampled])

In [24]:
from collections import Counter

print("Before resampling:", Counter(y_train))  # Original class distribution
# print("After resampling:", Counter(y_resampled))  # Resampled class distribution

Before resampling: Counter({1: 315, 0: 237})


In [25]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print("After applying SMOTE:", Counter(y_resampled))


After applying SMOTE: Counter({1: 315, 0: 315})


In [26]:
import numpy as np
import pandas as pd

# Check for NaNs
if pd.DataFrame(X_resampled).isnull().values.any():
    print("X_resampled contains NaN values")
    X_resampled = pd.DataFrame(X_resampled).fillna(method="ffill").values  # Fill missing values

# Ensure finite values
X_resampled = np.nan_to_num(X_resampled)  

In [27]:
print(f"X_train shape: {X_train.shape}")

X_train shape: (552, 15)


In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np  # Ensure NumPy is imported

# Step 2: Ensure data is in the correct format
X_resampled = np.array(X_resampled) if not isinstance(X_resampled, np.ndarray) else X_resampled
y_resampled = np.ravel(y_resampled)

X_test = np.array(X_test) if not isinstance(X_test, np.ndarray) else X_test
y_test = np.ravel(y_test)

# Step 3: Train the Model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)

# Step 4: Make Predictions
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1] if clf.n_classes_ == 2 else clf.predict_proba(X_test)

# Step 5: Compute Metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr') if len(np.unique(y_test)) > 2 else roc_auc_score(y_test, y_proba)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")



Accuracy: 0.8551
ROC AUC Score: 0.9069


In [29]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=42)
clf.fit(X_resampled, y_resampled)

# Step 15: Make Predictions
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Step 16: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")


Accuracy: 0.7609
ROC AUC Score: 0.8282


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score, precision_recall_curve, auc
)
from imblearn.metrics import geometric_mean_score

# Step 13: Train a Classification Model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)  # Train on resampled data

# Step 14: Make Predictions
y_pred = clf.predict(X_test)  # Predicted labels
y_proba = clf.predict_proba(X_test)[:, 1]  # Probability of positive class

# Step 15: Compute Accuracy and ROC AUC Score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

# Compute Metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
g_mean = geometric_mean_score(y_test, y_pred)  # Geometric Mean
f1 = f1_score(y_test, y_pred)  # F1-score

# Compute Precision-Recall Curve and PR AUC
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)  # Area Under Precision-Recall Curve

print(f"\nAccuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Geometric Mean: {g_mean:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"PR AUC Score: {pr_auc:.4f}")


Accuracy: 0.8551
ROC AUC Score: 0.9069
Geometric Mean: 0.8552
F1 Score: 0.8551
PR AUC Score: 0.8457
