In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.feature_selection import VarianceThreshold,SelectFromModel
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, confusion_matrix


In [2]:
df=pd.read_csv('E:\\NAAMII\\Machine_learning\\dataset\\train_set.csv')

In [3]:
target = df['CLASS']

In [4]:
features = df.drop(['CLASS', 'ID'], axis=1, errors='ignore')


In [5]:
# Check for infinite and extreme values first
print(f"\nData Quality Check:")
inf_count = np.isinf(features.values).sum()
nan_count = np.isnan(features.values).sum()
print(f"Infinite values: {inf_count}")
print(f"NaN values: {nan_count}")


Data Quality Check:
Infinite values: 4
NaN values: 2668


In [6]:
# Replace infinite values with NaN for statistics calculation
features_clean = features.replace([np.inf, -np.inf], np.nan)
features_clean.head()

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_3229,Feature_3230,Feature_3231,Feature_3232,Feature_3233,Feature_3234,Feature_3235,Feature_3236,Feature_3237,Feature_3238
0,18281.541667,18432.0,9409.650391,0.514708,0.0113,0.045369,2.803803,0.356658,1.803803,564.93625,...,382.968383,382.968383,2214.0,1.0,136.625113,0.06171,0.0,28.154838,4.174959,0.06171
1,20010.083333,20100.0,8303.049072,0.417707,0.014959,0.080294,2.338398,0.429532,1.338398,31.291507,...,452.986164,452.986164,2548.5,1.0,232.564022,0.090548,0.0,27.934229,3.93195,0.090548
2,27260.125,27437.0,12189.649414,0.44716,0.011428,0.046402,2.782842,0.359345,1.782842,11.965643,...,419.781765,419.781765,3400.0,1.0,233.593529,0.068704,0.0,27.904807,4.085035,0.068704
3,41938.125,42138.0,17866.433594,0.426019,0.009908,0.034878,3.060655,0.326727,2.060655,8.966286,...,439.023968,439.023968,5424.0,1.0,427.429572,0.078803,0.0,27.870588,4.011726,0.078803
4,41274.125,41439.0,14315.041992,0.346828,0.013596,0.06568,2.478506,0.403469,1.478506,34.898671,...,485.209184,485.209184,5096.0,1.0,726.731554,0.142608,0.0,28.846909,3.571352,0.142608


In [7]:
features_clean.dropna(axis=1, thresh=0.9 * len(features_clean))

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_3229,Feature_3230,Feature_3231,Feature_3232,Feature_3233,Feature_3234,Feature_3235,Feature_3236,Feature_3237,Feature_3238
0,18281.541667,18432.0,9409.650391,0.514708,0.011300,0.045369,2.803803,0.356658,1.803803,564.936250,...,382.968383,382.968383,2214.0,1.0,136.625113,0.061710,0.0,28.154838,4.174959,0.061710
1,20010.083333,20100.0,8303.049072,0.417707,0.014959,0.080294,2.338398,0.429532,1.338398,31.291507,...,452.986164,452.986164,2548.5,1.0,232.564022,0.090548,0.0,27.934229,3.931950,0.090548
2,27260.125000,27437.0,12189.649414,0.447160,0.011428,0.046402,2.782842,0.359345,1.782842,11.965643,...,419.781765,419.781765,3400.0,1.0,233.593529,0.068704,0.0,27.904807,4.085035,0.068704
3,41938.125000,42138.0,17866.433594,0.426019,0.009908,0.034878,3.060655,0.326727,2.060655,8.966286,...,439.023968,439.023968,5424.0,1.0,427.429572,0.078803,0.0,27.870588,4.011726,0.078803
4,41274.125000,41439.0,14315.041992,0.346828,0.013596,0.065680,2.478506,0.403469,1.478506,34.898671,...,485.209184,485.209184,5096.0,1.0,726.731554,0.142608,0.0,28.846909,3.571352,0.142608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,46787.916667,47002.0,18052.070312,0.385828,0.010883,0.042086,2.874885,0.347840,1.874885,23.499143,...,466.276055,466.276055,6064.0,1.0,585.547823,0.096561,0.0,28.787507,3.894684,0.096561
311,8420.354167,8493.0,4292.039795,0.510004,0.016911,0.101797,2.145061,0.466555,1.145061,25.293867,...,383.044821,383.044821,987.0,1.0,69.155790,0.070369,0.0,29.625473,4.098452,0.070369
312,37262.750000,37407.0,13950.793945,0.374390,0.012759,0.057837,2.585819,0.386725,1.585819,37.474634,...,469.005263,469.005263,4940.0,1.0,505.566802,0.102341,0.0,26.865256,3.815115,0.102341
313,25081.833333,25251.0,11689.275391,0.466045,0.011197,0.044546,2.820962,0.354489,1.820962,18.321132,...,403.597826,403.597826,2944.0,1.0,215.172554,0.073089,0.0,29.180584,4.055504,0.073089


In [8]:
# Impute remaining NaNs with column mean
features_clean.fillna(features_clean.mean(), inplace=True)


In [9]:
# 3. REMOVE ZERO-VARIANCE FEATURES
var_thresh = VarianceThreshold(threshold=0.0)
X_var = var_thresh.fit_transform(features_clean)

In [10]:
# REMOVE HIGHLY CORRELATED FEATURES
def remove_high_correlation_features(X, threshold=0.95):
    corr_matrix = pd.DataFrame(X).corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return pd.DataFrame(X).drop(columns=to_drop, axis=1)

X_clean = remove_high_correlation_features(X_var, threshold=0.95)

In [24]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr
from tqdm import tqdm

# Step 1: Compute absolute Spearman correlation
corr = X_clean.corr(method='spearman').abs()

# Step 2: Compute distance matrix (1 - abs(correlation))
distance_matrix = 1 - corr
np.fill_diagonal(distance_matrix.values, 0)

# Step 3: Cluster features (based on correlation distance)
clustering = AgglomerativeClustering(
    metric='precomputed',
    linkage='average',
    distance_threshold=0.05,  # correlation > 0.95 = distance < 0.05
    n_clusters=None
)
cluster_labels = clustering.fit_predict(distance_matrix)

# Step 4: Aggregate by PCA per cluster
X_clustered = []
cluster_ids = np.unique(cluster_labels)

print(f"Number of clusters: {len(cluster_ids)}")

for cluster_id in tqdm(cluster_ids, desc="Processing clusters"):
    # Get all features in this cluster
    feature_mask = cluster_labels == cluster_id
    feature_names = X_clean.columns[feature_mask]
    
    cluster_data = X_clean[feature_names]
    
    if cluster_data.shape[1] == 1:
        # Only one feature in cluster → keep it
        X_clustered.append(cluster_data.values)
    else:
        # Apply PCA to this cluster
        pca = PCA(n_components=1)
        cluster_pc1 = pca.fit_transform(cluster_data)
        X_clustered.append(cluster_pc1)

# Step 5: Final combined dataset
X_cluster_pca = np.hstack(X_clustered)
X_cluster_pca = pd.DataFrame(X_cluster_pca, index=X_clean.index)

print(f"Final shape after cluster PCA: {X_cluster_pca.shape}")


Number of clusters: 603


Processing clusters: 100%|██████████| 603/603 [00:00<00:00, 653.99it/s]

Final shape after cluster PCA: (315, 603)





In [26]:
# 5. SCALING
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster_pca)  # or X_clean if using correlation pruning


In [27]:
# 6. DIMENSIONALITY REDUCTION WITH PCA (retain 95% variance)
pca = PCA(n_components=0.9, random_state=42)
X_pca = pca.fit_transform(X_scaled)

In [28]:
y=df['CLASS']
y

0      0
1      1
2      1
3      0
4      0
      ..
310    0
311    1
312    0
313    1
314    0
Name: CLASS, Length: 315, dtype: int64

In [29]:


# Helper function to calculate all metrics
def evaluate(y_true, y_pred, y_proba):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'Specificity': specificity,
        'AUROC': roc_auc_score(y_true, y_proba)
    }

# Define models
models = {
    "LogisticRegression": LogisticRegression(class_weight='balanced' ,random_state=42,C=0.01,l1_ratio=0.9,solver= 'saga',penalty='elasticnet'),
    "RandomForest": RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42,max_depth=None,min_samples_leaf=2, min_samples_split=5),
    "XGBoost": XGBClassifier(scale_pos_weight=191/124, eval_metric='logloss', random_state=42,learing_rate=0.2, n_estimators=200, max_depth=3, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, verbosity=0),
    "SVM":  SVC(kernel='linear', class_weight='balanced', probability=True, random_state=42,C= 0.001),
    "LightGBM": LGBMClassifier(class_weight='balanced', random_state=42 , learning_rate=0.01, max_depth= 3, n_estimators= 200),
    "NaiveBayes": GaussianNB(var_smoothing= 1e-09)
}

# Cross-validation setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {model_name: [] for model_name in models}

# Loop through models and perform cross-validation
for model_name, model in models.items():
    print(f"\nTraining {model_name}")
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_pca, y)):
        X_train, X_val = X_pca[train_idx], X_pca[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        y_proba = model.predict_proba(X_val)[:, 1]
        
        metrics = evaluate(y_val, y_pred, y_proba)
        results[model_name].append(metrics)
        print(f" Fold {fold+1}: " + ", ".join([f"{k}: {v:.4f}" for k, v in metrics.items()]))

# Average results
print("\n📊 Average Cross-Validation Results:")
for model_name, folds in results.items():
    avg = {k: np.mean([fold[k] for fold in folds]) for k in folds[0]}
    print(f" {model_name}: " + ", ".join([f"{k}: {v:.4f}" for k, v in avg.items()]))



Training LogisticRegression
 Fold 1: Accuracy: 0.6190, F1: 0.5714, Recall: 0.6667, Specificity: 0.5897, AUROC: 0.5897
 Fold 2: Accuracy: 0.5079, F1: 0.4746, Recall: 0.5600, Specificity: 0.4737, AUROC: 0.5347
 Fold 3: Accuracy: 0.5397, F1: 0.4912, Recall: 0.5600, Specificity: 0.5263, AUROC: 0.6347
 Fold 4: Accuracy: 0.5714, F1: 0.4906, Recall: 0.5200, Specificity: 0.6053, AUROC: 0.5989
 Fold 5: Accuracy: 0.4921, F1: 0.5556, Recall: 0.8000, Specificity: 0.2895, AUROC: 0.5558

Training RandomForest
 Fold 1: Accuracy: 0.6349, F1: 0.3784, Recall: 0.2917, Specificity: 0.8462, AUROC: 0.6485
 Fold 2: Accuracy: 0.5714, F1: 0.3077, Recall: 0.2400, Specificity: 0.7895, AUROC: 0.6032
 Fold 3: Accuracy: 0.5556, F1: 0.1765, Recall: 0.1200, Specificity: 0.8421, AUROC: 0.4568
 Fold 4: Accuracy: 0.6032, F1: 0.1935, Recall: 0.1200, Specificity: 0.9211, AUROC: 0.6063
 Fold 5: Accuracy: 0.5079, F1: 0.2051, Recall: 0.1600, Specificity: 0.7368, AUROC: 0.5211

Training XGBoost
 Fold 1: Accuracy: 0.6349, F1:



 Fold 2: Accuracy: 0.5873, F1: 0.5000, Recall: 0.5200, Specificity: 0.6316, AUROC: 0.6726
[LightGBM] [Info] Number of positive: 99, number of negative: 153
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3862
[LightGBM] [Info] Number of data points in the train set: 252, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
 Fold 3: Accuracy: 0.4603, F1: 0.3200, Recall: 0.3200, Specificity: 0.5526, AUROC: 0.4316
[LightGBM] [Info] Number of positive: 99, number of negative: 153




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.087065 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3860
[LightGBM] [Info] Number of data points in the train set: 252, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
 Fold 4: Accuracy: 0.5714, F1: 0.4706, Recall: 0.4800, Specificity: 0.6316, AUROC: 0.6505




[LightGBM] [Info] Number of positive: 99, number of negative: 153
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000449 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3858
[LightGBM] [Info] Number of data points in the train set: 252, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
 Fold 5: Accuracy: 0.4444, F1: 0.4262, Recall: 0.5200, Specificity: 0.3947, AUROC: 0.4937

Training NaiveBayes




 Fold 1: Accuracy: 0.5873, F1: 0.5357, Recall: 0.6250, Specificity: 0.5641, AUROC: 0.5759
 Fold 2: Accuracy: 0.5238, F1: 0.5000, Recall: 0.6000, Specificity: 0.4737, AUROC: 0.5600
 Fold 3: Accuracy: 0.4762, F1: 0.2326, Recall: 0.2000, Specificity: 0.6579, AUROC: 0.5337
 Fold 4: Accuracy: 0.6190, F1: 0.4783, Recall: 0.4400, Specificity: 0.7368, AUROC: 0.6463
 Fold 5: Accuracy: 0.3810, F1: 0.3607, Recall: 0.4400, Specificity: 0.3421, AUROC: 0.3474

📊 Average Cross-Validation Results:
 LogisticRegression: Accuracy: 0.5460, F1: 0.5167, Recall: 0.6213, Specificity: 0.4969, AUROC: 0.5828
 RandomForest: Accuracy: 0.5746, F1: 0.2522, Recall: 0.1863, Specificity: 0.8271, AUROC: 0.5672
 XGBoost: Accuracy: 0.5714, F1: 0.4241, Recall: 0.4037, Specificity: 0.6803, AUROC: 0.5617
 SVM: Accuracy: 0.6127, F1: 0.5507, Recall: 0.6043, Specificity: 0.6175, AUROC: 0.6356
 LightGBM: Accuracy: 0.5333, F1: 0.4413, Recall: 0.4680, Specificity: 0.5754, AUROC: 0.5777
 NaiveBayes: Accuracy: 0.5175, F1: 0.4214, Re