In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, confusion_matrix

In [None]:
# Load the training set into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where this exact file exists; consider a configurable data directory.
df=pd.read_csv('E:\\NAAMII\\Machine_learning\\dataset\\train_set.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# NOTE(review): this cell repeats the preview from the previous cell and can be removed.
df.head()

In [None]:
# Label column of the training set.
# NOTE(review): `target` is not referenced again in the visible cells; the
# modelling cells read the same column into `y` instead.
target = df['CLASS']

In [None]:
# Feature matrix: every column except 'CLASS' and 'ID'.
# errors='ignore' keeps the drop from failing when one of them is absent.
features = df.drop(columns=['CLASS', 'ID'], errors='ignore')


In [None]:
# Count infinite and NaN entries in the raw feature matrix before any cleaning
print("\nData Quality Check:")
feature_values = features.to_numpy()
inf_count = np.isinf(feature_values).sum()
nan_count = np.isnan(feature_values).sum()
print(f"Infinite values: {inf_count}")
print(f"NaN values: {nan_count}")

In [None]:
# Turn +/-inf into NaN so that the missing-value handling below treats them uniformly
features_clean = features.replace(to_replace=[np.inf, -np.inf], value=np.nan)
features_clean.head()

In [None]:
# Drop columns that have fewer than 90% non-missing values.
# DataFrame.dropna returns a new frame, so the result has to be assigned back;
# without the assignment the sparse columns were only displayed, never removed.
# `thresh` is the required number of non-NA values, passed as an integer count.
min_non_na = int(np.ceil(0.9 * len(features_clean)))
features_clean = features_clean.dropna(axis=1, thresh=min_non_na)
features_clean.shape

In [None]:
# Fill every remaining NaN with the mean of its own column
column_means = features_clean.mean()
features_clean = features_clean.fillna(column_means)


In [None]:
# 3. REMOVE ZERO-VARIANCE FEATURES
# threshold=0.0 discards only the columns that are constant across all rows.
var_thresh = VarianceThreshold(threshold=0.0)
var_thresh.fit(features_clean)
X_var = var_thresh.transform(features_clean)

In [None]:
# REMOVE HIGHLY CORRELATED FEATURES
def remove_high_correlation_features(X, threshold=0.95):
    """Drop columns that are strongly correlated with an earlier column.

    Parameters
    ----------
    X : array-like or DataFrame
        Data accepted by ``pd.DataFrame``; columns are the features.
    threshold : float, default 0.95
        A column is dropped when its absolute correlation with any column
        positioned before it is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        A new frame built from ``X`` without the dropped columns.
    """
    frame = pd.DataFrame(X)
    abs_corr = frame.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and
    # the first column of a correlated pair is the one that survives.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)
    redundant = []
    for name in upper.columns:
        if (upper[name] > threshold).any():
            redundant.append(name)
    return frame.drop(columns=redundant)

# Prune features whose absolute pairwise correlation exceeds 0.95.
# NOTE(review): the scaling cell further down is fitted on X_var, not X_clean,
# so this pruned matrix is not used by the visible modelling cells.
X_clean = remove_high_correlation_features(X_var, threshold=0.95)

In [None]:
# Display the correlation-pruned feature frame.
X_clean

In [None]:
# 5. SCALING
# Standardise the variance-filtered feature matrix.
# NOTE(review): the scaler and the PCA below are fitted on the full data set
# before the cross-validation split, so validation folds influence the
# transform; consider fitting both inside each fold (e.g. with a Pipeline).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_var)  # or X_clean if using correlation pruning

# 6. DIMENSIONALITY REDUCTION WITH PCA (retain 90% variance)
# n_components=0.9 is the explained-variance fraction to keep, not a component count.
pca = PCA(n_components=0.9, random_state=42)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# (n_samples, n_components) of the PCA-projected matrix.
X_pca.shape

In [None]:
# Label vector used by the cross-validation and grid-search cells.
# This reads the same 'CLASS' column that `target` was assigned from earlier.
y=df['CLASS']
y

In [None]:


# Helper function to calculate all metrics
def evaluate(y_true, y_pred, y_proba):
    """Compute the classification metrics reported for one validation fold.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_proba : array-like
        Scores passed to ``roc_auc_score`` (the caller supplies the second
        column of ``predict_proba``).

    Returns
    -------
    dict
        Keys 'Accuracy', 'F1', 'Recall', 'Specificity' and 'AUROC'.
    """
    # The four-way unpacking requires a 2x2 confusion matrix; ravel() flattens
    # it row by row into tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    scores = {}
    scores['Accuracy'] = accuracy_score(y_true, y_pred)
    scores['F1'] = f1_score(y_true, y_pred)
    scores['Recall'] = recall_score(y_true, y_pred)
    # Specificity is the true-negative rate.
    scores['Specificity'] = tn / (tn + fp)
    scores['AUROC'] = roc_auc_score(y_true, y_proba)
    return scores

# Define models
# Candidate classifiers keyed by the display name used in the printed results.
# Every model is seeded where the estimator accepts a random_state, and class
# imbalance is handled with class_weight='balanced' (scale_pos_weight for XGBoost).
# NOTE(review): 191/124 is presumably the negative/positive class-count ratio of
# the training set; TODO confirm, or compute it from `y` instead of hard-coding it.
models = {
    "LogisticRegression": LogisticRegression(class_weight='balanced', random_state=42, C=0.01, l1_ratio=0.9, solver='saga', penalty='elasticnet'),
    "RandomForest": RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42, max_depth=None, min_samples_leaf=2, min_samples_split=5),
    # Fixed typo: the keyword was spelled `learing_rate`, which XGBoost does not
    # recognise, so the intended learning rate of 0.2 was never applied.
    "XGBoost": XGBClassifier(scale_pos_weight=191/124, eval_metric='logloss', random_state=42, learning_rate=0.2, n_estimators=200, max_depth=3, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, verbosity=0),
    # probability=True is required because the evaluation loop calls predict_proba.
    "SVM": SVC(kernel='linear', class_weight='balanced', probability=True, random_state=42, C=0.001),
    "LightGBM": LGBMClassifier(class_weight='balanced', random_state=42, learning_rate=0.01, max_depth=3, n_estimators=200),
    "NaiveBayes": GaussianNB(var_smoothing=1e-09)
}

# Cross-validation setup
# One shuffled, seeded stratified 5-fold splitter is shared by every model.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# results[model_name] collects one metrics dict per fold.
results = {}
for model_name in models:
    results[model_name] = []

# Loop through models and perform cross-validation
for model_name, model in models.items():
    print(f"\nTraining {model_name}")
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_pca, y)):
        # X_pca is indexed positionally as an array; y is indexed with iloc.
        X_train = X_pca[train_idx]
        X_val = X_pca[val_idx]
        y_train = y.iloc[train_idx]
        y_val = y.iloc[val_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        # Second column of predict_proba is used as the score for AUROC.
        y_proba = model.predict_proba(X_val)[:, 1]

        metrics = evaluate(y_val, y_pred, y_proba)
        results[model_name].append(metrics)
        fold_summary = ", ".join(f"{k}: {v:.4f}" for k, v in metrics.items())
        print(f" Fold {fold+1}: " + fold_summary)

# Average results
# For each model, take the mean of every metric across its folds.
print("\n📊 Average Cross-Validation Results:")
for model_name, folds in results.items():
    avg = {}
    for metric_name in folds[0]:
        avg[metric_name] = np.mean([fold_metrics[metric_name] for fold_metrics in folds])
    avg_summary = ", ".join(f"{k}: {v:.4f}" for k, v in avg.items())
    print(f" {model_name}: " + avg_summary)


In [None]:
# Average cross-validation results recorded from an earlier run of the cell above.
# Kept as comments so that this cell does not raise a SyntaxError when executed.
# LogisticRegression: Accuracy: 0.5968, F1: 0.5639, Recall: 0.6603, Specificity: 0.5553, AUROC: 0.6514
# RandomForest: Accuracy: 0.5746, F1: 0.3480, Recall: 0.3060, Specificity: 0.7489, AUROC: 0.5858
# XGBoost: Accuracy: 0.5968, F1: 0.4617, Recall: 0.4517, Specificity: 0.6911, AUROC: 0.6042
# SVM: Accuracy: 0.6349, F1: 0.5840, Recall: 0.6447, Specificity: 0.6286, AUROC: 0.6501
# LightGBM: Accuracy: 0.5778, F1: 0.4734, Recall: 0.4843, Specificity: 0.6387, AUROC: 0.5941
# NaiveBayes: Accuracy: 0.5778, F1: 0.4417, Recall: 0.4510, Specificity: 0.6596, AUROC: 0.5943

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


# Build param grid
# The pure-L1 and pure-L2 searches share the same regularisation strengths.
c_values_wide = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]

# Each entry is an independent sub-grid, so only solver/penalty combinations
# listed together are tried.
param_grid = [
    dict(
        penalty=['l1'],
        C=c_values_wide,
        solver=['liblinear'],
        class_weight=['balanced'],
    ),
    dict(
        penalty=['l2'],
        C=c_values_wide,
        solver=['liblinear', 'lbfgs'],
        class_weight=['balanced'],
    ),
    dict(
        penalty=['elasticnet'],
        C=[0.01, 0.1, 1, 5],
        solver=['saga'],
        l1_ratio=[0.1, 0.5, 0.7, 0.9],
        class_weight=['balanced'],
    ),
]

logreg = LogisticRegression(max_iter=1000, random_state=42)

# Grid search with cross-validation, selecting on F1
# (or 'accuracy' if that's your main target)
log_cv = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit
log_cv.fit(X_pca, y)

# Output best model and params
print("Best parameters:", log_cv.best_params_)
print("Best F1 score:", log_cv.best_score_)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid for the random forest (2 * 3 * 2 * 2 = 24 combinations)
rf_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

rf = RandomForestClassifier(class_weight='balanced', random_state=42)
# 5-fold grid search on the PCA features, selecting on F1
rf_cv = GridSearchCV(estimator=rf, param_grid=rf_grid, scoring='f1', cv=5)
rf_cv.fit(X_pca, y)

print("Best Random Forest params:", rf_cv.best_params_)


In [None]:
from xgboost import XGBClassifier

# Hyper-parameter grid for XGBoost (2 * 3 * 3 = 18 combinations)
xgb_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

# Base estimator: the class-imbalance weight and the seed are fixed, the rest is searched
xgb = XGBClassifier(
    scale_pos_weight=191/124,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
# 5-fold grid search on the PCA features, selecting on F1
xgb_cv = GridSearchCV(estimator=xgb, param_grid=xgb_grid, scoring='f1', cv=5)
xgb_cv.fit(X_pca, y)

print("Best XGBoost params:", xgb_cv.best_params_)


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Only the regularisation strength C is searched; kernel and class weighting are fixed
svm_params = dict(
    C=[0.001, 0.01, 0.1, 1, 10],
    kernel=['linear'],
    class_weight=['balanced'],
)

svm = SVC(probability=True, random_state=42)
# 5-fold grid search on the PCA features, selecting on F1, using all cores
svm_cv = GridSearchCV(
    estimator=svm,
    param_grid=svm_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
svm_cv.fit(X_pca, y)
print("Best SVM params:", svm_cv.best_params_)


In [None]:
from lightgbm import LGBMClassifier

# Hyper-parameter grid for LightGBM (2 * 3 * 2 = 12 combinations, class weighting fixed)
lgbm_params = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1],
    class_weight=['balanced'],
)

lgbm = LGBMClassifier(random_state=42)
# 5-fold grid search on the PCA features, selecting on F1, using all cores
lgbm_cv = GridSearchCV(
    estimator=lgbm,
    param_grid=lgbm_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
lgbm_cv.fit(X_pca, y)
print("Best LGBM params:", lgbm_cv.best_params_)


In [None]:
from sklearn.naive_bayes import GaussianNB

# Variance-smoothing values to try for Gaussian Naive Bayes
nb_params = dict(var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6])

nb = GaussianNB()
# 5-fold grid search on the PCA features, selecting on F1, using all cores
nb_cv = GridSearchCV(
    estimator=nb,
    param_grid=nb_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
nb_cv.fit(X_pca, y)
print("Best Naive Bayes params:", nb_cv.best_params_)
