In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

# Imported Libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")


df = pd.read_csv('/kaggle/input/loan-approval-classification-data/loan_data.csv')
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout, Dense, Layer
from tensorflow.keras.optimizers import RMSprop
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_curve, auc, classification_report
import matplotlib.pyplot as plt
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Custom Attention Layer
class Attention(Layer):
    def __init__(self):
        super(Attention, self).__init__()

    def build(self, input_shape):
        self.W = self.add_weight(shape=(input_shape[-1], 1), initializer='random_normal', trainable=True)
        self.b = self.add_weight(shape=(input_shape[1],), initializer='zeros', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, inputs):
        # Attention mechanism
        score = K.dot(inputs, self.W)  # (batch_size, timesteps, features) -> (batch_size, timesteps, 1)
        score = K.reshape(score, (-1, inputs.shape[1]))  # (batch_size, timesteps)
        score = K.tanh(score + self.b)  # Apply tanh + bias term
        score = K.softmax(score)  # Normalize with softmax to get attention weights
        score = K.reshape(score, (-1, inputs.shape[1]))  # Normalize across timesteps

        context_vector = K.batch_dot(score, inputs, axes=[1, 1])  # Weighted sum of inputs
        return context_vector

# Load your data (Assuming df is already defined as your dataset)
# df = pd.read_csv('your_dataset.csv') # Uncomment and modify with your actual dataset

# Preprocess numerical features (standardization)
numerical_features = [
    "person_age", "person_income", "person_emp_exp", "loan_amnt",
    "loan_int_rate", "loan_percent_income", "cb_person_cred_hist_length", "credit_score"
]

scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Tokenizing and padding text columns (if applicable)
text_columns = [
    "person_gender", "person_education", "person_home_ownership",
    "loan_intent", "previous_loan_defaults_on_file"
]

X_text_list = []

for text_column in text_columns:
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(df[text_column])
    X_text = tokenizer.texts_to_sequences(df[text_column])
    X_text = pad_sequences(X_text, padding='post', maxlen=100)  # Adjust maxlen as needed
    X_text_list.append(X_text)

# Concatenate all the padded text columns
X_text_combined = np.concatenate(X_text_list, axis=1)

# Concatenate text data with numerical features
X = np.concatenate([X_text_combined, df[numerical_features].values], axis=1)
y = df['loan_status'].values  # Replace with your target column (binary classification)

# Stratified KFold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Prepare a list to store results for each fold
results = []

# Initialize lists to store metrics across folds
accuracies = []
f1_scores = []
precisions = []
recalls = []

# Iterate over the folds
for train_index, val_index in kf.split(X, y):
    # Split data into train and validation sets for this fold
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # Reshape the data for LSTM input (3D input: [samples, timesteps, features])
    X_train_fold = np.reshape(X_train_fold, (X_train_fold.shape[0], 1, X_train_fold.shape[1]))
    X_val_fold = np.reshape(X_val_fold, (X_val_fold.shape[0], 1, X_val_fold.shape[1]))

    # Building the Model
    model = Sequential()

    # First Bidirectional LSTM layer
    model.add(Bidirectional(LSTM(32, return_sequences=True), input_shape=(X_train_fold.shape[1], X_train_fold.shape[2])))
    model.add(Dropout(0.5))  # Dropout layer

    # Add Attention layer
    model.add(Attention())

    # Dense layer (replacing the second Bidirectional LSTM layer)
    model.add(Dense(16, activation='relu'))

    # Final output layer with sigmoid activation (binary classification)
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer=RMSprop(), loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train_fold, y_train_fold, epochs=10, batch_size=64, validation_data=(X_val_fold, y_val_fold), verbose=0)

    # Evaluate the model on the validation set
    y_pred_fold = model.predict(X_val_fold)
    y_pred_fold = (y_pred_fold > 0.5).astype(int)

    # Calculate metrics (accuracy, F1-score, precision, recall)
    accuracy = accuracy_score(y_val_fold, y_pred_fold)
    f1 = f1_score(y_val_fold, y_pred_fold)
    precision = precision_score(y_val_fold, y_pred_fold)
    recall = recall_score(y_val_fold, y_pred_fold)

    # Store the results for this fold
    accuracies.append(accuracy)
    f1_scores.append(f1)
    precisions.append(precision)
    recalls.append(recall)

# Calculate the average performance metrics across all folds
avg_accuracy = np.mean(accuracies)
avg_f1 = np.mean(f1_scores)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)

print(f'Average Accuracy: {avg_accuracy:.4f}')
print(f'Average F1-Score: {avg_f1:.4f}')
print(f'Average Precision: {avg_precision:.4f}')
print(f'Average Recall: {avg_recall:.4f}')

# Final evaluation on the test set (optional)
X_final = np.reshape(X, (X.shape[0], 1, X.shape[1]))  # Reshape X for final evaluation
y_final = y  # Use the entire dataset as the final evaluation set

y_pred = model.predict(X_final)
y_pred = (y_pred > 0.5).astype(int)

# Confusion Matrix
cm = confusion_matrix(y_final, y_pred)
print("Confusion Matrix:\n", cm)

# Classification Report
print("\nClassification Report:\n", classification_report(y_final, y_pred))

# ROC Curve
fpr, tpr, _ = roc_curve(y_final, y_pred)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Optionally, show model summary
model.summary()


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout, Dense, Layer
from tensorflow.keras.optimizers import RMSprop
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_curve, auc, classification_report
import matplotlib.pyplot as plt
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Custom Attention Layer
class Attention(Layer):
    def __init__(self):
        super(Attention, self).__init__()

    def build(self, input_shape):
        self.W = self.add_weight(shape=(input_shape[-1], 1), initializer='random_normal', trainable=True)
        self.b = self.add_weight(shape=(input_shape[1],), initializer='zeros', trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, inputs):
        # Attention mechanism
        score = K.dot(inputs, self.W)  # (batch_size, timesteps, features) -> (batch_size, timesteps, 1)
        score = K.reshape(score, (-1, inputs.shape[1]))  # (batch_size, timesteps)
        score = K.tanh(score + self.b)  # Apply tanh + bias term
        score = K.softmax(score)  # Normalize with softmax to get attention weights
        score = K.reshape(score, (-1, inputs.shape[1]))  # Normalize across timesteps

        context_vector = K.batch_dot(score, inputs, axes=[1, 1])  # Weighted sum of inputs
        return context_vector

# Load your data (Assuming df is already defined as your dataset)
# df = pd.read_csv('your_dataset.csv') # Uncomment and modify with your actual dataset

# Preprocess numerical features (standardization)
numerical_features = [
    "person_age", "person_income", "person_emp_exp", "loan_amnt",
    "loan_int_rate", "loan_percent_income", "cb_person_cred_hist_length", "credit_score"
]

scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Tokenizing and padding text columns (if applicable)
text_columns = [
    "person_gender", "person_education", "person_home_ownership",
    "loan_intent", "previous_loan_defaults_on_file"
]

X_text_list = []

for text_column in text_columns:
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(df[text_column])
    X_text = tokenizer.texts_to_sequences(df[text_column])
    X_text = pad_sequences(X_text, padding='post', maxlen=100)  # Adjust maxlen as needed
    X_text_list.append(X_text)

# Concatenate all the padded text columns
X_text_combined = np.concatenate(X_text_list, axis=1)

# Concatenate text data with numerical features
X = np.concatenate([X_text_combined, df[numerical_features].values], axis=1)
y = df['loan_status'].values  # Replace with your target column (binary classification)

# Stratified KFold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Prepare a list to store results for each fold
results = []

# Iterate over the folds
for train_index, val_index in kf.split(X, y):
    # Split data into train and validation sets for this fold
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # Reshape the data for LSTM input (3D input: [samples, timesteps, features])
    X_train_fold = np.reshape(X_train_fold, (X_train_fold.shape[0], 1, X_train_fold.shape[1]))
    X_val_fold = np.reshape(X_val_fold, (X_val_fold.shape[0], 1, X_val_fold.shape[1]))

    # Building the Model
    model = Sequential()

    # First Bidirectional LSTM layer
    model.add(Bidirectional(LSTM(32, return_sequences=True), input_shape=(X_train_fold.shape[1], X_train_fold.shape[2])))
    model.add(Dropout(0.5))  # Dropout layer

    # Add Attention layer
    model.add(Attention())

    # Dense layer (replacing the second Bidirectional LSTM layer)
    model.add(Dense(16, activation='relu'))

    # Final output layer with sigmoid activation (binary classification)
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer=RMSprop(), loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train_fold, y_train_fold, epochs=10, batch_size=64, validation_data=(X_val_fold, y_val_fold), verbose=0)

    # Evaluate the model on the validation set
    y_pred_fold = model.predict(X_val_fold)
    y_pred_fold = (y_pred_fold > 0.5).astype(int)

    # Calculate metrics (accuracy, F1-score)
    accuracy = accuracy_score(y_val_fold, y_pred_fold)
    f1 = f1_score(y_val_fold, y_pred_fold)

    # Store the results
    results.append({'accuracy': accuracy, 'f1': f1})

# Calculate the average performance metrics across all folds
avg_accuracy = np.mean([result['accuracy'] for result in results])
avg_f1 = np.mean([result['f1'] for result in results])

print(f'Average Accuracy: {avg_accuracy:.4f}')
print(f'Average F1-Score: {avg_f1:.4f}')

# Optionally, you can visualize results of confusion matrix or ROC curve on any fold or average
# Final evaluation of the model (on the last fold, for instance)
y_pred = model.predict(X_val_fold)
y_pred = (y_pred > 0.5).astype(int)

# Confusion Matrix
cm = confusion_matrix(y_val_fold, y_pred)
print("Confusion Matrix:\n", cm)

# Classification Report
print("\nClassification Report:\n", classification_report(y_val_fold, y_pred))

# ROC Curve
fpr, tpr, _ = roc_curve(y_val_fold, y_pred)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Optionally, show model summary
model.summary()


In [None]:
y_pred1 = model.predict(X_train)
y_pred1 = (y_pred1 > 0.5).astype(int)

In [None]:
cm = confusion_matrix(y_train, y_pred1)
print("Confusion Matrix:\n", cm)

In [None]:
# Classification Report
print("\nClassification Report:\n", classification_report(y_train, y_pred1))

In [None]:
df.describe()

In [None]:

print(df.isnull().sum().max())

In [2]:
print(df.columns)
print(df.shape[0])

Index(['person_age', 'person_gender', 'person_education', 'person_income',
       'person_emp_exp', 'person_home_ownership', 'loan_amnt', 'loan_intent',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'credit_score', 'previous_loan_defaults_on_file', 'loan_status'],
      dtype='object')
45000


In [None]:
# The classes are heavily skewed we need to solve this issue later.
print('Approved', round(df['loan_status'].value_counts()[1]/len(df) * 100,2), '% of the dataset')
print('Rejected', round(df['loan_status'].value_counts()[0]/len(df) * 100,2), '% of the dataset')

In [None]:
colors = ["#0101DF", "#DF0101"]

sns.countplot(x='loan_status', data=df, palette=colors)
plt.title('Class Distributions \n (0: Rejetced || 1: Approved)', fontsize=14)
plt.show()

In [3]:
import pandas as pd
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



# Define numerical and categorical columns
numerical_cols = [
    "person_age", "person_income", "person_emp_exp", "loan_amnt",
    "loan_int_rate", "loan_percent_income", "cb_person_cred_hist_length", "credit_score"
]
categorical_cols = [
    "person_gender", "person_education", "person_home_ownership",
    "loan_intent", "previous_loan_defaults_on_file"
]

# Scaling numerical features
scaler = RobustScaler()

# One-hot encoding categorical features
encoder = OneHotEncoder(drop='first', sparse=False)

# Combine transformations
preprocessor = ColumnTransformer(
    transformers=[
        ("num", scaler, numerical_cols),
        ("cat", encoder, categorical_cols)
    ]
)

# Apply transformations
preprocessed_data = preprocessor.fit_transform(df)

# Get feature names for the categorical columns
encoded_columns = preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_cols)

# Combine scaled and encoded features into a DataFrame
new_df = pd.DataFrame(
    preprocessed_data,
    columns=numerical_cols + list(encoded_columns)
)

# Include the target variable
new_df["loan_status"] = df["loan_status"].values

new_df.head()


Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,person_gender_male,person_education_Bachelor,...,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,previous_loan_defaults_on_file_Yes,loan_status
0,-0.666667,0.100854,-0.571429,3.730699,1.138636,3.083333,-0.2,-1.144928,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
1,-0.833333,-1.127215,-0.571429,-0.967218,0.029545,-0.333333,-0.4,-1.971014,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
2,-0.166667,-1.124004,-0.142857,-0.345435,0.422727,2.666667,-0.2,-0.072464,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
3,-0.5,0.261499,-0.571429,3.730699,0.959091,2.666667,-0.4,0.507246,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1
4,-0.333333,-0.018792,-0.428571,3.730699,0.740909,3.416667,0.0,-0.782609,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1


In [4]:
from sklearn.model_selection import train_test_split


X = new_df.drop('loan_status', axis=1)  # Drop target column
y = new_df['loan_status']  # Target column

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y )

In [None]:

# Calculate Q1, Q3, and IQR for numerical columns
Q1 = X_train[numerical_cols].quantile(0.25)
Q3 = X_train[numerical_cols].quantile(0.75)
IQR = Q3 - Q1

# Define outlier thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter rows that fall within the thresholds for all numerical columns
X_cleared = X_train[
    ~((X_train[numerical_cols] < lower_bound) | (X_train[numerical_cols] > upper_bound)).any(axis=1)
]
# Ensure indices are aligned with the original DataFrame
y_cleared = y_train.loc[X_cleared.index].reset_index(drop=True)



In [None]:
y_cleared.isnull().sum().max()

In [None]:
# Verify the shapes match
print(f"Shape of X_cleared: {X_cleared.shape}")
print(f"Shape of y_train_cleared: {y_cleared.shape}")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Define the categorical columns
categorical_cols = [
    "person_gender", "person_education", "person_home_ownership",
    "loan_intent", "previous_loan_defaults_on_file"
]

# Initialize LabelEncoder
encoder = LabelEncoder()

# Apply LabelEncoder to each categorical column
for col in categorical_cols:
    df[col] = encoder.fit_transform(df[col])

# Calculate the correlation matrix
correlation_matrix = df.corr()

# Plot the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=False,       # Disable annotations inside the squares
    cmap="coolwarm",  # Color map for the heatmap
    cbar=True         # Display color bar
)

plt.title("Correlation Matrix ", fontsize=16)
plt.show()


In [None]:
new_df.shape[1]

In [None]:
print(new_df.isnull().sum().max())

In [None]:
# # Store best parameters for each model
# best_params = {}

# # Optimize each model using Optuna
# study_logistic = optuna.create_study(direction='maximize')
# study_logistic.optimize(objective_logistic, n_trials=50)
# best_params['LogisticRegression'] = study_logistic.best_trial.params

# study_rf = optuna.create_study(direction='maximize')
# study_rf.optimize(objective_rf, n_trials=50)
# best_params['RandomForest'] = study_rf.best_trial.params

# study_adaboost = optuna.create_study(direction='maximize')
# study_adaboost.optimize(objective_adaboost, n_trials=50)
# best_params['AdaBoost'] = study_adaboost.best_trial.params

# study_gb = optuna.create_study(direction='maximize')
# study_gb.optimize(objective_gb, n_trials=50)
# best_params['GradientBoosting'] = study_gb.best_trial.params

# study_xgb = optuna.create_study(direction='maximize')
# study_xgb.optimize(objective_xgb, n_trials=50)
# best_params['XGBoost'] = study_xgb.best_trial.params

# study_lgb = optuna.create_study(direction='maximize')
# study_lgb.optimize(objective_lgb, n_trials=50)
# best_params['LightGBM'] = study_lgb.best_trial.params

# study_catboost = optuna.create_study(direction='maximize')
# study_catboost.optimize(objective_catboost, n_trials=50)
# best_params['CatBoost'] = study_catboost.best_trial.params

# # Print the best parameters
# for model_name, params in best_params.items():
#     print(f"Best Parameters for {model_name}: {params}")


# with outliers


In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import lightgbm as lgb
import numpy as np
import warnings

warnings.filterwarnings("ignore")

# Initialize StratifiedKFold for cross-validation
sss = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Best parameters from Bayesian search
classifiers = {
    'LogisticRegression': LogisticRegression(penalty='l1', C=0.2153901105439295, solver='saga'),
    'RandomForest': RandomForestClassifier(n_estimators=132, max_depth=15, min_samples_split=3),
    'AdaBoost': AdaBoostClassifier(n_estimators=143, learning_rate=0.9536417586026401),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=145, learning_rate=0.19587059159994072, max_depth=5),
    'XGBoost': XGBClassifier(n_estimators=150, learning_rate=0.2236622678900941, max_depth=5),
    'LightGBM': lgb.LGBMClassifier(n_estimators=134, learning_rate=0.1417403897878078, max_depth=9),
    'CatBoost': CatBoostClassifier(iterations=149, learning_rate=0.28167281754280704, depth=6, verbose=0)
}

# Sampling techniques
sampling_techniques = [
    ('SMOTE', SMOTE(sampling_strategy='minority')),
    ('ADASYN', ADASYN(sampling_strategy='minority')),
    ('TomekLinks', TomekLinks(sampling_strategy='majority')),
    ('NearMiss', NearMiss(sampling_strategy='majority'))
]

model_results = {}

# Train and evaluate classifiers
for model_name, model in classifiers.items():
    print(f"Training and Evaluating {model_name}...")

    # Store results for each sampling technique
    test_results = []

    # Loop through sampling techniques
    for sampler_name, sampler in sampling_techniques:
        print(f"Applying {sampler_name}...")

        # Initialize list to hold metrics for each fold
        fold_results = []

        # Stratified K-Fold Cross-validation
        for train_index, val_index in sss.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            # Create pipeline
            pipeline = imbalanced_make_pipeline(sampler, model)

            # Fit the pipeline on the training fold
            pipeline.fit(X_train_fold, y_train_fold)

            # Predict on validation fold
            predictions_val = pipeline.predict(X_val_fold)

            # Compute metrics for the validation fold
            val_accuracy = pipeline.score(X_val_fold, y_val_fold)
            val_precision = precision_score(y_val_fold, predictions_val)
            val_recall = recall_score(y_val_fold, predictions_val)
            val_f1 = f1_score(y_val_fold, predictions_val)
            val_auc = roc_auc_score(y_val_fold, pipeline.predict_proba(X_val_fold)[:, 1])

            # Store fold results
            fold_results.append({
                'Accuracy': val_accuracy,
                'Precision': val_precision,
                'Recall': val_recall,
                'F1': val_f1,
                'AUC': val_auc
            })

        # Calculate average performance metrics across all folds for this sampling technique
        avg_accuracy = np.mean([result['Accuracy'] for result in fold_results])
        avg_precision = np.mean([result['Precision'] for result in fold_results])
        avg_recall = np.mean([result['Recall'] for result in fold_results])
        avg_f1 = np.mean([result['F1'] for result in fold_results])
        avg_auc = np.mean([result['AUC'] for result in fold_results])

        # Append results for this sampling technique
        test_results.append({
            'Sampling Technique': sampler_name,
            'Accuracy': avg_accuracy,
            'Precision': avg_precision,
            'Recall': avg_recall,
            'F1': avg_f1,
            'AUC': avg_auc
        })

    # Save results for the model
    model_results[model_name] = pd.DataFrame(test_results)

# Print test set performance metrics
print("\n--- Cross-Validation Performance Metrics ---")
for model_name, df in model_results.items():
    print(f"\nResults for {model_name}:")
    print(df[['Sampling Technique', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC']].to_string(index=False))


In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import lightgbm as lgb
import numpy as np
import warnings

warnings.filterwarnings("ignore")

# Initialize StratifiedKFold for cross-validation
sss = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Best parameters from Bayesian search
classifiers = {
    'LogisticRegression': LogisticRegression(penalty='l1', C=0.2153901105439295, solver='saga'),
    'RandomForest': RandomForestClassifier(n_estimators=132, max_depth=15, min_samples_split=3),
    'AdaBoost': AdaBoostClassifier(n_estimators=143, learning_rate=0.9536417586026401),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=145, learning_rate=0.19587059159994072, max_depth=5),
    'XGBoost': XGBClassifier(n_estimators=150, learning_rate=0.2236622678900941, max_depth=5),
    'LightGBM': lgb.LGBMClassifier(n_estimators=134, learning_rate=0.1417403897878078, max_depth=9),
    'CatBoost': CatBoostClassifier(iterations=149, learning_rate=0.28167281754280704, depth=6, verbose=0)
}

# Sampling techniques
sampling_techniques = [
    ('SMOTE', SMOTE(sampling_strategy='minority')),
    ('ADASYN', ADASYN(sampling_strategy='minority')),
    ('TomekLinks', TomekLinks(sampling_strategy='majority')),
    ('NearMiss', NearMiss(sampling_strategy='majority'))
]

model_results = {}

# Train and evaluate classifiers
for model_name, model in classifiers.items():
    print(f"Training and Evaluating {model_name}...")

    # Store results for each sampling technique
    test_results = []

    # Loop through sampling techniques
    for sampler_name, sampler in sampling_techniques:
        print(f"Applying {sampler_name}...")

        # Initialize list to hold metrics for each fold
        fold_results = []

        # Stratified K-Fold Cross-validation
        for train_index, val_index in sss.split(X, y):
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            # Create pipeline
            pipeline = imbalanced_make_pipeline(sampler, model)

            # Fit the pipeline on the training fold
            pipeline.fit(X_train_fold, y_train_fold)

            # Predict on validation fold
            predictions_val = pipeline.predict(X_val_fold)

            # Compute metrics for the validation fold
            val_accuracy = pipeline.score(X_val_fold, y_val_fold)
            val_precision = precision_score(y_val_fold, predictions_val)
            val_recall = recall_score(y_val_fold, predictions_val)
            val_f1 = f1_score(y_val_fold, predictions_val)
            val_auc = roc_auc_score(y_val_fold, pipeline.predict_proba(X_val_fold)[:, 1])

            # Store fold results
            fold_results.append({
                'Accuracy': val_accuracy,
                'Precision': val_precision,
                'Recall': val_recall,
                'F1': val_f1,
                'AUC': val_auc
            })

        # Calculate average performance metrics across all folds for this sampling technique
        avg_accuracy = np.mean([result['Accuracy'] for result in fold_results])
        avg_precision = np.mean([result['Precision'] for result in fold_results])
        avg_recall = np.mean([result['Recall'] for result in fold_results])
        avg_f1 = np.mean([result['F1'] for result in fold_results])
        avg_auc = np.mean([result['AUC'] for result in fold_results])

        # Append results for this sampling technique
        test_results.append({
            'Sampling Technique': sampler_name,
            'Accuracy': avg_accuracy,
            'Precision': avg_precision,
            'Recall': avg_recall,
            'F1': avg_f1,
            'AUC': avg_auc
        })

    # Save results for the model
    model_results[model_name] = pd.DataFrame(test_results)

# Print test set performance metrics
print("\n--- Cross-Validation Performance Metrics ---")
for model_name, df in model_results.items():
    print(f"\nResults for {model_name}:")
    print(df[['Sampling Technique', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC']].to_string(index=False))


In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import lightgbm as lgb
import numpy as np
import warnings

warnings.filterwarnings("ignore")

# Initialize StratifiedKFold for cross-validation
sss = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Best parameters from Bayesian search
classifiers = {
    'LogisticRegression': LogisticRegression(penalty='l1', C=0.2153901105439295, solver='saga'),
    'RandomForest': RandomForestClassifier(n_estimators=132, max_depth=15, min_samples_split=3),
    'AdaBoost': AdaBoostClassifier(n_estimators=143, learning_rate=0.9536417586026401),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=145, learning_rate=0.19587059159994072, max_depth=5),
    'XGBoost': XGBClassifier(n_estimators=150, learning_rate=0.2236622678900941, max_depth=5),
    'LightGBM': lgb.LGBMClassifier(n_estimators=134, learning_rate=0.1417403897878078, max_depth=9),
    'CatBoost': CatBoostClassifier(iterations=149, learning_rate=0.28167281754280704, depth=6, verbose=0)
}

# Sampling techniques
sampling_techniques = [
    ('SMOTE', SMOTE(sampling_strategy='minority')),
    ('ADASYN', ADASYN(sampling_strategy='minority')),
    ('TomekLinks', TomekLinks(sampling_strategy='majority')),
    ('NearMiss', NearMiss(sampling_strategy='majority'))
   
]

model_results = {}

# Train and evaluate classifiers
for model_name, model in classifiers.items():
    print(f"Training and Evaluating {model_name}...")

    # Store results for test set
    test_results = []

    # Loop through sampling techniques
    for sampler_name, sampler in sampling_techniques:
        print(f"Applying {sampler_name}...")

        # Create pipeline
        pipeline = imbalanced_make_pipeline(sampler, model)
        pipeline.fit(X_train, y_train)

        # Predict on test set
        predictions_test = pipeline.predict(X_test)

        # Compute metrics for the test set
        test_accuracy = pipeline.score(X_test, y_test)
        test_precision = precision_score(y_test, predictions_test)
        test_recall = recall_score(y_test, predictions_test)
        test_f1 = f1_score(y_test, predictions_test)
        test_auc = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

        # Append results for test set
        test_results.append({
            'Sampling Technique': sampler_name,
            'Accuracy': test_accuracy,
            'Precision': test_precision,
            'Recall': test_recall,
            'F1': test_f1,
            'AUC': test_auc
        })

    # Save results for each classifier
    model_results[model_name] = pd.DataFrame(test_results)

# Print test set performance metrics
print("\n--- Test Set Performance Metrics ---")
for model_name, df in model_results.items():
    print(f"\nResults for {model_name}:")
    print(df[['Sampling Technique', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC']].to_string(index=False))


# without outliers

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import lightgbm as lgb
import numpy as np
import warnings

warnings.filterwarnings("ignore")

# Classifiers with best parameters (from Bayesian search)
classifiers = {
    'LogisticRegression': LogisticRegression(penalty='l1', C=0.2153901105439295, solver='saga'),
    'RandomForest': RandomForestClassifier(n_estimators=132, max_depth=15, min_samples_split=3),
    'AdaBoost': AdaBoostClassifier(n_estimators=143, learning_rate=0.9536417586026401),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=145, learning_rate=0.19587059159994072, max_depth=5),
    'XGBoost': XGBClassifier(n_estimators=150, learning_rate=0.2236622678900941, max_depth=5),
    'LightGBM': lgb.LGBMClassifier(n_estimators=134, learning_rate=0.1417403897878078, max_depth=9),
    'CatBoost': CatBoostClassifier(iterations=149, learning_rate=0.28167281754280704, depth=6, verbose=0)
}

# Sampling techniques
sampling_techniques = [
    ('SMOTE', SMOTE(sampling_strategy='minority')),
    ('ADASYN', ADASYN(sampling_strategy='minority')),
    ('TomekLinks', TomekLinks(sampling_strategy='majority')),
    ('NearMiss', NearMiss(sampling_strategy='majority')),
    ('SMOTEENN', SMOTEENN(sampling_strategy='auto'))
]

# Initialize a dictionary to store results for each model
model_results = {}

# Train and evaluate classifiers
for model_name, model in classifiers.items():
    print(f"Training and Evaluating {model_name}...")

    # Store results for the model
    test_results = []

    # Loop through sampling techniques
    for sampler_name, sampler in sampling_techniques:
        print(f"Applying {sampler_name}...")

        # Create the pipeline combining the sampling technique and the classifier
        pipeline = imbalanced_make_pipeline(sampler, model)

        # Fit the pipeline on the entire dataset
        pipeline.fit(X_cleared, y_cleared)  # Assuming X_cleared and y_cleared are your training data

        # Predict on the test set
        predictions_test = pipeline.predict(X_test)  # Assuming X_test is your test data

        # Compute metrics for the test set
        test_accuracy = pipeline.score(X_test, y_test)  # Assuming y_test is your test labels
        test_precision = precision_score(y_test, predictions_test)
        test_recall = recall_score(y_test, predictions_test)
        test_f1 = f1_score(y_test, predictions_test)
        test_auc = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

        # Store the results for the test set
        test_results.append({
            'Sampling Technique': sampler_name,
            'Accuracy': test_accuracy,
            'Precision': test_precision,
            'Recall': test_recall,
            'F1': test_f1,
            'AUC': test_auc
        })

    # Store the results for the model
    model_results[model_name] = pd.DataFrame(test_results)

# Print test set performance metrics
print("\n--- Test Set Performance Metrics ---")
for model_name, df in model_results.items():
    print(f"\nResults for {model_name}:")
    print(df[['Sampling Technique', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC']].to_string(index=False))


In [None]:
X_test.shape

# finding best parameters with random search




In [None]:
# import pandas as pd
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import NearMiss
# from imblearn.combine import SMOTEENN
# from imblearn.under_sampling import TomekLinks
# from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
# from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
# from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from sklearn.linear_model import LogisticRegression
# import lightgbm as lgb
# import numpy as np
# import warnings

# warnings.filterwarnings("ignore")  

# # Define parameters for RandomizedSearchCV for each model
# log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}
# rf_params = {'n_estimators': [50, 100, 150], 'max_depth': [5, 10, 15], 'min_samples_split': [2, 5, 10]}
# adaboost_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 1]}
# gb_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.5], 'max_depth': [3, 5, 7]}
# xgb_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.3], 'max_depth': [3, 5, 7]}
# catboost_params = {'iterations': [50, 100, 150], 'depth': [5, 7, 10], 'learning_rate': [0.01, 0.1, 0.3]}
# lgb_params = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.3], 'max_depth': [5, 7, 10]}

# # Initialize StratifiedKFold for cross-validation
# sss = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# # Sampling techniques
# sampling_techniques = [
#     ('SMOTE', SMOTE(sampling_strategy='minority')),
#     ('TomekLinks', TomekLinks(sampling_strategy='majority')),
#     ('NearMiss', NearMiss(sampling_strategy='majority')),
#     ('SMOTEENN', SMOTEENN(sampling_strategy='auto'))
# ]

# # --- Perform Random Search to Determine Best Parameters ---

# # List of classifiers
# classifiers = {
#     'LogisticRegression': LogisticRegression(),
#     'RandomForest': RandomForestClassifier(),
#     'AdaBoost': AdaBoostClassifier(),
#     'GradientBoosting': GradientBoostingClassifier(),
#     'XGBoost': XGBClassifier(),
#     'LightGBM': lgb.LGBMClassifier(),
#     'CatBoost': CatBoostClassifier(verbose=0)
# }

# # Parameter grids for each model
# param_grids = {
#     'LogisticRegression': log_reg_params,
#     'RandomForest': rf_params,
#     'AdaBoost': adaboost_params,
#     'GradientBoosting': gb_params,
#     'XGBoost': xgb_params,
#     'LightGBM': lgb_params,
#     'CatBoost': catboost_params
# }

# best_params = {}

# for model_name, model in classifiers.items():
#     print(f"Performing Random Search for {model_name}...")
#     rand_search = RandomizedSearchCV(model, param_grids[model_name], n_iter=4, random_state=42, cv=3)
#     rand_search.fit(X_train, y_train)
#     best_params[model_name] = rand_search.best_params_
#     print(f"Best Parameters for {model_name}: {best_params[model_name]}")



# model trained using random search params

In [None]:
# import pandas as pd
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import NearMiss
# from imblearn.combine import SMOTEENN
# from imblearn.under_sampling import TomekLinks
# from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_score, recall_score, f1_score
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from sklearn.linear_model import LogisticRegression
# import lightgbm as lgb
# import numpy as np
# import matplotlib.pyplot as plt
# import warnings

# warnings.filterwarnings("ignore")

# # Initialize StratifiedKFold for cross-validation
# sss = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)



# classifiers = {
#     'LogisticRegression': LogisticRegression(penalty='l2', C=10),
#     'RandomForest': RandomForestClassifier(n_estimators=50, max_depth=15, min_samples_split=5),
#     'AdaBoost': AdaBoostClassifier(n_estimators=100, learning_rate=1),
#     'GradientBoosting': GradientBoostingClassifier(n_estimators=50, max_depth=5, learning_rate=0.5),
#     'XGBoost': XGBClassifier(n_estimators=50, max_depth=5, learning_rate=0.3),
#     'LightGBM': lgb.LGBMClassifier(n_estimators=100, max_depth=7, learning_rate=0.1),
#     'CatBoost': CatBoostClassifier(iterations=150, depth=5, learning_rate=0.3, verbose=0)
# }


# # Sampling techniques
# sampling_techniques = [
#     ('SMOTE', SMOTE(sampling_strategy='minority')),
#     ('ADASYN', ADASYN(sampling_strategy='minority')),
#     ('TomekLinks', TomekLinks(sampling_strategy='majority')),
#     ('NearMiss', NearMiss(sampling_strategy='majority')),
#     ('SMOTEENN', SMOTEENN(sampling_strategy='auto'))
# ]

# model_results = {}

# # Train and evaluate classifiers
# for model_name, model in classifiers.items():
#     print(f"Training and Evaluating {model_name}...")

#     # Store results for test set
#     test_results = []

#     # Loop through sampling techniques
#     for sampler_name, sampler in sampling_techniques:
#         print(f"Applying {sampler_name}...")

#         # Create pipeline
#         pipeline = imbalanced_make_pipeline(sampler, model)
#         pipeline.fit(X_cleared, y_cleared)

#         # Predict on test set
#         predictions_test = pipeline.predict(X_test)

#         # Compute metrics for the test set
#         test_accuracy = pipeline.score(X_test, y_test)
#         test_precision = precision_score(y_test, predictions_test)
#         test_recall = recall_score(y_test, predictions_test)
#         test_f1 = f1_score(y_test, predictions_test)
#         test_auc = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

#         # Append results for test set
#         test_results.append({
#             'Sampling Technique': sampler_name,
#             'Accuracy': test_accuracy,
#             'Precision': test_precision,
#             'Recall': test_recall,
#             'F1': test_f1,
#             'AUC': test_auc
#         })

#     # Save results for each classifier
#     model_results[model_name] = pd.DataFrame(test_results)

# # Print test set performance metrics
# print("\n--- Test Set Performance Metrics ---")
# for model_name, df in model_results.items():
#     print(f"\nResults for {model_name}:")
#     print(df[['Sampling Technique', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC']].to_string(index=False))


# voting classifier

In [11]:
import pandas as pd
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, make_scorer
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import numpy as np

# Initialize classifiers
gb = GradientBoostingClassifier(n_estimators=145, learning_rate=0.19587059159994072, max_depth=5)
XGB = XGBClassifier(n_estimators=150, learning_rate=0.2236622678900941, max_depth=5)
LGB = lgb.LGBMClassifier(n_estimators=134, learning_rate=0.1417403897878078, max_depth=9)
Cat = CatBoostClassifier(iterations=149, learning_rate=0.28167281754280704, depth=6, verbose=0)

# Sampling technique
tomek = TomekLinks()

# Clean data using TomekLinks
X_train_cleaned, y_train_cleaned = tomek.fit_resample(X, y)

# Voting Classifier setup
voting_clf = VotingClassifier(
    estimators=[
        ('GradientBoosting', gb),
        ('XGBoost', XGB),
        ('LightGBM', LGB),
        ('CatBoost', Cat)
    ],
    voting='soft'
)

# Define 5-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Scoring metrics for cross-validation
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score, needs_proba=True)
}

# Perform 5-fold cross-validation
print("Performing 5-Fold Cross-Validation on Voting Classifier...")

results = {}
for metric in scoring:
    scores = cross_val_score(voting_clf, X_train_cleaned, y_train_cleaned, cv=cv, scoring=scoring[metric], n_jobs=-1)
    results[metric] = scores

# Print mean scores
print("\n--- 5-Fold Cross-Validation Results ---")
for metric, scores in results.items():
    print(f"{metric.capitalize()}: Mean = {np.mean(scores):.4f}, Std = {np.std(scores):.4f}")


Performing 5-Fold Cross-Validation on Voting Classifier...

--- 5-Fold Cross-Validation Results ---
Accuracy: Mean = 0.9397, Std = 0.0038
Precision: Mean = 0.9027, Std = 0.0122
Recall: Mean = 0.8256, Std = 0.0114
F1: Mean = 0.8624, Std = 0.0087
Roc_auc: Mean = 0.9831, Std = 0.0007


In [10]:
import pandas as pd
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, make_scorer
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import numpy as np

# Initialize classifiers
gb = GradientBoostingClassifier(n_estimators=145, learning_rate=0.19587059159994072, max_depth=5)
XGB = XGBClassifier(n_estimators=150, learning_rate=0.2236622678900941, max_depth=5)
# LGB = lgb.LGBMClassifier(n_estimators=134, learning_rate=0.1417403897878078, max_depth=9)
# Cat = CatBoostClassifier(iterations=149, learning_rate=0.28167281754280704, depth=6, verbose=0)

# Sampling technique
tomek = TomekLinks()

# Clean data using TomekLinks
X_train_cleaned, y_train_cleaned = tomek.fit_resample(X, y)

# Voting Classifier setup
voting_clf = VotingClassifier(
    estimators=[
        ('GradientBoosting', gb),
        ('XGBoost', XGB),
        ('LightGBM', LGB),
        ('CatBoost', Cat)
    ],
    voting='soft'
)

# Define 5-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Scoring metrics for cross-validation
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score, needs_proba=True)
}

# Perform 5-fold cross-validation
print("Performing 5-Fold Cross-Validation on Voting Classifier...")

results = {}
for metric in scoring:
    scores = cross_val_score(voting_clf, X_train_cleaned, y_train_cleaned, cv=cv, scoring=scoring[metric], n_jobs=-1)
    results[metric] = scores

# Print mean scores
print("\n--- 5-Fold Cross-Validation Results ---")
for metric, scores in results.items():
    print(f"{metric.capitalize()}: Mean = {np.mean(scores):.4f}, Std = {np.std(scores):.4f}")


Performing 5-Fold Cross-Validation on Voting Classifier...

--- 5-Fold Cross-Validation Results ---
Accuracy: Mean = 0.9398, Std = 0.0038
Precision: Mean = 0.9027, Std = 0.0121
Recall: Mean = 0.8257, Std = 0.0114
F1: Mean = 0.8623, Std = 0.0087
Roc_auc: Mean = 0.9831, Std = 0.0007


# Stacking Classifier

In [None]:
# Import necessary libraries
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE  # Import SMOTE for oversampling

# Initialize classifiers with specified hyperparameters
gb = GradientBoostingClassifier(n_estimators=145, learning_rate=0.19587059159994072, max_depth=5, random_state=42)
xgb = XGBClassifier(n_estimators=150, learning_rate=0.2236622678900941, max_depth=5, use_label_encoder=False, eval_metric='logloss', random_state=42)
lgb = lgb.LGBMClassifier(n_estimators=134, learning_rate=0.1417403897878078, max_depth=9, random_state=42)
cat = CatBoostClassifier(iterations=149, learning_rate=0.28167281754280704, depth=6, verbose=0, random_state=42)


# Perform SMOTE Oversampling on the training set
smote = SMOTE(random_state=42)  # Initialize SMOTE
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)  
# Create Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=[
        ('gb', gb),
        ('xgb', xgb),
        ('lgb', lgb),
        ('cat', cat)
    ],
    final_estimator=LogisticRegression(),
    cv=5,  # 5-fold cross-validation
    n_jobs=-1
)

# Train Stacking Classifier on the resampled dataset
stacking_clf.fit(X_train_resampled, y_train_resampled)

# Predict on test set
y_pred = stacking_clf.predict(X_test)
y_proba = stacking_clf.predict_proba(X_test)[:, 1]  # Get probability scores for ROC AUC

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

# Print metrics
print("Stacking Classifier Performance After SMOTE Oversampling:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


# Deep Learning CNN


In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

# Assuming you've already loaded your data and split it into X_train, X_test, y_train, and y_test

# Model Architecture
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Define the ReduceLROnPlateau callback
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',    # Monitor validation loss
    factor=0.5,            # Reduce LR by half when the validation loss plateaus
    patience=3,            # Number of epochs to wait for improvement
    min_lr=1e-6,           # Minimum learning rate
    verbose=1              # Print messages when learning rate changes
)

# Train the model with the ReduceLROnPlateau callback
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[reduce_lr])

# Predict on the test set
y_pred = (model.predict(X_test) > 0.5).astype("int32")  # Convert probabilities to binary (0 or 1)
y_proba = model.predict(X_test)  # To get probability scores for ROC AUC

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

# Print metrics
print("Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


# Explainable AI - LIME

In [None]:
from imblearn.under_sampling import TomekLinks
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
import shap
from lime.lime_tabular import LimeTabularExplainer

# Apply TomekLinks preprocessing
tomek = TomekLinks()
X_train_cleaned, y_train_cleaned = tomek.fit_resample(X_train, y_train)

# Define the pipeline with only the model
best_pipeline = Pipeline([
    ('model', LGBMClassifier(n_estimators=134, learning_rate=0.1417403897878078, max_depth=9))  # Model
])

# Fit the pipeline on the preprocessed training data
best_pipeline.fit(X_train_cleaned, y_train_cleaned)

# Initialize SHAP explainer visualization support
shap.initjs()

print("\n--- LIME Explanations on Last Fold ---")

from lime.lime_tabular import LimeTabularExplainer

# Convert training data to NumPy array
X_train_cleaned_np = X_train_cleaned.to_numpy()

# Initialize LIME Explainer
lime_explainer = LimeTabularExplainer(
    training_data=X_train_cleaned_np,
    feature_names=list(X_train_cleaned.columns),
    class_names=["Rejected", "Approved"],
    mode="classification"
)

# Select a sample to explain (first instance in test data)
sample_idx = 0
sample = X_test.iloc[sample_idx].values  # Ensure X_test is a DataFrame or adapt accordingly
print(f"Explaining instance {sample_idx} from test set.")

# Generate explanation
lime_exp = lime_explainer.explain_instance(
    data_row=sample,
    predict_fn=best_pipeline.predict_proba
)

# Show explanation
lime_exp.show_in_notebook()  
print(lime_exp.as_list())



# SHAP Summary Plot


In [None]:
import shap
import matplotlib.pyplot as plt

# Initialize SHAP explainer
shap_explainer = shap.Explainer(best_pipeline.named_steps['model'], X_train_cleaned)

# Calculate SHAP values for the test data with additivity check disabled
shap_values = shap_explainer(X_test, check_additivity=False)

# Summary plot
print("\n--- SHAP Summary Plot ---")
shap.summary_plot(shap_values, X_test, plot_type="dot")

# Violin plot
print("\n--- SHAP Violin Plot ---")
shap.summary_plot(shap_values, X_test, plot_type="violin")
