In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
pd.set_option("display.max_row",999)
pd.set_option("display.max_column",999)

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp
import pickle
from sklearn.model_selection import train_test_split,learning_curve
from sklearn.metrics import (f1_score, confusion_matrix, classification_report, matthews_corrcoef, roc_curve, 
                             roc_auc_score,accuracy_score, recall_score, precision_score, precision_recall_curve,
                             cohen_kappa_score, log_loss )

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN  
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.utils.class_weight import compute_class_weight


# Load data

In [None]:
# Read the raw session log and report its size.
df = pd.read_csv("train_sessions.csv")
print('train', df.shape)

# Put the columns in alphabetical order, then keep the session id, the ten
# site columns and the label.
selected_columns = ['session_id', 'site1', 'site10', 'site2', 'site3', 'site4',
                    'site5', 'site6', 'site7', 'site8', 'site9', 'target']
df = df.sort_index(axis=1)[selected_columns]

from sklearn.preprocessing import LabelEncoder

# Any column still stored as strings is parsed as a '%Y-%m-%d' date.
for col_obj in df.select_dtypes("object").columns:
    df[col_obj] = pd.to_datetime(df[col_obj], format='%Y-%m-%d')

print(df.shape)
# Keep the columns whose share of missing values is below 20 % ...
missing_share = df.isna().sum() / df.shape[0]
df = df[df.columns[missing_share < 0.2]]
print(df.shape)
# ... then drop every row that still contains a missing value.
df = df.dropna()
print(df.shape)

# Integer-encode whatever string columns are left.
for col_obj in df.select_dtypes("object").columns:
    lb = LabelEncoder()
    df[col_obj] = lb.fit_transform(df[col_obj])

# The rest of the notebook refers to the label column as 'Class'.
df = df.rename(columns={'target': 'Class'})

In [None]:
# NOTE(review): the commented-out line below refers to `credit_data`, a name that
# is not defined anywhere in the visible notebook — presumably a leftover; confirm
# and delete, or rewrite it against `df` if a 20 000-row subsample is wanted.
# credit_data = credit_data.sample(n=20000, random_state=0) 
# (rows, columns) of the frame after the cleaning steps above.
df.shape

# Preprocessing

In [None]:
# Pie chart of how the rows of `df` are split between the classes.
class_share = df['Class'].value_counts()

plt.figure(figsize=(5, 5))
plt.pie(
    class_share,
    labels=class_share.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['#66b3ff', '#99ff99', '#ffcc99', '#c2c2f0'],
)
plt.title('Distribution of Values')
plt.show()

In [None]:
def remove_outliers_iqr(data, column):
    """Return the rows of ``data`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of ``data[column]``.
    ``data`` itself is left untouched; a filtered frame is returned.
    """
    values = data[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile
    inside_fences = values.between(first_quartile - 1.5 * spread,
                                   third_quartile + 1.5 * spread)
    return data[inside_fences]


In [None]:
# Every column except the label is treated as a feature column.
cols = df.columns.tolist()
cols.remove('Class')
len(cols)

In [None]:
# Keep only the rows that fall inside the IQR fences of *every* column in `cols`.
#
# The previous loop assigned `remove_outliers_iqr(df, column_name)` to
# `df_cleaned` on each pass, so every pass started again from the unfiltered
# `df` and only the filter for the last column survived. Here each column's
# fences are still computed on the unfiltered `df`, but the per-column results
# are AND-ed together, so the outcome does not depend on the order of `cols`.
# Assumes the row labels of `df` are unique (true for a frame read by
# `read_csv` with the default index and then row-filtered).
keep_mask = np.ones(len(df), dtype=bool)
for column_name in cols:
    keep_mask &= df.index.isin(remove_outliers_iqr(df, column_name).index)
df_cleaned = df[keep_mask]

In [None]:
# Number of rows dropped by the outlier filter.
len(df) - len(df_cleaned)

In [None]:
# Class counts before and after the outlier filter, then a pie chart of the
# filtered frame.
original_counts = df['Class'].value_counts()
cleaned_counts = df_cleaned['Class'].value_counts()

print('% des class Original:\n', original_counts.sort_values(ascending=False))

print('% des class cleaned:\n', cleaned_counts.sort_values(ascending=False))

plt.figure(figsize=(5, 5))
plt.pie(
    cleaned_counts,
    labels=cleaned_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['#66b3ff', '#99ff99', '#ffcc99', '#c2c2f0'],
)
plt.title('Distribution of Values')
plt.show()

In [None]:
# Total number of missing cells remaining in the filtered frame.
df_cleaned.isna().sum().sum()

In [None]:
# Rows of the filtered frame whose label is 0.
df_cleaned[df_cleaned['Class']==0]

In [None]:
# Column count and column labels of the filtered frame.
len(df_cleaned.columns),df_cleaned.columns

In [None]:
# Per-column maximum of the filtered frame.
df_cleaned.max()

In [None]:
# Per-column minimum of the filtered frame.
df_cleaned.min()

In [None]:
# Split the filtered frame into the label vector and the feature matrix.
# NOTE(review): every non-label column is kept as a feature, including
# 'session_id' — presumably a row identifier; confirm it should be a model input.
y = df_cleaned["Class"]
X = df_cleaned.drop(columns="Class")

# Resampling

In [None]:
# Random undersampling of the full (X, y); the print shows the resulting
# per-class row counts (absolute counts, despite the '%' in the label).
# NOTE(review): resampling is applied to all rows here, before evaluation()
# performs its train/test split, so the held-out rows are resampled too.
rus = RandomUnderSampler(random_state=0)
X_rus, y_rus = rus.fit_resample(X, y)
print('% des class:\n',(y_rus).value_counts().sort_values(ascending=False))

In [None]:
# Random oversampling of the full (X, y); the print shows the resulting
# per-class row counts (absolute counts, despite the '%' in the label).
# NOTE(review): resampling is applied to all rows here, before evaluation()
# performs its train/test split, so the held-out rows are resampled too.
ros = RandomOverSampler(random_state=0)
X_ros, y_ros = ros.fit_resample(X, y)
print('% des class:\n',(y_ros).value_counts().sort_values(ascending=False))

In [None]:
# SMOTE (Synthetic Minority Over-sampling Technique) on the full (X, y); the
# print shows the resulting per-class row counts.
# NOTE(review): resampling is applied to all rows here, before evaluation()
# performs its train/test split, so the held-out rows are resampled too.
smote = SMOTE(random_state=0)
X_smote, y_smote = smote.fit_resample(X, y)
print('% des class:\n',(y_smote).value_counts().sort_values(ascending=False))

In [None]:
# Hybrid method using SMOTE and Edited Nearest Neighbors (SMOTEENN) on the full
# (X, y); the print shows the resulting per-class row counts.
# NOTE(review): resampling is applied to all rows here, before evaluation()
# performs its train/test split, so the held-out rows are resampled too.
smoteenn = SMOTEENN(random_state=0)
X_smoteenn, y_smoteenn = smoteenn.fit_resample(X, y)
print('% des class:\n',(y_smoteenn).value_counts().sort_values(ascending=False))

In [None]:
# Borderline-SMOTE resampling of the full (X, y); the print shows the
# resulting per-class row counts.
# NOTE(review): resampling is applied to all rows here, before evaluation()
# performs its train/test split, so the held-out rows are resampled too.
bsmote = BorderlineSMOTE(random_state=0)
X_bsmote, y_bsmote = bsmote.fit_resample(X, y)
print('% des class:\n', y_bsmote.value_counts().sort_values(ascending=False))

In [None]:
# ADASYN resampling of the full (X, y); the print shows the resulting
# per-class row counts.
# NOTE(review): resampling is applied to all rows here, before evaluation()
# performs its train/test split, so the held-out rows are resampled too.
adasyn = ADASYN(random_state=0)
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
print('% des class:\n', y_adasyn.value_counts().sort_values(ascending=False))

# Evaluation Function

In [None]:
def evaluation(model,name,X,y):
    """Tune ``class_weight`` for ``model``, save the best estimator and report hold-out metrics.

    Pipeline: 80/20 train/test split (``random_state=0``) -> grid search over
    ``class_weight`` scored by the Matthews correlation coefficient -> pickle
    the refit best estimator to ``{name}.pkl`` -> print test-set metrics ->
    plot a 5-fold F1 learning curve next to the ROC curve and save the figure
    as ``CMIYC_{name}.jpg``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier handed to ``GridSearchCV``.
        NOTE(review): ``XGBClassifier`` does not document a ``class_weight``
        parameter (it exposes ``scale_pos_weight``) — confirm the grid has any
        effect for that model.
    name : str
        Label used in the printed banner and in both output file names.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary labels encoded as non-negative integers (the confusion matrix
        is unpacked into exactly four cells).

    Returns
    -------
    None
        Results are printed, plotted and written to disk.

    Notes
    -----
    The split is made on whatever ``X``/``y`` the caller passes. In this
    notebook the callers pass data that was already resampled, so the held-out
    20 % contains resampled rows as well — NOTE(review): consider splitting
    before resampling.
    """
    print('\n===================================',name,' - GridSearchCV ===================================')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Guard kept from the original: skip the run if both partitions are the very same array.
    if np.array_equal(np.array(X_train), np.array(X_test)):
        print("Train set and test set have the same samples.")
        return

    print('X_train shape: ',X_train.shape, 'y_train shape: =', y_train.shape)
    print('% des class dans Y train:')
    print((y_train).value_counts().sort_values(ascending=False))
    print('% des class dans Y test:')
    print((y_test).value_counts().sort_values(ascending=False))

    # "Balanced" weights, n_samples / (n_classes * count), gathered in ONE dict
    # covering every class. The earlier version produced one single-key dict per
    # class, so each grid candidate re-weighted only one class and left the
    # other at the default weight.
    class_counts = np.bincount(y_train)
    total_samples = len(y_train)
    n_present = np.count_nonzero(class_counts)
    balanced_weights = {
        int(label): float(total_samples / (n_present * count))
        for label, count in enumerate(class_counts)
        if count > 0  # labels absent from y_train get no entry (avoids division by zero)
    }
    # Candidates: no re-weighting vs. balanced re-weighting.
    param_grid = {'class_weight': [None, balanced_weights]}
    scoring = {'matthews_corrcoef': make_scorer(matthews_corrcoef),}
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, refit='matthews_corrcoef')
    grid_search.fit(X_train, y_train)
    model = grid_search.best_estimator_

    # Saving the model (context manager so the file handle is closed and flushed).
    with open(f'{name}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    ypred = model.predict(X_test)

    # ROC analysis needs a continuous score; hard 0/1 predictions collapse the
    # curve to a single operating point. Prefer positive-class probabilities,
    # then decision scores, and only fall back to the labels as a last resort.
    if hasattr(model, 'predict_proba'):
        yscore = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        yscore = model.decision_function(X_test)
    else:
        yscore = ypred

    print('\n ------------------------------------------------------------------------------------------- ')
    tn, fp, fn, tp = confusion_matrix(y_test, ypred).ravel()
    print(f"{'matthews_corrcoef': <40}{matthews_corrcoef(y_test, ypred)*100}")
    print(f"{'f1_score': <40}{f1_score(y_test, ypred)*100}")
    print(f"{'accuracy_score': <40}{accuracy_score(y_test, ypred)*100}")
    print(f"{'recall_score': <40}{recall_score(y_test, ypred)}")
    print(f"{'precision_score': <40}{precision_score(y_test, ypred)}")
    print(f"{'roc_auc_score': <40}{roc_auc_score(y_test, yscore)}")
    print(f"{'false_positive_rate': <40}{fp / (fp + tn)}")
    print(f"{'negative_predictive_value': <40}{tn / (tn + fn)}")
    print(f"{'confusion_matrix'}\n{confusion_matrix(y_test, ypred)}")
    print(f"{'classification_report'}\n{classification_report(y_test, ypred)}")

    # Cross-validated learning curve on the training partition.
    N, train_score, val_score = learning_curve(model,X_train,y_train,cv=5,scoring='f1',
                                               train_sizes=np.linspace(0.1, 1, 10))
    fpr, tpr, thresholds = roc_curve(y_test, yscore)

    # Display: learning curve on the left, ROC curve on the right.
    fig, axes = plt.subplots(1, 2, figsize=(8, 4))
    axes[0].plot(N, train_score.mean(axis=1), label='train score')
    axes[0].plot(N, val_score.mean(axis=1), label='validation score')
    axes[0].set_title('Learning Curve')  # was set on axes[1] and then overwritten
    axes[0].legend()
    axes[1].plot([0, 1], [0, 1], 'k--', label = 'Base')
    axes[1].plot(fpr, tpr, color = 'blue', label = 'ROC')
    axes[1].set_xlabel('False Positive Rate')
    axes[1].set_ylabel('True Positive Rate')
    axes[1].set_title('ROC Curve')
    axes[1].legend(loc='best')

    plt.tight_layout()
    plt.show()
    # NOTE(review): the callers in this notebook already pass names that start
    # with 'CMIYC_', so the image files come out as 'CMIYC_CMIYC_….jpg'.
    fig.savefig(f"CMIYC_{name}.jpg", bbox_inches='tight', dpi=1000)
    plt.close(fig)  # the function is called many times; do not keep figures alive
    print('===================================================================================================================')

# Models: Logistic Regression (LR), Decision Tree (DT), Random Forest (RF), XGBoost (XGB)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# The four classifiers compared in the sections below. These same objects are
# passed to every evaluation() call. LogisticRegression is created with
# library defaults; the other three are seeded with random_state=0.
LR  = LogisticRegression()
DT  = DecisionTreeClassifier(random_state=0)
RF  = RandomForestClassifier(random_state=0)
XGB = xgb.XGBClassifier(objective='binary:logistic', random_state=0)

# RUS

In [None]:
# Evaluate each classifier on the randomly undersampled data.
for clf, clf_name in [
    (LR, 'LogisticRegression'),
    (DT, 'DecisionTreeClassifier'),
    (RF, 'RandomForestClassifier'),
    (XGB, 'XGBClassifier'),
]:
    evaluation(clf, f'CMIYC_{clf_name}_RUS', X_rus, y_rus)

# ROS 

In [None]:
# Evaluate each classifier on the randomly oversampled data.
for clf, clf_name in [
    (LR, 'LogisticRegression'),
    (DT, 'DecisionTreeClassifier'),
    (RF, 'RandomForestClassifier'),
    (XGB, 'XGBClassifier'),
]:
    evaluation(clf, f'CMIYC_{clf_name}_ROS', X_ros, y_ros)

# SMOTE

In [None]:
# Evaluate each classifier on the SMOTE-resampled data.
for clf, clf_name in [
    (LR, 'LogisticRegression'),
    (DT, 'DecisionTreeClassifier'),
    (RF, 'RandomForestClassifier'),
    (XGB, 'XGBClassifier'),
]:
    evaluation(clf, f'CMIYC_{clf_name}_SMOTE', X_smote, y_smote)

# SMOTEENN

In [None]:
# Evaluate each classifier on the SMOTEENN-resampled data.
for clf, clf_name in [
    (LR, 'LogisticRegression'),
    (DT, 'DecisionTreeClassifier'),
    (RF, 'RandomForestClassifier'),
    (XGB, 'XGBClassifier'),
]:
    evaluation(clf, f'CMIYC_{clf_name}_SMOTEENN', X_smoteenn, y_smoteenn)

# BorderlineSMOTE

In [None]:
# Evaluate each classifier on the Borderline-SMOTE-resampled data.
for clf, clf_name in [
    (LR, 'LogisticRegression'),
    (DT, 'DecisionTreeClassifier'),
    (RF, 'RandomForestClassifier'),
    (XGB, 'XGBClassifier'),
]:
    evaluation(clf, f'CMIYC_{clf_name}_BSMOTE', X_bsmote, y_bsmote)

# ADASYN

In [None]:
# Evaluate each classifier on the ADASYN-resampled data.
for clf, clf_name in [
    (LR, 'LogisticRegression'),
    (DT, 'DecisionTreeClassifier'),
    (RF, 'RandomForestClassifier'),
    (XGB, 'XGBClassifier'),
]:
    evaluation(clf, f'CMIYC_{clf_name}_ADASYN', X_adasyn, y_adasyn)