In [1]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
# Silence warnings of every category for the rest of the session.
warnings.filterwarnings('ignore')
# Remove pandas' display limits so rendered frames show all rows and all columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# STANDARDISATION

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def standardize_dataframe(dataframe):
    """Return a copy of ``dataframe`` with its numeric columns standardised.

    Columns of dtype float64, int64 or int32 are replaced by the output of a
    freshly fitted ``StandardScaler``; every other column is copied as is.
    The input frame is never modified.

    The scaler is fitted on the frame received by this call, so two frames
    standardised by two separate calls are each scaled with their own
    statistics.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to standardise.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index.
    """
    # Only these three dtypes are scaled; other dtypes are left untouched.
    numeric_cols = dataframe.select_dtypes(include=['float64', 'int64','int32']).columns
    # Work on a copy so the caller's frame stays intact.
    standardized_df = dataframe.copy()

    # Nothing to scale: fitting a scaler on zero columns would raise, so
    # hand back the untouched copy instead.
    if len(numeric_cols) == 0:
        return standardized_df

    scaler = StandardScaler()
    standardized_df[numeric_cols] = scaler.fit_transform(dataframe[numeric_cols])

    return standardized_df

In [3]:
def maximisation_marge(montant,status):
    """Return the margin earned (or lost) on one transaction.

    Parameters
    ----------
    montant : float
        Transaction amount.
    status : str
        Outcome of the decision, one of:
        "TP" (the fraudster is detected), "TN" (an honest customer is
        accepted), "FP" (an honest customer is blocked) or "FN" (a fraudster
        is accepted).

    Returns
    -------
    float or int
        0 for "TP"; 5 % of the amount for "TN"; 70 % of that margin for "FP";
        for "FN" a loss that grows with the amount (0 up to 20, then 20 %,
        30 %, 50 % and finally 80 % of the amount above 200).

    Raises
    ------
    ValueError
        If ``status`` is not one of the four outcomes above.
    """
    taux_marge = 0.05

    if status == "TP":
        # The fraudster is detected.
        return 0
    if status == "TN":
        # An honest customer is accepted.
        return taux_marge * montant
    if status == "FP":
        # An honest customer is blocked.
        return 0.7*taux_marge * montant
    if status == "FN":
        # A fraudster is accepted: the loss rate depends on the amount tier.
        if montant <= 20:
            return 0
        if montant <= 50:
            return -0.2 * montant
        if montant <= 100:
            return -0.3 * montant
        if montant <= 200:
            return -0.5 * montant
        return -0.8 * montant

    # Previously an unknown status fell through and hit an unbound local.
    raise ValueError(f"unknown status {status!r}; expected 'TP', 'TN', 'FP' or 'FN'")

# INITIALISATION - SCORER

In [4]:
def custom_score(y_true, y_pred, montants):
    """Sum the per-transaction margin over a set of predictions.

    The three arguments are read position by position (integer indexing from
    0 to ``len(montants) - 1``). Each (truth, prediction) pair is labelled
    "TP", "TN", "FP" or "FN" (1 = positive class, 0 = negative class) and the
    label is handed to ``maximisation_marge`` together with the amount.
    A pair that matches none of the four cases is passed on with an empty
    status string.

    Returns the accumulated margin (0 when ``montants`` is empty).
    """
    total_marge = 0
    for position in range(len(montants)):
        truth = y_true[position]
        guess = y_pred[position]

        if truth == 1:
            if guess == 1:
                status = "TP"
            elif guess == 0:
                status = "FN"
            else:
                status = ""
        elif truth == 0:
            if guess == 0:
                status = "TN"
            elif guess == 1:
                status = "FP"
            else:
                status = ""
        else:
            status = ""

        total_marge += maximisation_marge(montants[position], status)

    return total_marge

# INITIALISATION - CLASSIFIERS

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Per-class weights offered as a 'class_weight' candidate in the parameter grids.
class_weights = {0: 1.0, 1: 20.0}

# Estimators to tune, keyed by the name used to look up their parameter grid
# and to build the pickle file name. Commented-out entries are disabled.
# The trailing "# LUI" tags are the original author's markers; their meaning
# is not evident from this notebook.
models = {
    #'Random_Forest': RandomForestClassifier(), # LUI
    #'xgb_model': xgb.XGBClassifier(), # LUI
    #'Gradient_Boosting': GradientBoostingClassifier(),
    'Support_Vector_Machine': SVC(), # LUI
    'Linear_Discriminant_Analysis': LinearDiscriminantAnalysis() # LUI
    #'Logistic_Regression': LogisticRegression()
}
print(models)

{'Support_Vector_Machine': SVC(), 'Linear_Discriminant_Analysis': LinearDiscriminantAnalysis()}


In [6]:
# Hyper-parameter grids, looked up by model name.
# Each grid maps an estimator parameter to the list of candidate values; the
# manual grid search explores the Cartesian product of those lists.
# Commented-out entries are options that are currently not explored.
param_grids = {
    'Random_Forest': {
        'criterion': ['gini'],
        'n_estimators': [5,100],
        'max_depth': [None, 10],
        'class_weight' : [None, class_weights],
        'random_state': [42]
        },
    'xgb_model': {
        'objective': ['binary:logistic'],
        'n_estimators': [5,100],
        #'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [None,3,10],
        'subsample': [0.5,1.0],
        #'colsample_bytree': [0.8, 1.0],
        #'gamma': [0, 0.1, 0.2],
        #'min_child_weight': [1, 5, 10],
        'random_state': [42]
    },
    'Gradient_Boosting': {
        #'n_estimators': [50, 100, 200],
        #'learning_rate': [0.01, 0.1, 0.2],
        #'max_depth': [3, 5, 7],
        #'subsample': [0.8, 1.0],
        #'min_samples_split': [2, 5, 10],
        #'min_samples_leaf': [1, 2, 4],
        #'max_features': [None, 'sqrt', 'log2'],
        'random_state': [42]
    },
    'K_Nearest_Neighbors': {
        'n_neighbors': [3,10],
        #'weights': ['uniform', 'distance'],
        #'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        #'leaf_size': [20, 30, 40],
        #'p': [1, 2]
    },
    'Support_Vector_Machine': {
        'C': [0.1, 1.0, 5.0],
        'kernel': ['linear','rbf'],
        #'degree': [2, 3, 4],
        #'gamma': ['scale', 'auto'],
        'class_weight': [None,class_weights],
        'random_state': [42]    
       },
   'Neural_Network': {
        #'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        #'activation': ['relu', 'tanh', 'logistic'],
        #'solver': ['sgd', 'adam'],
        #'alpha': [0.0001, 0.001, 0.01],
        #'learning_rate': ['constant', 'invscaling', 'adaptive'],
        #'max_iter': [100, 200, 300],
        #'early_stopping': [True, False],
        'random_state': [42]
    },
    'Linear_Discriminant_Analysis': {
        # NOTE(review): scikit-learn documents shrinkage as working only with
        # the 'lsqr' and 'eigen' solvers, so the ('svd', 'auto') combination
        # of this product is expected to fail at fit time — confirm.
        'solver': ['svd', 'lsqr'],
        'shrinkage': [None, 'auto']
        #'n_components': [None, 1, 2, 3]
    },
     'Logistic_Regression': {
        #'penalty': ['l1', 'l2'],
        #'C': [0.001, 0.01, 0.1, 1, 10, 100],
        #'fit_intercept': [True, False],
        #'class_weight': [None, 'balanced'],
        #'solver': ['liblinear', 'saga'],
        #'max_iter': [100, 200, 300],
        'class_weight' : [None, class_weights],
        'random_state': [42]
    }
}


In [7]:
# Training-set variants; each name is also a sub-directory of ../data and ../models.
methods = ["simple","undersampling","smote"]
# Variants other than "simple" exist in one version per value below; the value is
# used in the file name "dataframe_train_<value>_percent.csv" and printed as "% of frauds".
percents = ["1","3","5"]

# TRAINING : GRID SEARCH - OPTIMISATION F1

In [8]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import make_scorer
# from sklearn.metrics import f1_score 
# from sklearn.model_selection import TimeSeriesSplit
# import pickle


# for m in methods:
#     if m == "simple":
#         df_train = pd.read_csv("../data/"+m+"/dataframe_train.csv")
#         df_train = df_train.sort_values(by="Heure")

#         X_train = df_train.drop(columns=["FlagImpaye","CodeDecision"])
#         X_train = standardize_dataframe(X_train) # on standardise les données
#         y_train = df_train["FlagImpaye"]

#         # Boucle sur chaque modèle
#         for model_name, model in models.items():
#             print(f"\nTraining {model_name} for method {m}")

#             # Définir les paramètres que vous souhaitez tester dans la recherche de grille
#             param_grid = param_grids[model_name]
#             print(param_grid)

#             f1_scorer = make_scorer(f1_score,greater_is_better=True)

#             # Utiliser TimeSeriesSplit pour la validation croisée
#             tscv = TimeSeriesSplit(n_splits=4)
    
#             # Créer la grille de recherche avec votre fonction personnalisée comme mesure d'évaluation
#             grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1_scorer, cv=tscv, n_jobs=-1)

#             # Effectuer la recherche de grille
#             grid_search.fit(X_train, y_train)

    
#             # Afficher les résultats
#             print(f"\n     Best parameters for {model_name}: ", grid_search.best_params_)
#             print(f"     Meilleur f1 score pour {model_name}: ", grid_search.best_score_)

#             # Sauvegarder le meilleur modèle si nécessaire
#             best_model = grid_search.best_estimator_
#             filename = '../models/'+m+"/"+ model_name + '.pkl'
#             pickle.dump(best_model, open(filename, "wb"))

#     else:
#         for p in percents:
#             df_train = pd.read_csv("../data/"+m+"/dataframe_train_"+p+"_percent.csv")
#             df_train = df_train.sort_values(by="Heure")
        
#             X_train = df_train.drop(columns=["FlagImpaye","CodeDecision"])

#             X_train = standardize_dataframe(X_train) # on standardise les données
#             y_train = df_train["FlagImpaye"]

#             # Boucle sur chaque modèle
#             for model_name, model in models.items():
#                 print(f"\nTraining {model_name} for method {m} and {p} % of frauds")

#                 # Définir les paramètres que vous souhaitez tester dans la recherche de grille
#                 param_grid = param_grids[model_name]
#                 print(param_grid)

#                 f1_scorer = make_scorer(f1_score,greater_is_better=True)

#                 # Utiliser TimeSeriesSplit pour la validation croisée
#                 tscv = TimeSeriesSplit(n_splits=4)
                
                
#                 # Créer la grille de recherche avec votre fonction personnalisée comme mesure d'évaluation
#                 grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1_scorer, cv=tscv, n_jobs=-1)

#                 # Effectuer la recherche de grille
#                 grid_search.fit(X_train, y_train)

                
#                 # Afficher les résultats
#                 print(f"\n     Best parameters for {model_name}: ", grid_search.best_params_)
#                 print(f"     Meilleur f1 score pour {model_name}: ", grid_search.best_score_)

#                 # Sauvegarder le meilleur modèle si nécessaire
#                 best_model = grid_search.best_estimator_
#                 filename = '../models/'+m+"/"+p+"/"+ model_name + '.pkl'
#                 pickle.dump(best_model, open(filename, "wb"))

# GRID SEARCH MARGE 

In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score 
from sklearn.model_selection import TimeSeriesSplit
from sklearn.base import clone
import pickle
from itertools import product
import numpy as np
from collections import Counter


def _marge_moyenne_cv(estimator, X_train, y_train, n_splits=4):
    """Return the mean margin of ``estimator`` over the TimeSeriesSplit folds.

    ``X_train`` / ``y_train`` must already be in chronological order. For each
    fold the estimator is refitted on the training part and scored with
    ``custom_score`` on the validation part.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    scores = []

    for train_index, val_index in tscv.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # The margin is computed on the raw amounts, so read them before
        # the validation fold is standardised.
        montants = X_val_fold["Montant"].values.tolist()

        # NOTE(review): each fold part is standardised with its own statistics
        # (a new scaler is fitted per call); kept as in the original.
        X_train_fold = standardize_dataframe(X_train_fold)
        X_val_fold = standardize_dataframe(X_val_fold)

        estimator.fit(X_train_fold, y_train_fold)
        y_val_pred = estimator.predict(X_val_fold).tolist()

        score_marge = custom_score(y_val_fold.values.tolist(), y_val_pred, montants)
        print("  marge : ",score_marge)
        scores.append(score_marge)

    return np.mean(scores)


def _recherche_grille_marge(model, param_grid, df_train):
    """Manual grid search maximising the mean cross-validated margin.

    Returns ``(best_params, best_score, best_model)``. ``best_model`` is the
    estimator fitted during the last fold of the winning combination, or
    ``None`` when no combination could be evaluated.
    """
    # Chronological order is required by TimeSeriesSplit; computed once
    # because it does not depend on the parameter combination.
    df_train_sorted = df_train.sort_values(by="Heure")
    X_train = df_train_sorted.drop(columns=["FlagImpaye","CodeDecision"])
    y_train = df_train_sorted["FlagImpaye"]

    best_params = None
    best_model = None
    # Margins can be negative, so the search must not start from 0.
    best_score = -np.inf

    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        print(param_dict)

        # set_params returns the estimator itself: without a clone every
        # combination would mutate the same object and the "best" model
        # would silently become the last combination tried.
        clf = clone(model).set_params(**param_dict)

        try:
            avg_score = _marge_moyenne_cv(clf, X_train, y_train)
        except (ValueError, NotImplementedError) as err:
            # Unsupported parameter combination: skip it instead of
            # aborting the whole search.
            print("   combination skipped : ", err)
            continue

        print("   moyenne : ",avg_score)

        if avg_score > best_score:
            best_score = avg_score
            best_params = param_dict
            best_model = clf

    return best_params, best_score, best_model


for m in methods:
    # (percent, training csv) pairs for this method; "simple" has a single
    # training file and no percent sub-directory.
    if m == "simple":
        jeux_entrainement = [(None, "../data/"+m+"/dataframe_train.csv")]
    else:
        jeux_entrainement = [(pct, "../data/"+m+"/dataframe_train_"+pct+"_percent.csv") for pct in percents]

    for p, chemin_csv in jeux_entrainement:
        df_train = pd.read_csv(chemin_csv)

        # Loop over each model
        for model_name, model in models.items():
            if p is None:
                print(f"\nTraining {model_name} for method {m}")
                filename = '../models/'+m+"/"+ model_name + '.pkl'
            else:
                print(f"\nTraining {model_name} for method {m} and {p} % of frauds")
                filename = '../models/'+m+"/"+p+"/"+ model_name + '.pkl'

            best_params, best_score, best_model = _recherche_grille_marge(
                model, param_grids[model_name], df_train
            )
            if best_model is None:
                raise RuntimeError(f"no parameter combination could be evaluated for {model_name}")

            # Show the results
            print(f"\n     Best parameters pour {model_name}: {best_params}")
            #print(f"     Best marge pour {model_name}: {best_score}")

            # Save the best model.
            # NOTE(review): it was fitted on the last fold's training part
            # only, not refitted on the full training set.
            with open(filename, "wb") as fichier:
                pickle.dump(best_model, fichier)



Training Support_Vector_Machine for method simple
{'C': 0.1, 'kernel': 'linear', 'class_weight': None, 'random_state': 42}


# LOAD MODELS

In [None]:
import pickle

# Rebuild the registry of tuned estimators from the pickle files:
#   loaded_models["simple"][model_name]               -> estimator
#   loaded_models[<method>][<percent>][model_name]    -> estimator
# NOTE: unpickling executes code stored in the file; only load trusted files.
loaded_models = {}

for m in methods:
    loaded_models[m] = {}

    # (target dict, directory) pairs to fill for this method.
    if m == "simple":
        emplacements = [(loaded_models[m], '../models/'+m+"/")]
    else:
        emplacements = []
        for p in percents:
            loaded_models[m][p] = {}
            emplacements.append((loaded_models[m][p], '../models/'+m+"/"+p+"/"))

    for destination, dossier in emplacements:
        for model_name in models.keys():
            filename = dossier + model_name + '.pkl'
            with open(filename, 'rb') as file:
                destination[model_name] = pickle.load(file)


# PREDICTING WITH BEST MARGE MODELS

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# Row labels of the result tables: "simple", then "<method>_<percent>".
index_modified = []
for m in methods:
    if m == "simple":
        index_modified.append(m)
    else:
        index_modified.extend(m+"_"+pct for pct in percents)


f1_df = pd.DataFrame(index=index_modified, columns=models.keys())
marge_df = pd.DataFrame(index=index_modified, columns=models.keys())
montant_gagne_df = pd.DataFrame(index=index_modified, columns=models.keys())

df_test = pd.read_csv("../data/simple/dataframe_test.csv")
X_test = df_test.drop(columns=["FlagImpaye","CodeDecision"])
# Raw amounts are needed by the margin score, so read them before scaling.
montants = X_test["Montant"].values.tolist()

# NOTE(review): the test set is standardised with its own statistics (a new
# scaler is fitted inside standardize_dataframe); kept as in the original.
X_test = standardize_dataframe(X_test)

y_test = df_test["FlagImpaye"].values.tolist()

# Margin obtained when every transaction is accepted. It does not depend on
# the model, so it is computed once instead of once per evaluated model.
marge_laisse_passer_tout_le_monde = round(custom_score(y_test,[0]*len(y_test),montants),2)

# Flatten the nested registry into (row label, banner context, name, model).
evaluations = []
for m in methods:
    if m == "simple":
        for model_name, model in loaded_models.get(m, {}).items():
            evaluations.append((m, f"for method {m}", model_name, model))
    else:
        for p, model_dic in loaded_models.get(m, {}).items():
            for model_name, model in model_dic.items():
                evaluations.append((m+"_"+p, f"for method {m} and {p} % of frauds", model_name, model))

for nom_ligne, contexte, model_name, model in evaluations:
    print(f"\n #### Testing {model_name} {contexte} ####")

    y_pred = model.predict(X_test)
    y_pred = y_pred.tolist()

    # f1 score
    f1 = f1_score(y_test, y_pred)
    f1_df.loc[nom_ligne, model_name] = f1

    # margin
    score_marge = custom_score(y_test, y_pred,montants)
    score_marge = round(score_marge,2)
    marge_df.loc[nom_ligne, model_name] = score_marge

    # gain over the "accept everyone" baseline
    montant_gagne = round(score_marge - marge_laisse_passer_tout_le_monde,2)
    montant_gagne_df.at[nom_ligne ,model_name] = montant_gagne

    # conf_matrix = confusion_matrix(y_test, y_pred)
    # disp = ConfusionMatrixDisplay(conf_matrix, display_labels=[False, True])
    # disp.plot()
    # plt.show()

    print("\n     F1 score de ",model_name," sur l'ensemble de test :", f1)
    print("     Marge de ",model_name," sur l'ensemble de test :", score_marge)
    print("     Montant gagné avec le modèle :", montant_gagne)


    
    

Random_Forest
RandomForestClassifier(class_weight={0: 1.0, 1: 20.0}, max_depth=10,
                       random_state=42)

 #### Testing Random_Forest for method simple ####

     F1 score de  Random_Forest  sur l'ensemble de test : 0.09256833464326261
     Marge de  Random_Forest  sur l'ensemble de test : 2002581.73
     Montant gagné avec le modèle : 60730.04000000004
xgb_model
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constra

In [None]:
# Upper reference: every prediction equals the true label.
marge_parfaite = round(custom_score(y_test, y_test, montants), 2)

# Lower reference: every transaction is predicted as class 0 (accepted).
tout_accepte = [0] * len(y_test)
marge_laisse_passer_tout_le_monde = round(custom_score(y_test, tout_accepte, montants), 2)

for libelle, valeur in (
    ("Marge Parfaite : ", marge_parfaite),
    ("Marge si on laisse passer tout le monde : ", marge_laisse_passer_tout_le_monde),
):
    print(libelle, valeur)

Marge Parfait :  2294459.36
Marge si on laisse passer tout le monde :  1941851.69


# Marge 

In [None]:
# Write the margin table (one row per training variant, one column per model) to disk.
marge_df.to_csv('../data/marge.csv')
# Last expression of the cell: display the table.
marge_df

Unnamed: 0,Random_Forest,xgb_model
simple,2002581.73,2014809.57
undersampling_1,1949305.8,1968845.38
undersampling_3,1984555.61,2046338.05
undersampling_5,1958502.65,2025025.42
smote_1,1942056.2,1878086.19
smote_3,1945739.11,1870332.09
smote_5,1944881.13,1946809.03


# Montant gagné

In [None]:
# Write the table of gains over the "accept everyone" baseline to disk.
montant_gagne_df.to_csv('../data/montant_gagne.csv')
# Last expression of the cell: display the table.
montant_gagne_df

Unnamed: 0,Random_Forest,xgb_model
simple,60730.04,72957.88
undersampling_1,7454.11,26993.69
undersampling_3,42703.92,104486.36
undersampling_5,16650.96,83173.73
smote_1,204.51,-63765.5
smote_3,3887.42,-71519.6
smote_5,3029.44,4957.34


# F1 Score

In [None]:
# Write the F1-score table (one row per training variant, one column per model) to disk.
f1_df.to_csv('../data/f1_score.csv')
# Last expression of the cell: display the table.
f1_df

Unnamed: 0,Random_Forest,xgb_model
simple,0.092568,0.053275
undersampling_1,0.010066,0.029808
undersampling_3,0.028452,0.079176
undersampling_5,0.027379,0.038525
smote_1,0.003036,0.020569
smote_3,0.006263,0.018605
smote_5,0.003938,0.026839
