In [1]:
import pickle

with open('dataframe.pkl', 'rb') as file:
    df = pickle.load(file)

### On définit la partie Train et Test

In [2]:
# Définition des dates limites pour les ensembles d'apprentissage et de test
train_inf = '2017-02-01'
train_sup = '2017-08-31'
test_inf = '2017-09-01'
test_sup = '2017-11-30'

In [3]:
train = df.loc[(df['DateTransaction'] >= train_inf) & (df['DateTransaction'] <= train_sup)]
X_train = train.drop(columns=['FlagImpaye','CodeDecision','DateTransaction'])
y_train = train['FlagImpaye']

In [4]:
test = df.loc[(df['DateTransaction'] >= test_inf) & (df['DateTransaction'] <= test_sup)]
X_test = test.drop(columns=['FlagImpaye','CodeDecision','DateTransaction'])
y_test = test['FlagImpaye']

In [5]:
print(len(X_train))
print(len(X_test))

3888468
737068


## Test de chaque méthode d'échantillonage par modeles

In [5]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from tqdm import tqdm
from sklearn import metrics
from sklearn.metrics import f1_score
#!pip install 'imblearn'
#!pip install xgboost

In [6]:
# Définir les stratégies d'échantillonnage
sampling_strategies = [0.05, 0.1]  # Ajoutez les valeurs que vous souhaitez tester
#on garde deux oversampling et deux undersampling
samplers = [RandomOverSampler, SMOTE, RandomUnderSampler, NearMiss]

On fait tourner DecisionTreeClassifier, RandomForestClassifier, GradientBoostingClassifier,KNeighborsClassifier, SVC, adaboost, xgboost et LogisticRegression 

DecisionTreeClassifier:

In [9]:
# Définir les modèles à tester
models = [DecisionTreeClassifier()]

# Initialiser un DataFrame pour stocker les résultats
results_df_no_sampling = pd.DataFrame(columns=['Model', 'Sampler', 'Sampling Strategy', 'F1 Score'])

# Parcourir chaque modèle
for model in models:
    # Boucler sur chaque échantillonneur (sampler)
    for sampler in samplers:
        # Parcourir chaque valeur de sampling_strategy
        for strategy in sampling_strategies:
            # Entraîner le modèle sur les données d'entraînement d'origine
            model.fit(X_train, y_train)
            
            # Faire des prédictions sur l'ensemble de test
            y_pred = model.predict(X_test)

            # Évaluer les performances du modèle
            f1 = f1_score(y_test, y_pred)

            # Ajouter les résultats au DataFrame
            results_df_no_sampling = pd.concat([results_df_no_sampling, pd.DataFrame({
                'Model': [model.__class__.__name__],
                'Sampler': [sampler.__name__],
                'Sampling Strategy': [strategy],
                'F1 Score': [f1]
            })], ignore_index=True)

            # Afficher le message de progression
            tqdm.write(f"Model: {model.__class__.__name__}, Sampler: {sampler.__name__}, Strategy: {strategy}, F1 Score: {f1}")

# Afficher le tableau récapitulatif
print(results_df_no_sampling)
results_df_no_sampling.to_pickle('dataframe_recap_no_sampling.pkl')

  results_df_no_sampling = pd.concat([results_df_no_sampling, pd.DataFrame({


Model: DecisionTreeClassifier, Sampler: RandomOverSampler, Strategy: 0.05, F1 Score: 0.02942850174782538
Model: DecisionTreeClassifier, Sampler: RandomOverSampler, Strategy: 0.1, F1 Score: 0.030453588024375734
Model: DecisionTreeClassifier, Sampler: SMOTE, Strategy: 0.05, F1 Score: 0.029341692789968652
Model: DecisionTreeClassifier, Sampler: SMOTE, Strategy: 0.1, F1 Score: 0.03085717997856238
Model: DecisionTreeClassifier, Sampler: RandomUnderSampler, Strategy: 0.05, F1 Score: 0.030474097017535094
Model: DecisionTreeClassifier, Sampler: RandomUnderSampler, Strategy: 0.1, F1 Score: 0.031470992391996515
Model: DecisionTreeClassifier, Sampler: NearMiss, Strategy: 0.05, F1 Score: 0.030382980153595376
Model: DecisionTreeClassifier, Sampler: NearMiss, Strategy: 0.1, F1 Score: 0.03037588122543127
                    Model             Sampler  Sampling Strategy  F1 Score
0  DecisionTreeClassifier   RandomOverSampler               0.05  0.029429
1  DecisionTreeClassifier   RandomOverSampler    

RandomForestClassifier:

In [11]:
# Définir les modèles à tester
models = [RandomForestClassifier()]

# Initialiser un DataFrame pour stocker les résultats
results_df_no_sampling_2 = pd.DataFrame(columns=['Model', 'Sampler', 'Sampling Strategy', 'F1 Score'])

# Parcourir chaque modèle
for model in models:
    # Boucler sur chaque échantillonneur (sampler)
    for sampler in samplers:
        # Parcourir chaque valeur de sampling_strategy
        for strategy in sampling_strategies:
            # Entraîner le modèle sur les données d'entraînement d'origine
            model.fit(X_train, y_train)
            
            # Faire des prédictions sur l'ensemble de test
            y_pred = model.predict(X_test)

            # Évaluer les performances du modèle
            f1 = f1_score(y_test, y_pred)

            # Ajouter les résultats au DataFrame
            results_df_no_sampling_2 = pd.concat([results_df_no_sampling_2, pd.DataFrame({
                'Model': [model.__class__.__name__],
                'Sampler': [sampler.__name__],
                'Sampling Strategy': [strategy],
                'F1 Score': [f1]
            })], ignore_index=True)

            # Afficher le message de progression
            tqdm.write(f"Model: {model.__class__.__name__}, Sampler: {sampler.__name__}, Strategy: {strategy}, F1 Score: {f1}")

# Afficher le tableau récapitulatif
print(results_df_no_sampling_2)
results_df_no_sampling_2.to_pickle('dataframe_recap_no_sampling_2.pkl')

  results_df_no_sampling_2 = pd.concat([results_df_no_sampling_2, pd.DataFrame({


Model: RandomForestClassifier, Sampler: RandomOverSampler, Strategy: 0.05, F1 Score: 0.038950276243093926


SVC:

In [None]:
# Définir les modèles à tester
models = [SVC()]

# Initialiser un DataFrame pour stocker les résultats
results_df_no_sampling_3 = pd.DataFrame(columns=['Model', 'Sampler', 'Sampling Strategy', 'F1 Score'])

# Parcourir chaque modèle
for model in models:
    # Boucler sur chaque échantillonneur (sampler)
    for sampler in samplers:
        # Parcourir chaque valeur de sampling_strategy
        for strategy in sampling_strategies:
            # Entraîner le modèle sur les données d'entraînement d'origine
            model.fit(X_train, y_train)
            
            # Faire des prédictions sur l'ensemble de test
            y_pred = model.predict(X_test)

            # Évaluer les performances du modèle
            f1 = f1_score(y_test, y_pred)

            # Ajouter les résultats au DataFrame
            results_df_no_sampling_3 = pd.concat([results_df_no_sampling_3, pd.DataFrame({
                'Model': [model.__class__.__name__],
                'Sampler': [sampler.__name__],
                'Sampling Strategy': [strategy],
                'F1 Score': [f1]
            })], ignore_index=True)

            # Afficher le message de progression
            tqdm.write(f"Model: {model.__class__.__name__}, Sampler: {sampler.__name__}, Strategy: {strategy}, F1 Score: {f1}")

# Afficher le tableau récapitulatif
print(results_df_no_sampling_3)
results_df_no_sampling_3.to_pickle('dataframe_recap_no_sampling_3.pkl')

AdaBoostClassifier

In [10]:
# Définir les modèles à tester
models = [AdaBoostClassifier()]

# Initialiser un DataFrame pour stocker les résultats
results_df_no_sampling_4 = pd.DataFrame(columns=['Model', 'Sampler', 'Sampling Strategy', 'F1 Score'])

# Parcourir chaque modèle
for model in models:
    # Boucler sur chaque échantillonneur (sampler)
    for sampler in samplers:
        # Parcourir chaque valeur de sampling_strategy
        for strategy in sampling_strategies:
            # Entraîner le modèle sur les données d'entraînement d'origine
            model.fit(X_train, y_train)
            
            # Faire des prédictions sur l'ensemble de test
            y_pred = model.predict(X_test)

            # Évaluer les performances du modèle
            f1 = f1_score(y_test, y_pred)

            # Ajouter les résultats au DataFrame
            results_df_no_sampling_4 = pd.concat([results_df_no_sampling_4, pd.DataFrame({
                'Model': [model.__class__.__name__],
                'Sampler': [sampler.__name__],
                'Sampling Strategy': [strategy],
                'F1 Score': [f1]
            })], ignore_index=True)

            # Afficher le message de progression
            tqdm.write(f"Model: {model.__class__.__name__}, Sampler: {sampler.__name__}, Strategy: {strategy}, F1 Score: {f1}")

# Afficher le tableau récapitulatif
print(results_df_no_sampling_4)
results_df_no_sampling_4.to_pickle('dataframe_recap_no_sampling_4.pkl')

  results_df_no_sampling_4 = pd.concat([results_df_no_sampling_4, pd.DataFrame({


Model: AdaBoostClassifier, Sampler: RandomOverSampler, Strategy: 0.05, F1 Score: 0.0109772831224272




Model: AdaBoostClassifier, Sampler: RandomOverSampler, Strategy: 0.1, F1 Score: 0.0109772831224272




Model: AdaBoostClassifier, Sampler: SMOTE, Strategy: 0.05, F1 Score: 0.0109772831224272




Model: AdaBoostClassifier, Sampler: SMOTE, Strategy: 0.1, F1 Score: 0.0109772831224272




Model: AdaBoostClassifier, Sampler: RandomUnderSampler, Strategy: 0.05, F1 Score: 0.0109772831224272




Model: AdaBoostClassifier, Sampler: RandomUnderSampler, Strategy: 0.1, F1 Score: 0.0109772831224272




Model: AdaBoostClassifier, Sampler: NearMiss, Strategy: 0.05, F1 Score: 0.0109772831224272




Model: AdaBoostClassifier, Sampler: NearMiss, Strategy: 0.1, F1 Score: 0.0109772831224272
                Model             Sampler  Sampling Strategy  F1 Score
0  AdaBoostClassifier   RandomOverSampler               0.05  0.010977
1  AdaBoostClassifier   RandomOverSampler               0.10  0.010977
2  AdaBoostClassifier               SMOTE               0.05  0.010977
3  AdaBoostClassifier               SMOTE               0.10  0.010977
4  AdaBoostClassifier  RandomUnderSampler               0.05  0.010977
5  AdaBoostClassifier  RandomUnderSampler               0.10  0.010977
6  AdaBoostClassifier            NearMiss               0.05  0.010977
7  AdaBoostClassifier            NearMiss               0.10  0.010977


XGBClassifier:

In [7]:
# Définir les modèles à tester
models = [XGBClassifier()]

# Initialiser un DataFrame pour stocker les résultats
results_df_no_sampling_5 = pd.DataFrame(columns=['Model', 'Sampler', 'Sampling Strategy', 'F1 Score'])

# Parcourir chaque modèle
for model in models:
    # Boucler sur chaque échantillonneur (sampler)
    for sampler in samplers:
        # Parcourir chaque valeur de sampling_strategy
        for strategy in sampling_strategies:
            # Entraîner le modèle sur les données d'entraînement d'origine
            model.fit(X_train, y_train)
            
            # Faire des prédictions sur l'ensemble de test
            y_pred = model.predict(X_test)

            # Évaluer les performances du modèle
            f1 = f1_score(y_test, y_pred)

            # Ajouter les résultats au DataFrame
            results_df_no_sampling_5 = pd.concat([results_df_no_sampling_5, pd.DataFrame({
                'Model': [model.__class__.__name__],
                'Sampler': [sampler.__name__],
                'Sampling Strategy': [strategy],
                'F1 Score': [f1]
            })], ignore_index=True)

            # Afficher le message de progression
            tqdm.write(f"Model: {model.__class__.__name__}, Sampler: {sampler.__name__}, Strategy: {strategy}, F1 Score: {f1}")

# Afficher le tableau récapitulatif
print(results_df_no_sampling_5)
results_df_no_sampling_5.to_pickle('dataframe_recap_no_sampling_5.pkl')

  results_df_no_sampling_5 = pd.concat([results_df_no_sampling_5, pd.DataFrame({


Model: XGBClassifier, Sampler: RandomOverSampler, Strategy: 0.05, F1 Score: 0.06242712350883487
Model: XGBClassifier, Sampler: RandomOverSampler, Strategy: 0.1, F1 Score: 0.06242712350883487
Model: XGBClassifier, Sampler: SMOTE, Strategy: 0.05, F1 Score: 0.06242712350883487
Model: XGBClassifier, Sampler: SMOTE, Strategy: 0.1, F1 Score: 0.06242712350883487
Model: XGBClassifier, Sampler: RandomUnderSampler, Strategy: 0.05, F1 Score: 0.06242712350883487
Model: XGBClassifier, Sampler: RandomUnderSampler, Strategy: 0.1, F1 Score: 0.06242712350883487
Model: XGBClassifier, Sampler: NearMiss, Strategy: 0.05, F1 Score: 0.06242712350883487
Model: XGBClassifier, Sampler: NearMiss, Strategy: 0.1, F1 Score: 0.06242712350883487
           Model             Sampler  Sampling Strategy  F1 Score
0  XGBClassifier   RandomOverSampler               0.05  0.062427
1  XGBClassifier   RandomOverSampler               0.10  0.062427
2  XGBClassifier               SMOTE               0.05  0.062427
3  XGBClassi

LogisticRegression:

In [8]:
# Définir les modèles à tester
models = [LogisticRegression()]

# Initialiser un DataFrame pour stocker les résultats
results_df_no_sampling_6 = pd.DataFrame(columns=['Model', 'Sampler', 'Sampling Strategy', 'F1 Score'])

# Parcourir chaque modèle
for model in models:
    # Boucler sur chaque échantillonneur (sampler)
    for sampler in samplers:
        # Parcourir chaque valeur de sampling_strategy
        for strategy in sampling_strategies:
            # Entraîner le modèle sur les données d'entraînement d'origine
            model.fit(X_train, y_train)
            
            # Faire des prédictions sur l'ensemble de test
            y_pred = model.predict(X_test)

            # Évaluer les performances du modèle
            f1 = f1_score(y_test, y_pred)

            # Ajouter les résultats au DataFrame
            results_df_no_sampling_6 = pd.concat([results_df_no_sampling_6, pd.DataFrame({
                'Model': [model.__class__.__name__],
                'Sampler': [sampler.__name__],
                'Sampling Strategy': [strategy],
                'F1 Score': [f1]
            })], ignore_index=True)

            # Afficher le message de progression
            tqdm.write(f"Model: {model.__class__.__name__}, Sampler: {sampler.__name__}, Strategy: {strategy}, F1 Score: {f1}")

# Afficher le tableau récapitulatif
print(results_df_no_sampling_6)
results_df_no_sampling_6.to_pickle('dataframe_recap_no_sampling_6.pkl')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  results_df_no_sampling_6 = pd.concat([results_df_no_sampling_6, pd.DataFrame({


Model: LogisticRegression, Sampler: RandomOverSampler, Strategy: 0.05, F1 Score: 0.0075392038600723766


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression, Sampler: RandomOverSampler, Strategy: 0.1, F1 Score: 0.0075392038600723766


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression, Sampler: SMOTE, Strategy: 0.05, F1 Score: 0.0075392038600723766


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression, Sampler: SMOTE, Strategy: 0.1, F1 Score: 0.0075392038600723766


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression, Sampler: RandomUnderSampler, Strategy: 0.05, F1 Score: 0.0075392038600723766


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression, Sampler: RandomUnderSampler, Strategy: 0.1, F1 Score: 0.0075392038600723766


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression, Sampler: NearMiss, Strategy: 0.05, F1 Score: 0.0075392038600723766


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression, Sampler: NearMiss, Strategy: 0.1, F1 Score: 0.0075392038600723766
                Model             Sampler  Sampling Strategy  F1 Score
0  LogisticRegression   RandomOverSampler               0.05  0.007539
1  LogisticRegression   RandomOverSampler               0.10  0.007539
2  LogisticRegression               SMOTE               0.05  0.007539
3  LogisticRegression               SMOTE               0.10  0.007539
4  LogisticRegression  RandomUnderSampler               0.05  0.007539
5  LogisticRegression  RandomUnderSampler               0.10  0.007539
6  LogisticRegression            NearMiss               0.05  0.007539
7  LogisticRegression            NearMiss               0.10  0.007539
