In [1]:
import pickle

import pandas as pd
import xgboost as xgb
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours, NearMiss, ClusterCentroids
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline

In [2]:
# Load the previously serialized dataset into `df`.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this on a 'dataframe.pkl' that comes from a trusted source.
pickle_path = 'dataframe.pkl'
with open(pickle_path, 'rb') as file:
    df = pickle.load(file)

## Apprentissage et test ##

In [3]:
# Lower/upper date bounds of the training and test windows.
# Both bounds are applied inclusively (>= / <=) by the filtering cells below.
train_inf, train_sup = '2017-02-01', '2017-08-31'
test_inf, test_sup = '2017-09-01', '2017-11-30'

Apprentissage:

In [4]:
# Training rows: DateTransaction inside [train_inf, train_sup], both ends included.
# NOTE(review): if DateTransaction carries a time-of-day component, the upper
# bound string would compare as midnight and drop most of the last day — confirm.
in_train_window = df['DateTransaction'].between(train_inf, train_sup)
train = df.loc[in_train_window]

In [5]:
# Split the training frame: FlagImpaye is the target, every other column is a feature.
y_train = train['FlagImpaye']
x_train = train.drop('FlagImpaye', axis=1)

Test:

In [6]:
# Test rows: DateTransaction inside [test_inf, test_sup], both ends included.
# NOTE(review): if DateTransaction carries a time-of-day component, the upper
# bound string would compare as midnight and drop most of the last day — confirm.
in_test_window = df['DateTransaction'].between(test_inf, test_sup)
test = df.loc[in_test_window]

In [7]:
# Split the test frame: FlagImpaye is the target, every other column is a feature.
y_test = test['FlagImpaye']
x_test = test.drop('FlagImpaye', axis=1)

## Gridsearch avec les différents modèles supervisés ##

On enlève la colonne DateTransaction pour cette méthode, car son type de données n'est pas pris en charge par le modèle.

In [8]:
# Feature matrices without the target (FlagImpaye) and without DateTransaction,
# which the notebook excludes because its data type does not fit this method.
# They are rebuilt from `train` / `test` rather than from `x_train` / `x_test`
# themselves, so this cell can be re-run without raising KeyError on a column
# that a previous run already removed.
x_train = train.drop(columns=['FlagImpaye', 'DateTransaction'])
x_test = test.drop(columns=['FlagImpaye', 'DateTransaction'])

### Tomek Link

In [9]:
# Gradient boosting classifier (an ensemble of trees, not a single decision tree).
# random_state is fixed so that repeated runs of the notebook give the same model.
gb_classifier = GradientBoostingClassifier(random_state=42)

In [10]:
# Tomek links under-sampling, applied here once to the whole training set.
# `tl` is also reused as the sampling step of the pipeline defined further down.
tl = TomekLinks()
resampled_train = tl.fit_resample(x_train, y_train)
x_train_tl, y_train_tl = resampled_train

In [None]:
# Time-ordered cross-validation strategy with 5 splits.
# NOTE(review): TimeSeriesSplit splits by row position, so it presumably expects
# the training rows to be in chronological order — confirm `df` is sorted by date.
n_cv_splits = 5
tscv = TimeSeriesSplit(n_splits=n_cv_splits)

In [None]:
# Pipeline: Tomek links under-sampling followed by the gradient boosting classifier.
# The imbalanced-learn Pipeline (imported as ImbPipeline) is required here:
# scikit-learn's Pipeline only accepts intermediate steps that implement
# `transform`, whereas TomekLinks is a sampler exposing `fit_resample`.
# With ImbPipeline the sampler runs during `fit` only, never at predict time.
pipeline = ImbPipeline(steps=[
    ('tomeklink', tl),
    ('classifier', gb_classifier),
])

In [2]:
# Hyper-parameter values explored by the grid search for the classifier.
classifier_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [2, 4],
}
# Keys are prefixed with the pipeline step name ('classifier') followed by '__',
# which is how the search addresses parameters of a step inside a pipeline.
param_grid = {
    'classifier__' + param_name: candidates
    for param_name, candidates in classifier_grid.items()
}

In [None]:
# Grid search over `param_grid`, scored with F1, using the time-ordered CV splitter.
# The search is fitted on the original training data (x_train, y_train), not on the
# already under-sampled x_train_tl / y_train_tl: the pipeline contains the Tomek
# links step itself, so fitting on resampled data would resample twice and would
# also compute the validation scores on resampled folds.
grid_search = GridSearchCV(pipeline, param_grid, scoring='f1', cv=tscv)
grid_search.fit(x_train, y_train)