In [2]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

#pd.set_option('display.max_columns', None)

In [3]:
# Load the dataset from a CSV expected in the current working directory.
df_final = pd.read_csv('df_final_before_preprocessing.csv')

# Separation train and test data

In [15]:
# Absolute class counts of the TARGET column.
df_final.TARGET.value_counts()

0    282682
1     24825
Name: TARGET, dtype: int64

In [16]:
# Class proportions of TARGET, as a share of all rows of df_final.
# (The previous version divided by `df.shape[0]`, but no `df` is defined in
# this notebook, so the cell failed on a fresh kernel.)
df_final.TARGET.value_counts() / len(df_final)

0    0.91927
1    0.08073
Name: TARGET, dtype: float64

In [17]:
# Label vector, the frame with only the label removed (customer id kept),
# and the model features (label and customer id both removed).
target = df_final['TARGET']
inputs_with_ids = df_final.drop(columns=['TARGET'])
inputs = inputs_with_ids.drop(columns=['SK_ID_CURR'])

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.3,
    random_state=1,
)

In [19]:
# Sanity check: (rows, columns) of the train and test feature frames.
X_train.shape, X_test.shape

((215254, 339), (92253, 339))

In [20]:
# Recover the test rows, customer id included, through the index labels
# that train_test_split preserved on X_test.
index_test = X_test.index.tolist()
X_test_with_id = inputs_with_ids.loc[index_test, :]

In [21]:
# Export the test rows (taken from inputs_with_ids, i.e. before any
# imputation or scaling) without writing the index column.
X_test_with_id.to_csv('CustomerDataToBePredicted.csv',index=False)

# Preprocessing

In [22]:
# Column names grouped by dtype: integer-typed columns are imputed with the
# most frequent value, float columns are mean-imputed and scaled.
columns_int = X_train.select_dtypes(include=['uint8', 'int64']).columns.tolist()
columns_float = X_train.select_dtypes(include=['float64']).columns.tolist()

In [23]:
# Work on copies so the raw splits stay untouched by the steps below.
X_train_preprocess, X_test_preprocess = X_train.copy(), X_test.copy()

In [24]:
from sklearn.impute import SimpleImputer

# One imputer per column group; each is fitted on the training split only
# and then applied to both splits.
imp_most_frequent = SimpleImputer(strategy="most_frequent")
imp_mean = SimpleImputer(strategy="mean")

# Integer-typed columns: fill with the most frequent training value.
X_train_preprocess[columns_int] = imp_most_frequent.fit_transform(X_train_preprocess[columns_int])
X_test_preprocess[columns_int] = imp_most_frequent.transform(X_test_preprocess[columns_int])

# Float columns: fill with the training mean.
X_train_preprocess[columns_float] = imp_mean.fit_transform(X_train_preprocess[columns_float])
X_test_preprocess[columns_float] = imp_mean.transform(X_test_preprocess[columns_float])

In [25]:
from sklearn.preprocessing import RobustScaler

# Scale the float columns with statistics learned from the training split only.
rs_scaler = RobustScaler()

X_train_preprocess[columns_float] = rs_scaler.fit_transform(X_train_preprocess[columns_float])
X_test_preprocess[columns_float] = rs_scaler.transform(X_test_preprocess[columns_float])

In [26]:
# Shapes after the manual imputation and scaling steps.
X_train_preprocess.shape, X_test_preprocess.shape

((215254, 339), (92253, 339))

## Pipeline preprocessing

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Pipeline data transformation (Imputation / Scaling):
def Preprocessing(numeric, categoric):
    """Build the (unfitted) column-wise preprocessing transformer.

    Parameters
    ----------
    numeric : column selection routed through mean imputation followed by
        robust scaling.
    categoric : column selection routed through most-frequent imputation.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer; columns in neither selection are passed
        through unchanged.
    """
    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler()),
    ])
    categoric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="most_frequent")),
    ])
    return ColumnTransformer(
        [
            ("numeric", numeric_pipeline, numeric),
            ("categorical", categoric_pipeline, categoric),
        ],
        remainder='passthrough',
    )

In [28]:
# Fit the preprocessing transformer on the raw training split only:
# float columns go to the numeric branch, integer-typed columns to the categorical one.
preprocessor_fitted = Preprocessing(columns_float,columns_int).fit(X_train)

In [29]:
# Apply the fitted transformer to both raw splits. This rebinds the names
# assigned by the manual preprocessing cells earlier in the notebook.
X_train_preprocess = preprocessor_fitted.transform(X_train)
X_test_preprocess = preprocessor_fitted.transform(X_test)

In [30]:
# Shapes after the pipeline transform.
X_train_preprocess.shape, X_test_preprocess.shape

((215254, 339), (92253, 339))

# Resampling

Links: https://towardsdatascience.com/imbalanced-classification-in-python-smote-tomek-links-method-6e48dfe69bbc

#### from imblearn.under_sampling import RandomUnderSampler
- Le sous-échantillonnage aléatoire (random undersampling) des observations majoritaires: on retire aléatoirement des observations majoritaires

#### from imblearn.under_sampling import TomekLinks
- Le sous-échantillonnage synthétique (TomekLinks) des observations majoritaires: on retire des observations majoritaires, ressemblantes à des minoritaires (des majoritaires ambigues)

#### from imblearn.over_sampling import RandomOverSampler
- Le sur-échantillonnage aléatoire (random oversampling) des observations minoritaires : on tire au hasard des individus minoritaires (on les duplique) que l’on rajoute aux données.

#### from imblearn.over_sampling import SMOTE
- Le sur-échantillonnage synthétique (SMOTE pour Synthetic Minority Oversampling Technique) produit des observations minoritaires ressemblantes mais distinctes de celles déjà existantes.

- Nous allons combiner SMOTE avec RandomUnderSampler pour augmenter l'efficacité de la gestion de la classe déséquilibrée.
Je n'utilise pas Tomek car cela prend beaucoup trop de temps par rapport à la valeur ajoutée.

Doc: https://imbalanced-learn.org/dev/references/generated/imblearn.combine.SMOTETomek.html

In [31]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
#from imblearn.under_sampling import TomekLinks
#from imblearn.combine import SMOTETomek

In [32]:
## Over-sampling
# SMOTE creates synthetic minority samples until minority/majority = 0.4.
# Seeded so that a re-run regenerates the same synthetic samples.
smote = SMOTE(sampling_strategy=0.4, random_state=1)

## Under-sampling 
#tomek = TomekLinks(sampling_strategy='majority')
# Randomly drops majority rows until minority/majority = 0.6; seeded as well.
rus = RandomUnderSampler(sampling_strategy=0.6, random_state=1)

## Over-sampling then under-sampling
#smtomek = SMOTETomek(sampling_strategy=0.2, tomek=TomekLinks(sampling_strategy=0.4))

In [33]:
# Class counts in the training labels before any resampling.
y_train.value_counts()

0    197857
1     17397
Name: TARGET, dtype: int64

In [34]:
%%time
# Over-sample the minority class; only the training split is resampled.
X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocess, y_train)

CPU times: user 15.9 s, sys: 13.4 s, total: 29.3 s
Wall time: 9.83 s


In [35]:
# Class counts after SMOTE.
y_train_smote.value_counts()

0    197857
1     79142
Name: TARGET, dtype: int64

In [36]:
# Class proportions after SMOTE. Divide by the row count as a scalar:
# `.shape` is a tuple and only worked through length-1 broadcasting.
y_train_smote.value_counts() / len(y_train_smote)

0    0.714288
1    0.285712
Name: TARGET, dtype: float64

In [37]:
%%time
# Randomly under-sample the majority class of the SMOTE output.
X_train_rebalanced, y_train_rebalanced = rus.fit_resample(X_train_smote, y_train_smote)

CPU times: user 160 ms, sys: 428 ms, total: 588 ms
Wall time: 594 ms


In [38]:
# Class counts after over- then under-sampling.
y_train_rebalanced.value_counts()

0    131903
1     79142
Name: TARGET, dtype: int64

In [39]:
# Final class proportions of the rebalanced training labels. Divide by the
# row count as a scalar: `.shape` is a tuple and only worked through
# length-1 broadcasting.
y_train_rebalanced.value_counts() / len(y_train_rebalanced)

0    0.624999
1    0.375001
Name: TARGET, dtype: float64

# Modelling and optimization

- Score auc 
- Métrique métier (f beta score)

In [40]:
## Modelisation
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

## scoring the grid search on the train
# roc_auc par défault
from sklearn.metrics import  make_scorer
from sklearn.metrics import fbeta_score

# Scoring the test (y_true, y_pred and y proba for roc_auc_score)
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
#from sklearn.metrics import fbeta_score

## Baseline model

In [41]:
# Class counts in the test labels (the test split is never resampled).
y_test.value_counts()

0    84825
1     7428
Name: TARGET, dtype: int64

In [42]:
# Class counts of the rebalanced training labels, shown next to the test counts for comparison.
y_train_rebalanced.value_counts()

0    131903
1     79142
Name: TARGET, dtype: int64

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

# Baseline model: uniformly random class predictions, used as a floor for the
# real models. Seeded so the printed score is repeatable, and scored on the
# preprocessed test matrix, i.e. the same representation it was fitted on.
baseline_model = DummyClassifier(strategy="uniform", random_state=1)
baseline_model.fit(X_train_rebalanced, y_train_rebalanced)
print(f"Score : {baseline_model.score(X_test_preprocess, y_test)}")

Score : 0.5004498498693809


## Random Forest Classifier

In [45]:
def optim_model(model, param_grid, scoring, X, y, cv=5, n_jobs=None):
    """Run an exhaustive grid search and return its best result.

    Parameters
    ----------
    model : estimator to tune.
    param_grid : dict mapping hyperparameter names to the values to try.
    scoring : scorer name or scorer callable used to rank candidates.
    X, y : training features and labels.
    cv : int or cross-validation splitter, default 5 (the previous fixed
        value). Pass a shuffled splitter when the rows of ``X`` are ordered,
        for example after resampling.
    n_jobs : int or None, default None. Forwarded to ``GridSearchCV``;
        ``-1`` uses all cores.

    Returns
    -------
    tuple
        ``(best_score_, best_params_, best_estimator_)`` of the fitted search.

    Notes
    -----
    NOTE(review): if ``X``/``y`` were resampled before this call, the
    validation folds contain resampled rows and the returned score may be
    optimistic; confirm against a hold-out set.
    """
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring,
                               cv=cv, n_jobs=n_jobs)
    grid_search.fit(X, y)
    return grid_search.best_score_, grid_search.best_params_, grid_search.best_estimator_

In [46]:
# Scorers available to the grid searches: ROC AUC by name, and an
# F-beta scorer with beta=2.
scorers = {
    'roc_auc': 'roc_auc',
    'fbeta_score': make_scorer(fbeta_score, beta=2),
}

In [47]:
# Random forest to tune (seeded for repeatable fits).
rfc_model = RandomForestClassifier(random_state=0)

# Candidate values for each tuned hyperparameter.
n_estimators = [100, 130, 160]
criterions = ['gini', 'entropy']
max_depth = range(2,5)

# Search grid handed to the grid search.
rfc_parameters = dict(
    n_estimators=n_estimators,
    criterion=criterions,
    max_depth=max_depth,
)

In [48]:
## Optimisation sur le score auc
# rf_score_auc, rf_param_auc, rf_estimator_auc = optim_model(model=rfc_model, param_grid=rfc_parameters, scoring=scorers['roc_auc'],
#                                                            X=X_train_smtomek, y=y_train_smtomek)

- CPU times: user 2h 6min 55s, sys: 1min 58s, total: 2h 8min 53s
- Wall time: 2h 8min 55s
    
- Score auc: 0.9349012560822322
- Paramètres: 'criterion': 'entropy', 'max_depth': 4, 'n_estimators': 130
- Estimateur optimal au niveau du score auc: RandomForestClassifier(criterion='entropy', max_depth=4, n_estimators=130, random_state=0)

In [49]:
## Score f beta
#rf_score_beta, rf_param_beta, rf_estimator_beta = optim_model(model=rfc_model, param_grid=rfc_parameters, scoring=scorers['fbeta_score'],
#                                                  X=X_train_smtomek, y=y_train_smtomek)

- CPU times: user 2h 7min 53s, sys: 1min 56s, total: 2h 9min 49s
- Wall time: 2h 9min 51s
    
- Score f beta: 0.8575190702089286
- Paramètres: 'criterion': 'entropy', 'max_depth': 4, 'n_estimators': 100
- Estimateur optimal au niveau du score f beta: RandomForestClassifier(criterion='entropy', max_depth=4, random_state=0)

## Light Gbm

In [50]:
# LightGBM classifier to tune (seeded for repeatable fits).
lgbm_model = lgb.LGBMClassifier(random_state=1)

# Search grid handed to the grid search.
lgbm_parameters = {
    'num_leaves': [20, 25, 30],
    'max_depth': range(2,5),
    'learning_rate': [0.1, 0.3, 0.5],
    'n_estimators': [100, 500, 1000],
}

In [52]:
import re

#X_train_rebalanced = X_train_rebalanced.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [53]:
## Score auc
# lgbm_score_auc, lgbm_param_auc, lgbm_estimator_auc = optim_model(model=lgbm_model, param_grid=lgbm_parameters, scoring=scorers['roc_auc'],
#                                                                 X=X_train_rebalanced, y=y_train_rebalanced)

- CPU times: user 1d 11h 54min 24s, sys: 8min 47s, total: 1d 12h 3min 11s
- Wall time: 5h 9min 47s

- Score auc: 0.9675222944825889
- Paramètres: 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'num_leaves': 20
- Estimateur optimal au niveau du score auc: LGBMClassifier(max_depth=4, num_leaves=20, random_state=1)

In [54]:
## Score f beta
# lgbm_score_beta, lgbm_param_beta, lgbm_estimator_beta = optim_model(model=lgbm_model, param_grid=lgbm_parameters, scoring=scorers['fbeta_score'], 
#                                                                    X=X_train_rebalanced, y=y_train_rebalanced)

- CPU times: user 1d 12h 47min 9s, sys: 8min 31s, total: 1d 12h 55min 41s
- Wall time: 5h 16min 28s

- Score f beta: 0.9599934906823723
- Paramètres: 'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 1000, 'num_leaves': 30
- Estimateur optimal au niveau du score f beta: LGBMClassifier(learning_rate=0.5, max_depth=3, n_estimators=1000, num_leaves=30, random_state=1)

## Comparaison des modèles et performance

- La mesure du score AUC de chaque split de la cross validation du fichier train permet de vérifier la stabilité de ce score et donc la généralisation du modèle
- Afficher en parallèle l’AUC du fichier test en tant que contrôle de cohérence, afin de mettre en évidence un éventuel problème de stratification dans le train_test_split ou bien une éventuelle dérive de type data leakage au niveau du fichier train “cross-validé” (par exemple Smote sur l’ensemble du fichier train avant cross validation)

In [55]:
# Per-class weights computed as n_samples / (n_classes * n_samples_in_class),
# from the original (non-resampled) training labels.
# w0 and w1 are only displayed here; they are not referenced again in the visible cells.
n_classes = 2
n_samples = y_train.shape[0]
n_samples0 = y_train.value_counts().loc[0]
n_samples1 = y_train.value_counts().loc[1]
w0 = n_samples / (n_classes * n_samples0)
w1 = n_samples / (n_classes * n_samples1)
w0, w1

(0.5439635696487868, 6.186526412599873)

In [56]:
# Number of class-0 rows per class-1 row in the original training labels.
ratio_01 = n_samples0 / n_samples1
ratio_01

11.373052825199746

In [57]:
# Final LightGBM configuration, using the hyperparameters reported for the F-beta search.
# NOTE(review): scale_pos_weight=12 is close to ratio_01 (~11.37), which comes from
# the original y_train, whereas this model is cross-validated and fitted on the
# rebalanced set (131903 vs 79142 rows). Confirm that weighting on top of resampling is intended.
lgbm_best_model = lgb.LGBMClassifier(learning_rate=0.5, max_depth=3, n_estimators=1000,
                                     num_leaves=30, random_state=2, scale_pos_weight=12)

- Cross validation pour vérifier la stabilité

In [58]:
from sklearn.model_selection import cross_val_score

def eval_model(model, X, y, scoring, cv=5):
    """Cross-validate ``model`` and return the per-fold scores.

    Parameters
    ----------
    model : estimator to evaluate; it is cloned and refitted for every fold.
    X, y : features and labels to split into folds.
    scoring : scorer name or scorer callable.
    cv : int or cross-validation splitter, default 5 (the previous fixed
        value). Pass a shuffled splitter when the rows of ``X`` are ordered,
        for example after resampling.

    Returns
    -------
    numpy.ndarray
        One score per fold.
    """
    return cross_val_score(model, X, y, scoring=scoring, cv=cv)

In [59]:
# Metrics used for the stability check: index 0 is ROC AUC, index 1 is F-beta with beta=2.
scoring_metrics = ['roc_auc', make_scorer(fbeta_score, beta=2)]

In [60]:
%%time
# 5-fold ROC AUC of the selected model on the rebalanced training set.
score_roc_auc = eval_model(lgbm_best_model, X_train_rebalanced, y_train_rebalanced, scoring_metrics[0])

CPU times: user 28min 56s, sys: 2.27 s, total: 28min 58s
Wall time: 4min 4s


In [61]:
# Per-fold ROC AUC.
# NOTE(review): the first fold scores far lower than the near-perfect remaining folds.
# This may come from resampling the whole training set before cross-validation
# (ordered rows, resampled data in the validation folds); confirm.
score_roc_auc

array([0.66542871, 0.97361872, 0.99995112, 0.99993141, 0.99974142])

In [62]:
%%time
# 5-fold F-beta (beta=2) of the selected model on the rebalanced training set.
score_fbeta = eval_model(lgbm_best_model, X_train_rebalanced, y_train_rebalanced, scoring_metrics[1])

CPU times: user 29min, sys: 2.43 s, total: 29min 2s
Wall time: 4min 4s


In [63]:
# Per-fold F-beta scores.
# NOTE(review): same pattern as the ROC AUC folds, with the first fold close to zero.
score_fbeta

array([0.03373724, 0.88462207, 0.90371352, 0.90301786, 0.90227107])

- Scores du fichier test pour contrôler la cohérence

In [64]:
%%time
# Fit the selected model on the whole rebalanced training set before scoring the test split.
lgbm_best_model.fit(X_train_rebalanced, y_train_rebalanced)

CPU times: user 5min 59s, sys: 313 ms, total: 6min
Wall time: 49.5 s


LGBMClassifier(learning_rate=0.5, max_depth=3, n_estimators=1000, num_leaves=30,
               random_state=2, scale_pos_weight=12)

In [65]:
def prediction(model, X, y):
    """Print classification metrics for ``model`` on ``(X, y)`` and return its predictions.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``predict_proba``.
    X : feature matrix in the representation the model was fitted on.
    y : true binary labels for the rows of ``X``.

    Returns
    -------
    tuple
        ``(y_pred, y_prob)``: the outputs of ``model.predict(X)`` and
        ``model.predict_proba(X)``. Column 1 of ``y_prob`` is used as the
        positive-class score for the ROC AUC.

    Notes
    -----
    Every printed label starts with "Test", whichever dataset is passed in.
    """
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)
    # Accuracy is computed from y_pred; model.score(X, y) would run a second
    # full prediction pass over X to obtain the same number.
    print('Test accuracy score:', accuracy_score(y, y_pred))
    ras = roc_auc_score(y, y_prob[:,1])
    print('Test roc auc score:', ras)
    ps = precision_score(y, y_pred)
    print('Test precision score:', ps)
    rs = recall_score(y, y_pred)
    print('Test recall score:', rs)
    f1s = f1_score(y, y_pred)
    print('Test f1 score:', f1s)
    f2s = fbeta_score(y, y_pred, beta=2)
    print('Test f2 score:', f2s)
    return y_pred,y_prob

In [82]:
# Score the fitted model on the test split: transformed by the fitted preprocessor, never resampled.
y_pred_lgbm, y_proba_lgbm = prediction(lgbm_best_model, X_test_preprocess, y_test)

Test accuracy score: 0.7113047814163225
Test roc auc score: 0.7399228304464782
Test precision score: 0.1656394721264668
Test recall score: 0.6404146472805601
Test f1 score: 0.2632030320635184
Test f2 score: 0.40706132018962543


## Save preprocessor and model

In [67]:
import pickle

# Persist the fitted preprocessor and the trained model for later scoring.
# `with` closes each file handle (and flushes it to disk) even if pickling fails;
# the previous bare open() calls never closed their handles.
filename = 'preprocessor.pkl'
with open(filename, 'wb') as file_handle:
    pickle.dump(preprocessor_fitted, file_handle)

filename = 'lgbm_model.pkl'
with open(filename, 'wb') as file_handle:
    pickle.dump(lgbm_best_model, file_handle)