In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import  LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import seaborn as sns
from sklearn.metrics import accuracy_score
from pprint import pprint

from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
df = pd.read_csv("C:\\Users\\Mehdi\\PycharmProjects\\BackToCoding\\Machine Learning\\dataframes\\hmeq.csv")

In [8]:
df.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,
3,1,1500,,,,,,,,,,,
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,


In [9]:
for column in df.columns:
    if df[column].nunique() == 0 or df[column].nunique() == 1:
        df.drop(columns=column,axis=1,inplace=True)
        print(f'{column} - supprimée')

    else:
        print(f'{column}', '- modalités :', df[column].nunique(), '- aperçu :', df[column].unique()[:10], '\n')

BAD - modalités : 2 - aperçu : [1 0] 

LOAN - modalités : 540 - aperçu : [1100 1300 1500 1700 1800 2000 2100 2200 2300 2400] 

MORTDUE - modalités : 5053 - aperçu : [25860. 70053. 13500.    nan 97800. 30548. 48649. 28502. 32700. 22608.] 

VALUE - modalités : 5381 - aperçu : [ 39025.  68400.  16700.     nan 112000.  40320.  57037.  43034.  46740.
  62250.] 

REASON - modalités : 2 - aperçu : ['HomeImp' nan 'DebtCon'] 

JOB - modalités : 6 - aperçu : ['Other' nan 'Office' 'Sales' 'Mgr' 'ProfExe' 'Self'] 

YOJ - modalités : 99 - aperçu : [10.5  7.   4.   nan  3.   9.   5.  11.  16.  18. ] 

DEROG - modalités : 11 - aperçu : [ 0. nan  3.  2.  1.  4.  5.  6.  7.  8.] 

DELINQ - modalités : 14 - aperçu : [ 0.  2. nan  1.  6. 15.  4.  3.  5.  7.] 

CLAGE - modalités : 5314 - aperçu : [ 94.36666667 121.83333333 149.46666667          nan  93.33333333
 101.46600191  77.1         88.76602988 216.93333333 115.8       ] 

NINQ - modalités : 16 - aperçu : [ 1.  0. nan  2.  3.  5. 14. 10.  4.  9.] 



In [10]:
variables_type = {}

for types in df.dtypes:
    variables_type[str(types)] = []

In [11]:
for column in df.columns:
    
    if df[column].nunique() == 1 or df[column].nunique() == 0:
        df.drop(columns=column,axis=1,inplace=True)
        print('La colonne', column, 'a été supprimée.' + '\n')

    else:

        variables_type[str(df[column].dtype)].append(column)
        print('La variable', column, 'possède', df[column].nunique(), 'modalités :', df[column].unique()[:15],'\n')

La variable BAD possède 2 modalités : [1 0] 

La variable LOAN possède 540 modalités : [1100 1300 1500 1700 1800 2000 2100 2200 2300 2400 2500 2800 2900 3000
 3100] 

La variable MORTDUE possède 5053 modalités : [25860. 70053. 13500.    nan 97800. 30548. 48649. 28502. 32700. 22608.
 20627. 45000. 64536. 71000. 24280.] 

La variable VALUE possède 5381 modalités : [ 39025.  68400.  16700.     nan 112000.  40320.  57037.  43034.  46740.
  62250.  29800.  55000.  87400.  83850.  34687.] 

La variable REASON possède 2 modalités : ['HomeImp' nan 'DebtCon'] 

La variable JOB possède 6 modalités : ['Other' nan 'Office' 'Sales' 'Mgr' 'ProfExe' 'Self'] 

La variable YOJ possède 99 modalités : [10.5  7.   4.   nan  3.   9.   5.  11.  16.  18.   2.5  8.  19.   4.5
  2. ] 

La variable DEROG possède 11 modalités : [ 0. nan  3.  2.  1.  4.  5.  6.  7.  8.  9. 10.] 

La variable DELINQ possède 14 modalités : [ 0.  2. nan  1.  6. 15.  4.  3.  5.  7.  8. 10. 12. 11. 13.] 

La variable CLAGE possède 531

In [12]:
variables_type.items()

dict_items([('int64', ['BAD', 'LOAN']), ('float64', ['MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC']), ('object', ['REASON', 'JOB'])])

In [13]:
variables_type['int64'].remove('BAD')

In [14]:
variables_type['int64']

['LOAN']

In [15]:
numerical_features = variables_type['int64'] + variables_type['float64']

In [16]:
categorical_features = variables_type['object']

## Adding custom transformers

In [177]:
class ModeImputer(BaseEstimator, TransformerMixin):

    def __init__(self,strategy):

        self.strategy = strategy

    def fit(self,X, y=None):
        return self

    def transform(self,X,y=None):
        for elements in categorical_features:
            if self.strategy == 'simple_mode':
                X[elements].fillna(X[elements].mode()[0],inplace=True)
            if self.strategy == 'target_mode':
                X[elements].fillna(df.groupby('BAD')[elements].transform(lambda x: x.mode()[0]),inplace=True)
        return X

class NumericImputer(BaseEstimator, TransformerMixin):

    def __init__(self, strategy):

        self.strategy = strategy
        

    def fit(self, X,y=None):
        return self

    def transform(self,X,y=None):
        for elements in numerical_features:
            if self.strategy == 'mean':
                X[elements].fillna(X[elements].mean(),inplace=True)
                
            if self.strategy == 'target_mean':
                X[elements].fillna(df.groupby('BAD')[elements].transform('mean'),inplace=True)
                
            if self.strategy == 'median':
                X[elements].fillna(X[elements].median(),inplace=True)
            
            if self.strategy == 'target_median':
                X[elements].fillna(df.groupby('BAD')[elements].transform('median'),inplace=True)
        return X

        
        
        

In [178]:
numerical_transformer = Pipeline(steps=[

    ('imputer', NumericImputer(strategy='mean')),
    ('scaler', StandardScaler())
    
])

categorical_transformer = Pipeline(steps=[

    ('imputer', ModeImputer(strategy='simple_mode')),
    ('ohe', OneHotEncoder())
    
])

preprocessor = ColumnTransformer(transformers=[

    ('numeric', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
                                 
])

classifiers = [
    
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Random Forest', RandomForestClassifier())
    
]

params_grid = {

    'Logistic Regression' : {'preprocessor__numeric__imputer__strategy' : ['mean', 'target_mean', 'median', 'target_median'], 'preprocessor__cat__imputer__strategy' : ['simple_mode','target_mode'],'classifiers__C' : [0.25,0.5,0.75,1]},
    'Random Forest' : {'preprocessor__numeric__imputer__strategy' : ['mean', 'target_mean', 'median', 'target_median'], 'preprocessor__cat__imputer__strategy' : ['simple_mode','target_mode'], 'classifiers__max_depth' : [50,75,100]}
    
}

In [179]:
X = df.drop('BAD',axis=1)
y = df['BAD']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)


results = {}
predictions = {}

for classifier_name, classifier in tqdm(classifiers):

    pipeline = Pipeline([

    ('preprocessor', preprocessor),
    ('classifiers', classifier)
        
    ])

    model = GridSearchCV(pipeline, params_grid[classifier_name])
    model = model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    predictions[classifier_name] = y_pred

    results[classifier_name] = {'Best params' : model.best_params_, 'Accuracy on train' : model.best_score_, 
                                'Accuracy on test': accuracy_score(y_test, y_pred), 'Best model' : model.best_estimator_}


100%|██████████| 2/2 [02:05<00:00, 62.58s/it]


In [180]:
results['Logistic Regression']['Accuracy on test']

0.8565436241610739

In [181]:
results['Random Forest']['Accuracy on test']

0.9639261744966443

In [184]:
results['Logistic Regression']

{'Best params': {'classifiers__C': 1,
  'preprocessor__cat__imputer__strategy': 'simple_mode',
  'preprocessor__numeric__imputer__strategy': 'target_mean'},
 'Accuracy on train': 0.8708069628955016,
 'Accuracy on test': 0.8565436241610739,
 'Best model': Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('numeric',
                                                   Pipeline(steps=[('imputer',
                                                                    NumericImputer(strategy='target_mean')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   ['LOAN', 'MORTDUE', 'VALUE',
                                                    'YOJ', 'DEROG', 'DELINQ',
                                                    'CLAGE', 'NINQ', 'CLNO',
                                                    'DEBTINC

In [186]:
from sklearn.metrics import classification_report

print(classification_report(y_test,predictions['Logistic Regression']))

              precision    recall  f1-score   support

           0       0.87      0.97      0.91       946
           1       0.77      0.44      0.56       246

    accuracy                           0.86      1192
   macro avg       0.82      0.70      0.74      1192
weighted avg       0.85      0.86      0.84      1192

