In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read both CSV files from the working directory.
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

# Modelling frame: the training rows without the Loan_ID column.
# drop() returns a new DataFrame, so data_train itself stays untouched.
df = data_train.drop(columns='Loan_ID')

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
trainset, testset = train_test_split(df, test_size=0.2, random_state=6)

In [6]:
def encodage(df):
    """Map the string categories of ``df`` to numeric codes.

    Every object-dtype column is passed through one shared lookup table.
    Values that are not keys of the table (including missing values) come
    out as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns hold the raw category labels.

    Returns
    -------
    pandas.DataFrame
        An encoded copy of ``df``; the frame passed in is left unchanged.
    """
    code = {'Male': 1,
            'Female': 0,
            'No': 0,
            'Yes': 1,
            '0': 0,
            '1': 1,
            '2': 2,
            '3+': 3,
            'Graduate': 1,
            'Not Graduate': 0,
            'Rural': 0,
            'Semiurban': 0.5,
            'Urban': 1,
            'Y': 1,
            'N': 0,
            }

    # Encode a copy: assigning into the argument would silently overwrite the
    # caller's frame (e.g. the train/test split) as a side effect.
    encoded = df.copy()
    for col in encoded.select_dtypes('object').columns:
        encoded[col] = encoded[col].map(code)

    return encoded

def outlier_treatment(df, applicant_income_cap=20000, coapplicant_income_cap=10000,
                      loan_amount_max=420):
    """Cap extreme incomes and drop rows with very large loan amounts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ApplicantIncome``, ``CoapplicantIncome`` and
        ``LoanAmount`` columns.
    applicant_income_cap : number, default 20000
        ``ApplicantIncome`` values above this are replaced by it.
    coapplicant_income_cap : number, default 10000
        ``CoapplicantIncome`` values above this are replaced by it.
    loan_amount_max : number, default 420
        Rows whose ``LoanAmount`` is strictly greater than this are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` is not modified. Surviving rows keep their
        original index labels.
    """
    # Select rows with a boolean mask instead of collecting positional counters
    # and handing them to drop() as labels, which is only correct for a 0..n-1
    # index. A missing LoanAmount compares False, so such rows are kept.
    keep = ~(df["LoanAmount"] > loan_amount_max)
    treated = df.loc[keep].copy()

    # Single-step .loc assignment: the chained form df[col][mask] = value writes
    # to a temporary object and may leave the frame unchanged.
    treated.loc[treated["ApplicantIncome"] > applicant_income_cap,
                "ApplicantIncome"] = applicant_income_cap
    treated.loc[treated["CoapplicantIncome"] > coapplicant_income_cap,
                "CoapplicantIncome"] = coapplicant_income_cap

    return treated

def imputation(df):
    """Fill missing values with a 20-nearest-neighbour imputer.

    The twelve columns listed below are selected in a fixed order and imputed
    together. The result is a new DataFrame built from the imputer's output
    array, so it carries a fresh default index rather than the row labels of
    ``df``.
    """
    # NOTE(review): Loan_Status, which preprocessing() later splits off as the
    # target, is part of the matrix given to the imputer - confirm this is
    # intended.
    columns = [
        'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
    ]
    knn = KNNImputer(n_neighbors=20)
    filled = knn.fit_transform(df[columns])
    return pd.DataFrame(filled, columns=columns)

def feature_engineering(df):
    """Append five derived income / instalment columns.

    Added columns, in this order:

    * ``TotalIncome`` - applicant plus co-applicant income.
    * ``LoanAmountPerMonth`` - ``LoanAmount / Loan_Amount_Term``.
    * ``Balance_Income_Applicant`` - applicant income minus
      ``LoanAmountPerMonth * 1000``.
    * ``Balance_Income_Total`` - total income minus
      ``LoanAmountPerMonth * 1000``.
    * ``ratio_LoanAmountPerMonth_ApplicantIncome`` -
      ``LoanAmountPerMonth * 1000`` divided by applicant income.

    The factor 1000 presumably converts a loan amount recorded in thousands
    into the unit of the income columns - TODO confirm against the data
    dictionary. Divisions follow pandas semantics, so a zero
    ``Loan_Amount_Term`` or ``ApplicantIncome`` produces inf/NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ApplicantIncome``, ``CoapplicantIncome``, ``LoanAmount``
        and ``Loan_Amount_Term`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns; ``df`` itself is not modified.
    """
    # assign() returns a new frame and evaluates the callables in order, so
    # later features can build on earlier ones without writing into the
    # caller's frame.
    return df.assign(
        TotalIncome=lambda d: d['CoapplicantIncome'] + d['ApplicantIncome'],
        LoanAmountPerMonth=lambda d: d['LoanAmount'] / d['Loan_Amount_Term'],
        Balance_Income_Applicant=lambda d: d['ApplicantIncome'] - d['LoanAmountPerMonth'] * 1000,
        Balance_Income_Total=lambda d: d['TotalIncome'] - d['LoanAmountPerMonth'] * 1000,
        ratio_LoanAmountPerMonth_ApplicantIncome=lambda d: (d['LoanAmountPerMonth'] * 1000) / d['ApplicantIncome'],
    )

In [7]:
def preprocessing(df):
    """Run the full preparation chain and split features from the target.

    The steps are applied in order: encoding, imputation, outlier treatment,
    feature engineering. The class counts of the target are printed before
    returning.

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` is the ``Loan_Status`` column and ``X`` holds
        every other column of the prepared frame.
    """
    prepared = df
    for step in (encodage, imputation, outlier_treatment, feature_engineering):
        prepared = step(prepared)

    X = prepared.drop(columns='Loan_Status')
    y = prepared['Loan_Status']

    print(y.value_counts())

    return X, y

In [8]:
# Prepare the training split: encode, impute, treat outliers, add features.
X_train, y_train = preprocessing(trainset)

1.0    323
0.0    157
Name: Loan_Status, dtype: int64


In [9]:
# Prepare the held-out split with the same chain.
# NOTE(review): preprocessing() fits a new imputer on whatever frame it is given,
# so the test split is imputed from its own rows - confirm this is intended.
X_test, y_test = preprocessing(testset)

1.0    90
0.0    31
Name: Loan_Status, dtype: int64


In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, confusion_matrix, classification_report, recall_score, precision_score
from sklearn.model_selection import learning_curve,ShuffleSplit
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif,f_regression,chi2
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures,StandardScaler,MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score

In [11]:
# Front end placed before every classifier: min-max scaling followed by
# univariate selection of the 8 best features according to f_classif.
preprocessor = make_pipeline(
    MinMaxScaler(),
    SelectKBest(f_classif, k=8),
)

In [12]:
# Candidate models. Each one is the preprocessing front end followed by a
# classifier; note that all of them embed the same `preprocessor` object.
RandomForest = make_pipeline(preprocessor, RandomForestClassifier(random_state=0))
AdaBoost = make_pipeline(preprocessor, AdaBoostClassifier(n_estimators=100, random_state=0))
SVM = make_pipeline(preprocessor, SVC(random_state=0))
KNN = make_pipeline(preprocessor, KNeighborsClassifier())
Logistic = make_pipeline(preprocessor, LogisticRegression())
# Bagging draws random bootstrap samples; seed it like the other stochastic
# models so that a re-run reproduces the same ensemble.
Bagging = make_pipeline(
    preprocessor,
    BaggingClassifier(base_estimator=KNeighborsClassifier(), n_estimators=100, random_state=0),
)
DecisionTree = make_pipeline(preprocessor, DecisionTreeClassifier(random_state=0))
# Configured identically to `Logistic`; both names are kept for existing references.
LogisticReg = make_pipeline(preprocessor, LogisticRegression())

In [13]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [14]:
def find_best_model_using_gridsearchcv(X, y, X_eval=None, y_eval=None, random_state=0):
    """Tune every candidate pipeline and summarise how each one performs.

    For each of the module-level pipelines (``DecisionTree``, ``SVM``, ``KNN``,
    ``LogisticReg``, ``RandomForest``, ``Bagging``, ``AdaBoost``) a
    hyper-parameter search is run on ``(X, y)`` with a 5-split shuffle-split
    cross-validation. Small grids are searched exhaustively, larger ones are
    sampled with ``RandomizedSearchCV`` (at most 40 candidates). The refit best
    estimator is then scored on the evaluation data.

    Parameters
    ----------
    X, y
        Training features and target used for the search.
    X_eval, y_eval : optional
        Data on which F1, recall and precision are reported. When omitted, the
        module-level ``X_test`` / ``y_test`` are used, which keeps the original
        two-argument call working.
    random_state : int, default 0
        Seed for the randomized searches, so repeated runs sample the same
        candidates.

    Returns
    -------
    tuple
        ``(summary, best_model)``: ``summary`` is a DataFrame with one row per
        model (``model``, ``best_score``, ``best_params``, ``best_f1_score``,
        ``best_recall_score``, ``best_precision_score``) and ``best_model`` is
        the list of refit best estimators in the same order.

    Notes
    -----
    ``best_score`` is the mean cross-validated score; the other three metrics
    come from the evaluation data. NOTE(review): if those metrics are used to
    choose between models, the evaluation data no longer gives an unbiased
    estimate - confirm how the table is used.
    """
    # Backward-compatible fallback to the notebook globals.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    k_grid = range(4, 15)
    knn_candidates = [KNeighborsClassifier(n_neighbors=i) for i in range(2, 20, 2)]

    algos = {
        'DecisionTree': {
            'model': DecisionTree,
            'params': {
                'decisiontreeclassifier__criterion': ['gini', 'entropy'],
                'decisiontreeclassifier__splitter': ['best', 'random'],
                'pipeline__selectkbest__k': k_grid,
            },
            'Gridsearchcv': True,
        },
        'SVM': {
            'model': SVM,
            'params': {
                'svc__gamma': ['scale', 'auto', 0.0005],
                'svc__C': [1, 10],
                # 'precomputed' is left out: it expects a square kernel matrix
                # as input, so every such candidate failed to fit on X.
                'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'svc__degree': [1, 2],
                'pipeline__selectkbest__k': k_grid,
            },
            'Gridsearchcv': False,
        },
        'KNN': {
            'model': KNN,
            'params': {
                'kneighborsclassifier__n_neighbors': np.arange(1, 20),
                'kneighborsclassifier__weights': ['uniform', 'distance'],
                'kneighborsclassifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                'kneighborsclassifier__metric': ['euclidean', 'manhattan', 'minkowski'],
                'pipeline__selectkbest__k': k_grid,
            },
            'Gridsearchcv': False,
        },
        'LogisticRegression': {
            'model': LogisticReg,
            # One sub-grid per penalty, listing only the solvers that support
            # it; the plain cross product spent most fits on pairs that raise.
            'params': [
                {
                    'logisticregression__penalty': ['l2'],
                    'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                    'pipeline__selectkbest__k': k_grid,
                },
                {
                    'logisticregression__penalty': ['l1'],
                    'logisticregression__solver': ['liblinear', 'saga'],
                    'pipeline__selectkbest__k': k_grid,
                },
                {
                    # elasticnet needs the saga solver and an explicit l1_ratio;
                    # 0.5 is an even L1/L2 mix - widen the list if needed.
                    'logisticregression__penalty': ['elasticnet'],
                    'logisticregression__solver': ['saga'],
                    'logisticregression__l1_ratio': [0.5],
                    'pipeline__selectkbest__k': k_grid,
                },
                {
                    # 'none' is the spelling the notebook already used - TODO
                    # confirm it matches the installed scikit-learn release.
                    'logisticregression__penalty': ['none'],
                    'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
                    'pipeline__selectkbest__k': k_grid,
                },
            ],
            'Gridsearchcv': True,
        },
        'RandomForest': {
            'model': RandomForest,
            'params': {
                'randomforestclassifier__max_depth': list(range(2, 11, 2)),
                'randomforestclassifier__min_samples_leaf': [5, 7, 10],
                'randomforestclassifier__n_estimators': [10, 50, 250],
                'pipeline__selectkbest__k': k_grid,
            },
            'Gridsearchcv': False,
        },
        'Bagging': {
            'model': Bagging,
            'params': {
                # The KNN variants are unpacked into the list; nesting the list
                # made it a single, invalid "estimator" candidate.
                'baggingclassifier__base_estimator': [
                    DecisionTreeClassifier(),
                    *knn_candidates,
                    SVC(random_state=0),
                    LogisticRegression(),
                ],
                'pipeline__selectkbest__k': k_grid,
            },
            'Gridsearchcv': False,
        },
        'Boosting': {
            'model': AdaBoost,
            'params': {
                # AdaBoost needs weak learners that accept sample_weight, which
                # KNeighborsClassifier does not, so no KNN candidates here.
                # probability=True gives SVC the predict_proba that some
                # AdaBoost variants require.
                'adaboostclassifier__base_estimator': [
                    DecisionTreeClassifier(),
                    SVC(probability=True, random_state=0),
                    LogisticRegression(),
                ],
                'pipeline__selectkbest__k': k_grid,
            },
            'Gridsearchcv': False,
        },
    }

    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    scores = []
    best_model = []
    for algo_name, config in algos.items():
        if config['Gridsearchcv']:
            search = GridSearchCV(config['model'], config['params'], cv=cv,
                                  return_train_score=False)
        else:
            search = RandomizedSearchCV(config['model'], config['params'], cv=cv,
                                        n_iter=40, random_state=random_state,
                                        return_train_score=False)
        search.fit(X, y)

        y_pred = search.best_estimator_.predict(X_eval)
        scores.append({
            'model': algo_name,
            'best_score': search.best_score_,
            'best_params': search.best_params_,
            'best_f1_score': f1_score(y_eval, y_pred),
            'best_recall_score': recall_score(y_eval, y_pred),
            'best_precision_score': precision_score(y_eval, y_pred),
        })
        best_model.append(search.best_estimator_)

    summary = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params', 'best_f1_score',
                                            'best_recall_score', 'best_precision_score'])
    return summary, best_model

# Run the search on the training split. `tableau` is the per-model summary
# table and `best_model` the list of refit best estimators.
tableau, best_model = find_best_model_using_gridsearchcv(X_train, y_train)
tableau

Unnamed: 0,model,best_score,best_params,best_f1_score,best_recall_score,best_precision_score
0,DecisionTree,0.716667,"{'decisiontreeclassifier__criterion': 'gini', ...",0.816568,0.766667,0.873418
1,SVM,0.78125,"{'svc__kernel': 'sigmoid', 'svc__gamma': 'auto...",0.908163,0.988889,0.839623
2,KNN,0.777083,"{'pipeline__selectkbest__k': 5, 'kneighborscla...",0.908163,0.988889,0.839623
3,LogisticRegression,0.783333,"{'logisticregression__penalty': 'l1', 'logisti...",0.908163,0.988889,0.839623
4,RandomForest,0.779167,"{'randomforestclassifier__n_estimators': 250, ...",0.908163,0.988889,0.839623
5,Bagging,0.783333,"{'pipeline__selectkbest__k': 9, 'baggingclassi...",0.908163,0.988889,0.839623
6,Boosting,0.783333,"{'pipeline__selectkbest__k': 4, 'adaboostclass...",0.908163,0.988889,0.839623


## Stacking

In [15]:
from sklearn.ensemble import StackingClassifier

In [16]:
# Stack every tuned pipeline (named model1, model2, ...) under a logistic
# regression meta-learner. `best_model` holds one estimator per row of `tableau`.
Stacking = StackingClassifier(
    estimators=[(f'model{rank}', tuned) for rank, tuned in enumerate(best_model, start=1)],
    final_estimator=LogisticRegression(),
)

In [41]:
# Fit the stacked ensemble, then score it on the same data it was fitted on.
Stacking.fit(X_train, y_train)
Stacking.score(X_train, y_train)

0.80625

In [42]:
# Score the fitted ensemble on the held-out split.
Stacking.score(X_test, y_test)

0.8512396694214877