### TODO

- Feature importance
- Feature selection
    
    

In [215]:
%matplotlib inline
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, Normalizer, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.compose import make_column_transformer
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE 
from sklearn.feature_selection import GenericUnivariateSelect, chi2, SelectFpr
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [219]:
# Load the training set. `loan_id` is dropped together with the target
# column `loan_success`, which becomes Y_all.
train = pd.read_csv("data/final_loan_train.csv")
X_all = train.drop(columns=["loan_id","loan_success"])
Y_all = train["loan_success"]


# Object-dtype columns are one-hot encoded; every other column is standardised.
categorical_cols = [col for col in X_all.columns if X_all[col].dtype == object]
scalar_cols = [col for col in X_all.columns if X_all[col].dtype != object]

# handle_unknown="ignore" lets the fitted transformer be reused on data that
# contains a category absent from the training set (it is encoded as all
# zeros) instead of raising at transform time.
scaler = make_column_transformer((StandardScaler(), scalar_cols),
                                 (OneHotEncoder(handle_unknown="ignore"), categorical_cols))

# Fit and transform in a single pass; the fitted `scaler` is kept so the same
# preprocessing can be applied to other data later.
X_all = scaler.fit_transform(X_all)


# NOTE(review): not referenced by any of the visible cells; confirm before removing.
split_size = 3


In [210]:
def get_best_model(X, Y, models, random_state=42):
    """Grid-search every candidate and return the one with the best CV ROC AUC.

    Each estimator is wrapped in a pipeline of SMOTE followed by the
    estimator (step name ``"m"``), and the whole pipeline is handed to
    ``GridSearchCV`` with 4-fold stratified cross-validation.

    Parameters
    ----------
    X, Y
        Features and target, passed unchanged to ``GridSearchCV.fit``.
    models : dict
        Maps an estimator instance to its parameter grid. Grid keys address
        the estimator through the ``m__`` step prefix.
    random_state : int, default 42
        Seed for shuffling the cross-validation folds, so that every
        candidate is scored on the same splits and runs are repeatable.

    Returns
    -------
    tuple
        ``(best, best_params, best_auc)``: the winning estimator instance as
        it was passed in, its best grid parameters with the ``m__`` prefix
        removed, and its mean cross-validated ROC AUC. If no candidate scores
        above 0 the result is ``(None, {}, 0)``.
    """
    step_prefix = "m__"
    best = None
    best_auc = 0
    best_params = {}
    # A single seeded splitter: an unseeded shuffle would give each candidate
    # different folds, which makes their scores not directly comparable.
    kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    for model, params in models.items():
        print("Model: "+model.__class__.__name__, end="")

        pipeline = Pipeline([("smote", SMOTE(random_state=42)),
                             #("gus", SelectFpr(chi2)),
                             ("m", model)])
        search = GridSearchCV(pipeline, params, n_jobs=-1 , scoring="roc_auc", cv=kfold).fit(X,Y)
        auc = search.best_score_

        print("\t auc: "+str(auc))
        if auc > best_auc:
            best, best_auc = model, auc
            # Keep only parameters of the estimator step and strip the prefix
            # from the front of the key, so the result can be passed straight
            # to the estimator's constructor.
            best_params = {key[len(step_prefix):]: value
                           for key, value in search.best_params_.items()
                           if key.startswith(step_prefix)}
    return best, best_params , best_auc

In [118]:
# Registry of candidates: estimator instance -> GridSearchCV parameter grid.
# Every registration cell creates a new estimator instance as the key, so
# re-running one of them without re-running this cell first adds a second
# entry for the same estimator class.
models = dict()

## KNN

In [119]:
# Empty grid: only the default hyper-parameters are cross-validated.
models[KNeighborsClassifier()] = dict()

## Log Reg

In [120]:
# Single-point grid: the only candidate uses max_iter=1000.
models[LogisticRegression()] = dict(m__max_iter=[1000])

## SVM

In [164]:
# Try each kernel. probability=True is needed because the prediction-saving
# code calls predict_proba on the selected model.
models[SVC()] = dict(
    m__kernel=["linear", "poly", "rbf", "sigmoid"],
    m__probability=[True],
)

## Naive Bayes

In [122]:
# Empty grid: only the default hyper-parameters are cross-validated.
models[GaussianNB()] = dict()

## Gradient Boosting

In [123]:
# scikit-learn's gradient boosting, evaluated with default hyper-parameters only.
models[GradientBoostingClassifier()] = dict()

## Bagging

In [124]:
# Empty grid: only the default hyper-parameters are cross-validated.
models[BaggingClassifier()] = dict()

## Decision Tree

In [125]:
# Empty grid: only the default hyper-parameters are cross-validated.
models[DecisionTreeClassifier()] = dict()

## Random Forest

In [126]:
# Random forest grid. "auto" is left out of max_features: for
# RandomForestClassifier it was an alias of "sqrt" (deprecated in
# scikit-learn 1.1, removed in 1.3), so it only duplicated the "sqrt"
# candidates on old versions and fails on new ones.
models[RandomForestClassifier()] = {
    "m__n_estimators":[10, 100, 200, 1000],
    "m__criterion":["gini", "entropy"],
    "m__max_features":["sqrt", "log2"],
    "m__class_weight":["balanced", "balanced_subsample"]
}

## Get best model

In [217]:
# Pick the best-scoring candidate, then build a fresh, unfitted estimator of
# the same class configured with the winning grid parameters. Calling the
# class object directly replaces exec() on a concatenated string.
model, best_params , auc = get_best_model(X_all,Y_all, models)
model = type(model)(**best_params)
model, auc

Model: KNeighborsClassifier	 auc: 0.5676784189988415
Model: LogisticRegression	 auc: 0.604442183403451
Model: SVC	 auc: 0.5632430949332358
Model: GaussianNB	 auc: 0.533807199256143
Model: GradientBoostingClassifier	 auc: 0.6651263642460825
Model: BaggingClassifier	 auc: 0.7046504405219194
Model: DecisionTreeClassifier	 auc: 0.6205536247789769
Model: RandomForestClassifier	 auc: 0.7455543869276263
Model: SVC	 auc: 0.5653755868544601


(RandomForestClassifier(class_weight='balanced_subsample', max_features='sqrt'),
 0.7455543869276263)

## Train model

In [183]:
# Keep a second name for the selected estimator. This is an alias, not a
# copy: both names refer to the same object.
modelsv = model

In [222]:
# Over-sample the full training set with the same SMOTE configuration used
# inside the cross-validation pipeline, then fit the selected estimator on
# the resampled data.
sm = SMOTE(random_state=42)
X_train, Y_train = sm.fit_resample(X_all, Y_all)

# Optional SelectFpr(chi2) feature selection, currently disabled.
#sfpr = SelectFpr(chi2)
#sfpr.fit(X_train, Y_train)
#X_train = sfpr.transform(X_train)


model.fit(X_train, Y_train)

RandomForestClassifier(class_weight='balanced_subsample', max_features='sqrt')

## Save result

In [224]:
def saveModel(model):
    """Score the test set with ``model`` and write a submission CSV.

    Reads ``data/final_loan_test.csv``, applies the notebook-level ``scaler``
    as it was fitted on the training data, and writes the columns ``Id`` and
    ``Predicted`` to a file under ``predictions/``. The file name embeds the
    notebook-level ``auc`` score, the current time and the estimator class name.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict_proba``.
    """
    test = pd.read_csv("data/final_loan_test.csv")
    X = test.drop(columns=["loan_id","loan_success"])
    # Only transform here. Refitting the scaler on the test set would scale
    # and encode the test features differently from the data the model was
    # trained on.
    X = scaler.transform(X)
    #X = sfpr.transform(X)
    Y = model.predict_proba(X)
    # Column 0 is the probability of the first class in model.classes_.
    # TODO confirm this is the class the submission is expected to contain.
    test["loan_success"] = pd.DataFrame(Y)[0]
    file_name = "("+str(int(auc*10000)/100.0)+")"+datetime.now().strftime("%H:%M_%Y.%m.%d")+"_"+model.__class__.__name__+"_prediction.csv"
    test[["loan_id","loan_success"]].rename(columns={"loan_id":"Id","loan_success":"Predicted"}).to_csv("predictions/"+file_name,index=False)
    print(file_name+" saved successfully")

In [225]:
# `model` may be a single fitted estimator or a list of them; one prediction
# file is written per estimator. isinstance() also accepts list subclasses.
if isinstance(model, list):
    for m in model:
        saveModel(m)
else:
    saveModel(model)

(74.55)13:18_2021.12.12_RandomForestClassifier_prediction.csv saved successfully
