### TO DO
## Criar novos modelos

    Cross Validation
    
    testar parametrizações
    SMOTE

## Métricas
    Para entender direto os resultados das predições dos modelos

## Analisar os dados mais profundamente
    Quais são melhores para continuar na modelação
    
    Quais podem ser alterados/ juntados
    
    Feature selection
    
## Analisar os modelos
    Analisar os parâmetros dos modelos
    
    Analisar mais modelos e o porquê de usá-los
    
    Parameter tuning
    

In [84]:
%matplotlib inline
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.compose import make_column_transformer
from imblearn.over_sampling import SMOTE

In [85]:
# Load the training set; every column except the identifier and the target is a feature.
train = pd.read_csv("data/final_loan_train.csv")
X_all = train.drop(columns=["loan_id","loan_success"])
Y_all = train["loan_success"]

# Object-dtype columns are one-hot encoded; all remaining columns are standardised.
categorical_cols = [col for col in X_all.columns if X_all[col].dtype == object]
scalar_cols = [col for col in X_all.columns if X_all[col].dtype != object]

# handle_unknown="ignore" lets the fitted transformer be applied to data that
# contains a category absent from the training set (encoded as all zeros)
# instead of raising at transform time.
scaler = make_column_transformer((StandardScaler(), scalar_cols),
                                 (OneHotEncoder(handle_unknown="ignore"), categorical_cols))

# Fit once and keep the transformed matrix. The previous version discarded the
# result of fit_transform and then transformed the same data a second time.
X_all = scaler.fit_transform(X_all)

# NOTE(review): SMOTE runs on the full data set before any train/test split.
# Every split later drawn from X_res/Y_res therefore contains synthetic samples
# interpolated from neighbours that may sit on the other side of the split, so
# AUCs measured on those splits are optimistic. Resampling only the training
# part of each split would avoid this.
sm = SMOTE(random_state=42)
X_res, Y_res = sm.fit_resample(X_all, Y_all)


# Largest test-set share in percent; cross_validation_auc divides it by 100.
split_size = 30
#X_train, X_test, Y_train, Y_test = train_test_split(X_all, Y_all,test_size=split_size, random_state=42, stratify=Y_all)

In [86]:
# Registry of evaluated estimators, keyed by the best AUC each one reached.
# add_to_dict stores a list under a key when several estimators share that AUC.
models = dict()

In [87]:
def cross_validation_auc(X, Y, model, split_size, repetitions=43, split_variation = 15, pos_label=-1):
    """Score `model` by ROC AUC over many stratified hold-out splits.

    This is repeated hold-out evaluation rather than k-fold cross-validation:
    for every seed in ``range(repetitions)`` and every test-set percentage from
    ``split_size`` down to ``split_variation + 1`` an independent stratified
    split is drawn, a fresh copy of `model` is fitted on the training part and
    its AUC is measured on the held-out part.

    Parameters
    ----------
    X, Y : feature matrix and labels accepted by ``train_test_split``.
    model : unfitted estimator exposing ``fit``, ``predict_proba`` and
        ``classes_``. It is cloned for every split and never fitted itself.
    split_size : int
        Largest test-set size, as a percentage (30 means ``test_size=0.30``).
    repetitions : int
        Number of random seeds used for each test-set size.
    split_variation : int
        Exclusive lower bound of the test-set percentage.
    pos_label : label treated as the positive class for the ROC curve.

    Returns
    -------
    (best, aucs) : tuple
        ``best`` is the fitted estimator from the split with the highest AUC
        (the first one on ties), or ``None`` when no split was evaluated
        (``split_size <= split_variation`` or ``repetitions <= 0``).
        ``aucs`` lists the AUC of every split in evaluation order.

    Raises
    ------
    ValueError
        If `pos_label` is not among the classes the estimator was fitted on.
    """
    aucs = []
    best = None
    best_auc = None
    for seed in range(repetitions):
        for test_pct in range(split_size, split_variation, -1):
            X_train, X_test, Y_train, Y_test = train_test_split(
                X, Y, test_size=test_pct/100, random_state=seed, stratify=Y)
            # Fit a fresh copy per split. Refitting the same object would make
            # every stored "best" reference point at the last-fitted model.
            candidate = clone(model)
            candidate.fit(X_train, Y_train)
            # predict_proba columns follow candidate.classes_; look the positive
            # class up instead of assuming it is column 0.
            pos_idx = list(candidate.classes_).index(pos_label)
            scores = candidate.predict_proba(X_test)[:, pos_idx]
            fpr, tpr, _ = metrics.roc_curve(Y_test, scores, pos_label=pos_label)
            auc = metrics.auc(fpr, tpr)
            # Strictly greater: on ties the earliest split's estimator is kept.
            if best is None or auc > best_auc:
                best, best_auc = candidate, auc
            aucs.append(auc)
    return (best, aucs)

In [88]:
def add_to_dict(key, val):
    """Record `val` under `key` in the module-level `models` dict.

    The first value for a key is stored as-is. When the key is already present
    the entry is turned into a list (or the existing list is extended), so
    values that share a key are all kept.
    """
    if key not in models:
        models[key] = val
        return

    existing = models[key]
    if type(existing) == list:
        existing.append(val)
    else:
        models[key] = [existing, val]

## Decision Tree

In [89]:
# Seed the tree so that re-running the notebook reproduces the same scores.
dtree, aucs = cross_validation_auc(
    X_res,
    Y_res,
    DecisionTreeClassifier(random_state=42),
    split_size,
)
# Register the returned estimator under its best observed AUC and display it.
add_to_dict(max(aucs), dtree)
max(aucs)

0.9374999999999999

## KNN

In [90]:
# Evaluate k-nearest neighbours (default settings) on the resampled data.
knn, aucs = cross_validation_auc(
    X=X_res,
    Y=Y_res,
    model=KNeighborsClassifier(),
    split_size=split_size,
)
# Register the returned estimator under its best observed AUC and display it.
add_to_dict(key=max(aucs), val=knn)
max(aucs)

0.979951690821256

## Logistic Regression

In [91]:
# Evaluate logistic regression; max_iter is raised to 500 iterations.
logReg, aucs = cross_validation_auc(
    X=X_res,
    Y=Y_res,
    model=LogisticRegression(max_iter=500),
    split_size=split_size,
)
# Register the returned estimator under its best observed AUC and display it.
add_to_dict(key=max(aucs), val=logReg)
max(aucs)

0.9534050179211468

## SVM

In [92]:
# probability=True makes SVC calibrate probabilities with an internal,
# randomly shuffled cross-validation; seed it so results are reproducible.
svm, aucs = cross_validation_auc(
    X_res,
    Y_res,
    SVC(kernel='linear', probability=True, random_state=42),
    split_size,
)
# Register the returned estimator under its best observed AUC and display it.
add_to_dict(max(aucs), svm)
max(aucs)

0.9483091787439615

## Naive Bayes

In [93]:
# Evaluate Gaussian naive Bayes (default settings) on the resampled data.
naive_bayes, aucs = cross_validation_auc(
    X=X_res,
    Y=Y_res,
    model=GaussianNB(),
    split_size=split_size,
)
# Register the returned estimator under its best observed AUC and display it.
add_to_dict(key=max(aucs), val=naive_bayes)
max(aucs)

0.8809859154929578

## Random Forest

In [94]:
# Seed the forest so that re-running the notebook reproduces the same scores.
random_forest, aucs = cross_validation_auc(
    X_res,
    Y_res,
    RandomForestClassifier(random_state=42),
    split_size,
)
# Register the returned estimator under its best observed AUC and display it.
add_to_dict(max(aucs), random_forest)
max(aucs)

1.0

## Gradient Boost

In [95]:
# Seed gradient boosting so that re-running the notebook reproduces the same scores.
gradient, aucs = cross_validation_auc(
    X_res,
    Y_res,
    GradientBoostingClassifier(random_state=42),
    split_size,
)
# Register the returned estimator under its best observed AUC and display it.
add_to_dict(max(aucs), gradient)
max(aucs)

1.0

## Bagging

In [96]:
# Seed the bagging ensemble so that re-running the notebook reproduces the same scores.
bagging, aucs = cross_validation_auc(
    X_res,
    Y_res,
    BaggingClassifier(random_state=42),
    split_size,
)
# Register the returned estimator under its best observed AUC and display it.
add_to_dict(max(aucs), bagging)
max(aucs)

0.9983050847457627

## Neural Network

In [97]:
#model, aucs = cross_validation_auc(X_res, Y_res, MLPClassifier(max_iter=1000, hidden_layer_sizes=(100,5), alpha=1e-5,), split_size)
#models[max(aucs)] = model
#max(aucs)

# Save result

In [98]:
# Take the entry stored under the largest AUC key. `model` is a single
# estimator, or a list of estimators when several reached that same AUC.
maxauc, model = max(models.items(), key=lambda entry: entry[0])
model

[RandomForestClassifier(), GradientBoostingClassifier()]

In [99]:
def saveModel(model):
    """Predict on the test set with `model` and write the result as a CSV.

    Reads ``data/final_loan_test.csv``, applies the module-level `scaler` as it
    was fitted on the training data, and stores column 0 of
    ``model.predict_proba`` (the probability of ``model.classes_[0]``) next to
    each ``loan_id``. The file is written to ``predictions/`` with columns
    ``Id`` and ``Predicted``; its name embeds the module-level `maxauc`, the
    current time and the estimator's class name.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    """
    test = pd.read_csv("data/final_loan_test.csv")
    X = test.drop(columns=["loan_id","loan_success"])
    # Only transform. Refitting the scaler here would standardise and encode
    # the test set with its own statistics and categories, which no longer
    # match what the model was trained on, and would mutate the shared scaler.
    X = scaler.transform(X)
    Y = model.predict_proba(X)
    test["loan_success"] = Y[:, 0]
    file_name = "("+str(int(maxauc*10000)/100.0)+")"+datetime.now().strftime("%H:%M_%Y.%m.%d")+"_"+model.__class__.__name__+"_prediction.csv"
    submission = test[["loan_id","loan_success"]].rename(columns={"loan_id":"Id","loan_success":"Predicted"})
    submission.to_csv("predictions/"+file_name,index=False)
    print(file_name+" saved successfully")

In [100]:
# `model` is a list when several estimators tied for the best AUC; wrap a
# single estimator in a list so both cases go through the same loop.
for m in (model if type(model) == list else [model]):
    saveModel(m)

(100.0)16:35_2021.12.04_RandomForestClassifier_prediction.csv saved successfully
(100.0)16:35_2021.12.04_GradientBoostingClassifier_prediction.csv saved successfully
