### TO DO
## Criar novos modelos

    Cross Validation
    
    testar parametrizações
    SMOTE

## Métricas
    Para entender direto os resultados das predições dos modelos

## Analisar os dados mais profundamente
    Quais são melhores para continuar na modelação
    
    Quais podem ser alterados/juntados
    
    Feature selection
    
## Analisar os modelos
    Analisar os parâmetros dos modelos
    
    Analisar mais modelos e o porquê de usá-los
    
    Parameter tuning
    

In [2]:
%matplotlib inline
import copy
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.compose import make_column_transformer
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [25]:
# Load the labelled training file and separate the feature columns from the target.
train = pd.read_csv("data/final_loan_train.csv")
X_raw = train.drop(columns=["loan_id","loan_success"])
Y_all = train["loan_success"]

# Object-typed columns are one-hot encoded; every other column is standardised.
categorical_cols = [col for col in X_raw.columns if X_raw[col].dtype == object]
scalar_cols = [col for col in X_raw.columns if X_raw[col].dtype != object]

# handle_unknown="ignore" lets the fitted transformer be reused on data that contains
# a category absent from the training file (it is encoded as all zeros) instead of raising.
scaler = make_column_transformer((StandardScaler(), scalar_cols),
                                 (OneHotEncoder(handle_unknown="ignore"), categorical_cols))

# Fit once and keep the transformed matrix (previously the data was transformed twice
# and the first result thrown away).
# NOTE(review): the transformer is fitted on the full training file before any split,
# so later validation splits are scaled with statistics that include their own rows.
X_all = scaler.fit_transform(X_raw)

# Test-set size as a PERCENTAGE; cross_validation_auc divides it by 100 as well.
split_size = 30
# An integer test_size means an absolute number of rows, so convert to a fraction here.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_all, Y_all, test_size=split_size / 100, random_state=42, stratify=Y_all
)

In [4]:
# Registry of evaluated models: each later cell stores a model under the best AUC it reached.
models = dict()

In [36]:
def cross_validation_auc(X, Y, model, split_size, repetitions=43, split_variation = 15, pos_label=-1):
    """Refit ``model`` on many stratified random splits and collect the ROC AUC of each.

    For every seed in ``range(repetitions)`` and every test percentage ``pct`` in
    ``range(split_size, split_variation, -1)`` the data is split with
    ``train_test_split(test_size=pct / 100, random_state=seed, stratify=Y)``; the
    model is fitted on the training part and scored on the held-out part.

    Parameters
    ----------
    X, Y
        Features and labels, forwarded unchanged to ``train_test_split``.
    model
        Estimator providing ``fit``, ``predict_proba`` and, after fitting, ``classes_``.
    split_size : int
        Largest test-set size to try, in percent.
    repetitions : int
        Number of random seeds tried for every test-set size.
    split_variation : int
        Exclusive lower bound (in percent) of the test-set sizes tried.
    pos_label
        Label treated as the positive class for the ROC curve; its probability
        column is looked up in ``model.classes_``.

    Returns
    -------
    tuple
        ``(best, aucs)``: ``best`` is a fitted copy of the model from the split with
        the highest AUC (``None`` if no split was evaluated) and ``aucs`` lists the
        AUC of every split in evaluation order. The test fraction of the best split
        is printed.

    Raises
    ------
    ValueError
        If ``pos_label`` is not among ``model.classes_`` after fitting.

    Notes
    -----
    The maximum of ``aucs`` is the best of many random splits and is therefore an
    optimistic figure compared with the mean.
    """
    aucs = []
    best = None
    best_auc = None
    best_split = None
    for seed in range(repetitions):
        for pct in range(split_size, split_variation, -1):
            test_fraction = pct / 100
            X_train, X_test, Y_train, Y_test = train_test_split(
                X, Y, test_size=test_fraction, random_state=seed, stratify=Y
            )
            model.fit(X_train, Y_train)
            # Score with the probability column that belongs to pos_label rather than
            # assuming it is always column 0.
            pos_col = list(model.classes_).index(pos_label)
            scores = np.asarray(model.predict_proba(X_test))[:, pos_col]
            fpr, tpr, _ = metrics.roc_curve(Y_test, scores, pos_label=pos_label)
            auc = metrics.auc(fpr, tpr)
            if best_auc is None or auc > best_auc:
                # `model` is refitted in place on the next iteration, so keep a snapshot;
                # storing the bare reference would always return the last fit.
                best = copy.deepcopy(model)
                best_auc = auc
                best_split = test_fraction
            aucs.append(auc)

    print(best_split)
    return (best, aucs)

## Decision Tree

In [37]:
# Evaluate a default decision tree and register the returned model under its highest AUC.
model, aucs = cross_validation_auc(
    X_all,
    Y_all,
    model=DecisionTreeClassifier(),
    split_size=split_size,
)
models[max(aucs)] = model
max(aucs)

0.24


0.9177807486631017

## KNN

In [38]:
# Evaluate a default k-nearest-neighbours classifier and register it under its highest AUC.
model, aucs = cross_validation_auc(
    X_all,
    Y_all,
    model=KNeighborsClassifier(),
    split_size=split_size,
)
models[max(aucs)] = model
max(aucs)

0.16


0.8835403726708075

## Logistic Regression

In [39]:
# Evaluate a default logistic regression and register it under its highest AUC.
model, aucs = cross_validation_auc(
    X_all,
    Y_all,
    model=LogisticRegression(),
    split_size=split_size,
)
models[max(aucs)] = model
max(aucs)

0.16


0.9006211180124224

## SVM

In [40]:
# Evaluate a linear-kernel SVM and register it under its highest AUC.
# probability=True is required because the evaluation calls predict_proba.
model, aucs = cross_validation_auc(
    X_all,
    Y_all,
    model=SVC(kernel='linear', probability=True),
    split_size=split_size,
)
models[max(aucs)] = model
max(aucs)

0.18


0.8918269230769231

## Naive Bayes

In [41]:
# Evaluate Gaussian naive Bayes and register it under its highest AUC.
model, aucs = cross_validation_auc(
    X_all,
    Y_all,
    model=GaussianNB(),
    split_size=split_size,
)
models[max(aucs)] = model
max(aucs)

0.2


0.8391812865497076

## Random Forest

In [42]:
# Evaluate a default random forest and register it under its highest AUC.
model, aucs = cross_validation_auc(
    X_all,
    Y_all,
    model=RandomForestClassifier(),
    split_size=split_size,
)
models[max(aucs)] = model
max(aucs)

0.16


0.9968944099378881

## Gradient Boost

In [43]:
# Evaluate default gradient boosting and register it under its highest AUC.
model, aucs = cross_validation_auc(
    X_all,
    Y_all,
    model=GradientBoostingClassifier(),
    split_size=split_size,
)
models[max(aucs)] = model
max(aucs)

0.18


0.9783653846153846

## Bagging

In [44]:
# Evaluate a default bagging ensemble and register it under its highest AUC.
model, aucs = cross_validation_auc(
    X_all,
    Y_all,
    model=BaggingClassifier(),
    split_size=split_size,
)
models[max(aucs)] = model
max(aucs)

0.2


0.9805068226120858

## Neural Network

In [34]:
# Evaluate a two-hidden-layer MLP (100 and 5 units) and register it under its highest AUC.
model, aucs = cross_validation_auc(
    X_all,
    Y_all,
    model=MLPClassifier(max_iter=1000, hidden_layer_sizes=(100, 5), alpha=1e-5),
    split_size=split_size,
)
models[max(aucs)] = model
max(aucs)



0.8579831932773109

# Save result

In [45]:
# The registry is keyed by AUC, so the entry with the largest key is the best-scoring model.
maxauc, model = max(models.items(), key=lambda entry: entry[0])
model

RandomForestClassifier()

In [46]:
# Score the test file with the selected model and write the submission CSV.
test = pd.read_csv("data/final_loan_test.csv")
X = test.drop(columns=["loan_id","loan_success"])
# Reuse the transformer exactly as fitted on the training data. Refitting it here would
# scale with test-set statistics and could change the one-hot column layout, so the
# model would receive features that differ from the ones it was trained on.
X = scaler.transform(X)
Y = model.predict_proba(X)
# Column 0 is the probability of model.classes_[0].
# NOTE(review): presumably that is the -1 class used as pos_label during evaluation; confirm.
test["loan_success"] = Y[:, 0]

# File name layout: (<best AUC in %>)<HH:MM_YYYY.MM.DD>_<model class>_prediction.csv
auc_tag = "("+str(int(maxauc*10000)/100.0)+")"
time_tag = datetime.now().strftime("%H:%M_%Y.%m.%d")
file_name = auc_tag+time_tag+"_"+model.__class__.__name__+"_prediction.csv"

submission = test[["loan_id","loan_success"]].rename(columns={"loan_id":"Id","loan_success":"Predicted"})
submission.to_csv("predictions/"+file_name,index=False)
file_name+" saved successfully"

'(99.68)00:07_2021.12.04_RandomForestClassifier_prediction.csv saved successfully'