### TO DO
## Criar novos modelos

    Cross Validation
    Random Forest
    XGBoost
    Bagging
    Neural Net

## Métricas
    Para entender direto os resultados das predições dos modelos

## Analisar os dados mais profundamente
    Quais são melhores para continuar na modelação
    
    Quais podem ser alterados/ juntados
    
    Feature selection
    
## Analiar os modelos
    Analisar os parâmetros dos modelos
    
    Analisar mais modelos e o porque de usá-los
    
    Parameter tuning
    

In [13]:
%matplotlib inline
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
scaler = StandardScaler()
train = pd.read_csv("data/final_loan_train.csv")
X_all = train.drop(columns=["loan_id","loan_success"])
Y_all = train["loan_success"]
split_size = 0.3
X_train, X_test, Y_train, Y_test = train_test_split(X_all, Y_all,test_size=split_size, random_state=42, stratify=Y_all)

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [3]:
models = {}

## Decision Tree

In [4]:
decisionTree = DecisionTreeClassifier()
decisionTree.fit(X_train, Y_train)
Y_pred = decisionTree.predict_proba(X_test)

fpr, tpr, thresholds = metrics.roc_curve(Y_test, pd.DataFrame(Y_pred)[0].tolist(), pos_label=-1)
auc = metrics.auc(fpr, tpr)
models[auc] = decisionTree
auc

0.6613445378151261

## KNN

In [5]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)
Y_pred = knn.predict_proba(X_test)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, pd.DataFrame(Y_pred)[0].tolist(), pos_label=-1)
auc = metrics.auc(fpr, tpr)
models[auc] = knn
auc

0.48991596638655466

## Logistic Regression

In [6]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict_proba(X_test)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, pd.DataFrame(Y_pred)[0].tolist(), pos_label=-1)
auc = metrics.auc(fpr, tpr)
models[auc] = logreg
auc

0.6310924369747899

## SVM

In [7]:
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train, Y_train)
Y_pred = svm.predict_proba(X_test)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, pd.DataFrame(Y_pred)[0].tolist(), pos_label=-1)
auc = metrics.auc(fpr, tpr)
models[auc] = svm
auc

0.6184873949579832

## Naive Bayes

In [8]:
gnb = GaussianNB()
gnb.fit(X_train, Y_train)
Y_pred = gnb.predict(X_test)
Y_pred = gnb.predict_proba(X_test)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, pd.DataFrame(Y_pred)[0].tolist(), pos_label=-1)
auc = metrics.auc(fpr, tpr)
models[auc] = gnb
auc

0.5638655462184874

# Save result

In [12]:
maxauc = max(models)
model = models[maxauc]
model

DecisionTreeClassifier()

In [19]:
test = pd.read_csv("data/final_loan_test.csv")
X = test.drop(columns=["loan_id","loan_success"])
scaler.fit(X)
X = scaler.transform(X)
Y = model.predict_proba(X)
test["loan_success"] = pd.DataFrame(Y)[0]
file_name = "("+str(int(maxauc*10000)/100.0)+")"+datetime.now().strftime("%H:%M_%Y.%m.%d")+"_"+model.__class__.__name__+"_prediction.csv"
test[["loan_id","loan_success"]].rename(columns={"loan_id":"Id","loan_success":"Predicted"}).to_csv("predictions/"+file_name,index=False)
file_name+" saved successfully"

'(66.13)15:36_2021.12.03_DecisionTreeClassifier_prediction.csv saved successfully'