### TO DO
## Criar novos modelos

    Cross Validation
    
    testar parametrizações
    SMOTE

## Métricas
    Para entender direto os resultados das predições dos modelos

## Analisar os dados mais profundamente
    Quais são melhores para continuar na modelação
    
    Quais podem ser alterados/ juntados
    
    Feature selection
    
## Analiar os modelos
    Analisar os parâmetros dos modelos
    
    Analisar mais modelos e o porque de usá-los
    
    Parameter tuning
    

In [12]:
%matplotlib inline
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.compose import make_column_transformer
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE 
from sklearn.feature_selection import VarianceThreshold

In [13]:
train = pd.read_csv("data/final_loan_train.csv")
X_all = train.drop(columns=["loan_id","loan_success"])
Y_all = train["loan_success"]

categorical_cols = [col for col in X_all.columns if X_all[col].dtype == object]
scalar_cols = [col for col in X_all.columns if X_all[col].dtype != object]

scaler = make_column_transformer((StandardScaler(), scalar_cols),
                                 (OneHotEncoder(), categorical_cols))
scaler.fit_transform(X_all)
X_all = scaler.transform(X_all)

#varianceSelector = VarianceThreshold(threshold=(.8 * (1 - .8)))
#varianceSelector.fit(X_res)

#X_res = varianceSelector.transform(X_res)

split_size = 3


In [6]:
def calc_auc(y_true, predictions):
    fpr, tpr, _ = metrics.roc_curve(y_true, pd.DataFrame(predictions)[0].tolist(), pos_label=-1)
    return metrics.auc(fpr, tpr)

In [20]:
def get_model_auc(X,Y, model,  k_folds=3, repetitions=43):
    auc = 0
    for r in range(repetitions):
        kfold = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=r)
        for train, test in kfold.split(X, Y):
            X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
            
            sm = SMOTE(random_state=42)
            X_train, Y_train = sm.fit_resample(X_train, Y_train)

            model.fit(X_train, Y_train)

            Y_pred = model.predict_proba(X_test)

            auc = max(auc, calc_auc(Y_test, Y_pred))
    return auc

In [27]:
def get_best_model(X, Y, models):
    best = None
    best_auc = 0
    for model in models:
        print("Model: "+model.__class__.__name__, end="")
        auc = get_model_auc(X,Y, model)
        print("\t auc: "+str(auc))
        if auc > best_auc:
            best, best_auc = model, auc
    return best, best_auc

## Get best model

In [29]:
models = [
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear', probability=True),
    GaussianNB(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    BaggingClassifier()
]

In [30]:
model, auc = get_best_model(X_all,Y_all, models)
model, auc

Model: DecisionTreeClassifier	 auc: 0.7845744680851064
Model: KNeighborsClassifier	 auc: 0.7822695035460994
Model: LogisticRegression	 auc: 0.8457446808510638
Model: SVC	 auc: 0.8368794326241135
Model: GaussianNB	 auc: 0.7319148936170213
Model: RandomForestClassifier	 auc: 0.9067375886524822
Model: GradientBoostingClassifier	 auc: 0.8822695035460992
Model: BaggingClassifier	 auc: 0.8921985815602836


(RandomForestClassifier(), 0.9067375886524822)

## Train model

In [None]:
sm = SMOTE(random_state=42)
X_train, Y_train = sm.fit_resample(X_all, Y_all)

model.fit(X_train, Y_train)

# Save result

In [144]:
def saveModel(model):
    test = pd.read_csv("data/final_loan_test.csv")
    X = test.drop(columns=["loan_id","loan_success"])
    scaler.fit(X)
    X = scaler.transform(X)
    Y = model.predict_proba(X)
    test["loan_success"] = pd.DataFrame(Y)[0]
    file_name = "("+str(int(auc*10000)/100.0)+")"+datetime.now().strftime("%H:%M_%Y.%m.%d")+"_"+model.__class__.__name__+"_prediction.csv"
    test[["loan_id","loan_success"]].rename(columns={"loan_id":"Id","loan_success":"Predicted"}).to_csv("predictions/"+file_name,index=False)
    print(file_name+" saved successfully")

In [145]:
if type(model) == list:
    for m in model:
        saveModel(m)
else:
    saveModel(model)

(99.82)16:04_2021.12.10_RandomForestClassifier_prediction.csv saved successfully
