In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import metrics

# Base

In [2]:
# Load the dataset from train_treated.csv, using the first CSV column as the index
train = pd.read_csv('train_treated.csv', index_col=0)

In [3]:
# Seeded preview so the same five rows are shown on every Restart & Run All
train.sample(5, random_state=0)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
885,0,3,0,25.0,0,0,7.05,0,0,1
541,1,1,1,36.0,0,2,71.0,0,0,1
79,1,2,0,0.83,0,2,29.0,0,0,1
109,0,3,0,38.0,0,0,7.8958,0,0,1
411,0,3,0,29.699118,0,0,7.8958,0,0,1


## Dividindo entre treino e teste

In [4]:
from sklearn.model_selection import train_test_split
# 'Survived' is the target; every other column is a feature.
# 25% of the rows are held out for evaluation and random_state=0 fixes the split.
x_train, x_test, y_train, y_test = train_test_split(train.drop(columns=['Survived']), train['Survived'], test_size=0.25, random_state=0)

# Criando dicionário de scores

In [5]:
# Load the best models found by the earlier grid search
import pickle
filename = 'gridsearch_scores'
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
# The context manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    metrics_dict = pickle.load(infile)

In [6]:
# Show which model entries were loaded from the pickle
metrics_dict.keys()

dict_keys(['Logistic GS', 'SVM GS', 'Decision GS', 'Neural GS'])

In [7]:
# Tabulate the loaded entries, one row per model
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS,0.784753,0.714286,0.714286,0.714286,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","SVC(C=492.1132712266245, break_ties=False, cac..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."


In [8]:
def update_dict(name, predict, classifier, dictionary, y_true=None):
    """Score a set of predictions and store the result under ``name``.

    Parameters
    ----------
    name : hashable
        Key (and 'label' value) of the entry to create or overwrite.
    predict : array-like
        Predicted labels to evaluate.
    classifier : object
        The fitted model that produced ``predict``; stored as-is.
    dictionary : dict
        Mapping that receives the new entry (mutated in place).
    y_true : array-like, optional
        Ground-truth labels. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    dict
        The same ``dictionary`` object, after the update.
    """
    if y_true is None:
        y_true = y_test

    # scikit-learn metrics take (y_true, y_pred). Accuracy and binary F1 are
    # symmetric, but recall and precision are swapped if the order is reversed.
    # NOTE(review): entries loaded from earlier notebooks may have been computed
    # with the reversed order - confirm before comparing recall/precision.
    dictionary[name] = {
        'label': name,
        'accuracy': metrics.accuracy_score(y_true, predict),
        'recall': metrics.recall_score(y_true, predict),
        'precision': metrics.precision_score(y_true, predict),
        'F1': metrics.f1_score(y_true, predict),
        'predict': predict,
        'classifier': classifier
    }
    return dictionary
    
def update_score_dict(name, predict, classifier):
    """Add or overwrite the ``name`` entry in the notebook-wide ``metrics_dict``.

    Thin convenience wrapper around ``update_dict``; nothing is returned.
    """
    update_dict(
        name=name,
        predict=predict,
        classifier=classifier,
        dictionary=metrics_dict,
    )

# Bagging e boosting

In [9]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

In [10]:
def test_num_estimators(classifier, estimator, n_est=None):
    """Fit one ensemble per candidate size and collect its test-set scores.

    Parameters
    ----------
    classifier : callable
        Ensemble class (e.g. AdaBoostClassifier or BaggingClassifier) called as
        ``classifier(estimator, n_estimators=..., random_state=0)``.
    estimator : object
        Base estimator wrapped by the ensemble.
    n_est : iterable of int, optional
        Ensemble sizes to try. When empty or None a default list is used.

    Returns
    -------
    dict
        Maps each ensemble size to the entry produced by ``update_dict``.
    """
    num_estimators = n_est if n_est else [20, 25, 30, 40, 60, 80, 90, 100]
    results_dict = {}

    for size in num_estimators:
        # The base estimator is passed positionally: the keyword was renamed from
        # `base_estimator` to `estimator` in newer scikit-learn releases, while
        # the first positional slot works with both spellings.
        ensemble = classifier(estimator, n_estimators=size, random_state=0)
        ensemble.fit(x_train, y_train)
        update_dict(size, ensemble.predict(x_test), ensemble, results_dict)

    return results_dict

## Boosting

In [11]:
def test_num_estimators_for_boosting(estimator, n_est=None):
    """Run the ensemble-size sweep with AdaBoostClassifier around ``estimator``."""
    ensemble_cls = AdaBoostClassifier
    return test_num_estimators(ensemble_cls, estimator, n_est)

## Bagging

In [12]:
def test_num_estimators_for_bagging(estimator, n_est=None):
    """Run the ensemble-size sweep with BaggingClassifier around ``estimator``."""
    ensemble_cls = BaggingClassifier
    return test_num_estimators(ensemble_cls, estimator, n_est)

# LogisticRegression

In [13]:
# Import
from sklearn.linear_model import LogisticRegression
# Reuse the classifier stored under 'Logistic GS' in the loaded score dictionary
logisticR = metrics_dict['Logistic GS']['classifier']

## Boosting

In [None]:
# Ensemble sizes to try when boosting the logistic regression with AdaBoost
n_est_LR = [100, 200, 300, 350, 400, 410, 415, 420, 425, 450, 470, 500]
results_dict_LR = test_num_estimators_for_boosting(logisticR, n_est_LR)

In [None]:
# Rank the boosted logistic-regression runs by F1, lowest first
newlist_LR = sorted(results_dict_LR.values(), key=lambda entry: entry['F1'])
pd.DataFrame(newlist_LR)

In [None]:
# Hard-coded pick: the run with n_estimators=400.
# NOTE(review): presumably the top F1 in the table above - confirm after re-running the sweep.
adaboost_logistic_best = results_dict_LR[400]

In [None]:
# Record the selected AdaBoost run in metrics_dict, then display every entry
update_score_dict('Logistic GS Ada', adaboost_logistic_best['predict'], adaboost_logistic_best['classifier'])
pd.DataFrame(metrics_dict.values())

# Máquina de Vetores de Suporte

In [None]:
# Import
from sklearn.svm import SVC
# Reuse the classifier stored under 'SVM GS' in the loaded score dictionary
supportV = metrics_dict['SVM GS']['classifier']

## Bagging

In [None]:
# No sizes are passed, so the helper falls back to its default list of ensemble sizes
results_dict_SVM = test_num_estimators_for_bagging(supportV)

In [None]:
# Rank the bagged SVM runs by F1, lowest first
newlist_SVM = sorted(results_dict_SVM.values(), key=lambda entry: entry['F1'])
pd.DataFrame(newlist_SVM)

In [None]:
# Second bagging sweep for the SVM, this time with much larger ensembles
n_est_SVM = [300, 400, 500, 600]
results_dict_SVM_high = test_num_estimators_for_bagging(supportV, n_est_SVM)

In [None]:
# Rank the large-ensemble SVM bagging runs by F1, lowest first
newlist_SVM_high = sorted(results_dict_SVM_high.values(), key=lambda entry: entry['F1'])
pd.DataFrame(newlist_SVM_high)

In [None]:
# Hard-coded pick: the 20-estimator run from the first (default-sizes) sweep.
# NOTE(review): presumably the best F1 across both SVM sweeps - confirm after re-running.
bagging_svm_best = results_dict_SVM[20]

In [None]:
# Record the selected bagging run in metrics_dict, then display every entry
update_score_dict('SVM GS Bagging', bagging_svm_best['predict'], bagging_svm_best['classifier'])
pd.DataFrame(metrics_dict.values())

# Decision tree

In [None]:
# Import
from sklearn.tree import DecisionTreeClassifier
# Reuse the classifier stored under 'Decision GS' in the loaded score dictionary
decisionT = metrics_dict['Decision GS']['classifier']

## Boosting

In [None]:
# Ensemble sizes to try when boosting the decision tree with AdaBoost
n_est_DT = [20, 25, 30, 35, 37, 39, 40, 45, 50, 53, 55, 57, 60, 80, 90]
results_dict_DT = test_num_estimators_for_boosting(decisionT, n_est_DT)

In [None]:
# Rank the boosted decision-tree runs by F1, lowest first
newlist_DT = sorted(results_dict_DT.values(), key=lambda entry: entry['F1'])
pd.DataFrame(newlist_DT)

In [None]:
# Second AdaBoost sweep for the decision tree, covering 200 to 1000 estimators
n_est_DT_high = [200, 300, 500, 600, 680, 700, 800, 815, 820, 825, 850, 900, 1000]
results_dict_DT_high = test_num_estimators_for_boosting(decisionT, n_est_DT_high)

In [None]:
# Rank the large-ensemble decision-tree boosting runs by F1, lowest first
newlist_DT_high = sorted(results_dict_DT_high.values(), key=lambda entry: entry['F1'])
pd.DataFrame(newlist_DT_high)

In [None]:
# Hard-coded pick: the 820-estimator run from the large-ensemble sweep.
# NOTE(review): presumably the best F1 across both sweeps - confirm after re-running.
adaboost_decisionT_best = results_dict_DT_high[820]

In [None]:
# Record the selected AdaBoost run in metrics_dict, then display every entry
update_score_dict('Decision GS Ada', adaboost_decisionT_best['predict'], adaboost_decisionT_best['classifier'])
pd.DataFrame(metrics_dict.values())

# Redes Neurais

In [None]:
# Import
from sklearn.neural_network import MLPClassifier
# Reuse the classifier stored under 'Neural GS' in the loaded score dictionary
neuralN =  metrics_dict['Neural GS']['classifier']

## Bagging

In [None]:
# Ensemble sizes to try when bagging the neural network
n_est_NN = [10, 20, 23, 25, 30, 60, 65, 70, 80, 85, 100]
results_dict_NN = test_num_estimators_for_bagging(neuralN, n_est_NN)

In [None]:
# Rank the bagged neural-network runs by F1, lowest first
newlist_NN = sorted(results_dict_NN.values(), key=lambda entry: entry['F1'])
pd.DataFrame(newlist_NN)

In [None]:
# Second bagging sweep for the neural network, restricted to small ensembles (1 to 15)
n_est_NN_low = [1, 5, 7, 8, 9, 10, 13, 15]
results_dict_NN_low = test_num_estimators_for_bagging(neuralN, n_est_NN_low)

In [None]:
# Rank the small-ensemble neural-network bagging runs by F1, lowest first
newlist_NN_low = sorted(results_dict_NN_low.values(), key=lambda entry: entry['F1'])
pd.DataFrame(newlist_NN_low)

In [None]:
# Hard-coded pick: the 10-estimator run, taken from the first sweep's results.
# NOTE(review): presumably the best F1 across both sweeps - confirm after re-running.
bagging_neuralN_best = results_dict_NN[10]

In [None]:
# Record the selected bagging run in metrics_dict
update_score_dict('Neural GS Bagging', bagging_neuralN_best['predict'], bagging_neuralN_best['classifier'])

# Scores

In [None]:
# All recorded entries (loaded models plus the ensembles registered above), one row each
pd.DataFrame(metrics_dict.values())

In [None]:
# Rank every recorded model by F1, lowest first
newlist = sorted(metrics_dict.values(), key=lambda entry: entry['F1'])
f1_ordered = pd.DataFrame(newlist)
f1_ordered

In [None]:
# For each base model, print it side by side with its ensemble variant.
# Only the rows left after the drop (accuracy and F1) are shown.
for keys in [['Logistic GS', 'Logistic GS Ada'], ['SVM GS', 'SVM GS Bagging'], ['Decision GS', 'Decision GS Ada'], ['Neural GS', 'Neural GS Bagging']]:
    selected_methods_dict = {x:metrics_dict[x] for x in keys}
    
    print(pd.DataFrame(selected_methods_dict).drop(['label', 'predict', 'classifier', 'recall', 'precision']))

In [None]:
# Bar chart of the rows left after the drop (accuracy and F1), one series per model
pd.DataFrame(metrics_dict).drop(['label', 'recall', 'precision', 'predict', 'classifier']).plot(kind='bar', figsize=(9,3))

# Salvando resultados

In [None]:
# Entries to keep: the SVM is taken in its bagged form, the other three in their original form
keys = ['Logistic GS', 'SVM GS Bagging', 'Decision GS', 'Neural GS']
selected_methods_dict = {x:metrics_dict[x] for x in keys}

In [None]:
# Save the selected entries to disk
import pickle
filename = 'amostragem_scores'
# The context manager flushes and closes the file even if pickling raises
with open(filename, 'wb') as outfile:
    pickle.dump(selected_methods_dict, outfile)