In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import metrics

# Base

In [2]:
# Importando a base
train = pd.read_csv('train_treated.csv', index_col=0)

## Dividindo entre treino e teste

In [3]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train.drop(columns=['Survived']), train['Survived'], test_size=0.25, random_state=0)

# Importando os modelos 

In [4]:
import pickle
filename = 'amostragem_scores'
infile = open(filename,'rb')
metrics_dict = pickle.load(infile)
infile.close()

In [5]:
metrics_dict.keys()

dict_keys(['Ada Logistic', 'Bagging SVM', 'Ada Decision Tree', 'Bagging Neural'])

In [6]:
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Ada Logistic,0.807175,0.753086,0.72619,0.739394,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(LogisticRegression(C=1.0, class_weight=None, ..."
1,Bagging SVM,0.784753,0.719512,0.702381,0.710843,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=300, break_ties=False, cache_size=200, ..."
2,Ada Decision Tree,0.807175,0.759494,0.714286,0.736196,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(DecisionTreeClassifier(ccp_alpha=0.0, class_w..."
3,Bagging Neural,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","(MLPClassifier(activation='relu', alpha=0.0001..."


In [7]:
def update_dict(name, predict, classifier, dictionary):
    dictionary[name] = {
        'label': name, 
        'accuracy': metrics.accuracy_score(predict, y_test), 
        'recall': metrics.recall_score(predict, y_test), 
        'precision': metrics.precision_score(predict, y_test), 
        'F1': metrics.f1_score(predict, y_test),
        'predict': predict,
        'classifier': classifier
    }
    return dictionary
    
def update_score_dict(name, predict, classifier):
    update_dict(name, predict, classifier, metrics_dict)

# Teste com votação Maioria Simples/Uniforme 

In [8]:
ensemble_predict = []
for i in range(len(x_test)):
    votes = [0, 0]
    for metric_key in metrics_dict.keys():
        predict = metrics_dict[metric_key]['predict']
        index = predict[i] if isinstance(predict[i], int) else int(predict[i].round())
        votes[index] += 1
    ensemble_predict.append(votes.index(max(votes)))

In [9]:
metrics.accuracy_score(ensemble_predict, y_test)

0.7982062780269058

In [10]:
metrics.f1_score(ensemble_predict, y_test)

0.7239263803680981

# Classificador de Voting 

In [11]:
from sklearn.ensemble import VotingClassifier

In [12]:
estimators=[
    ('Logistic', metrics_dict['Ada Logistic']['classifier']),
    ('SVM', metrics_dict['Bagging SVM']['classifier']), 
    ('Decision', metrics_dict['Ada Decision Tree']['classifier']), 
    ('Neural', metrics_dict['Bagging Neural']['classifier'])
]

## Hard

In [13]:
voting_hard = VotingClassifier(estimators=estimators, voting='hard')

In [14]:
voting_hard.fit(x_train, y_train)
voting_predict_hard = voting_hard.predict(x_test)

In [15]:
metrics.accuracy_score(voting_predict_hard, y_test)

0.7982062780269058

In [16]:
metrics.f1_score(voting_predict_hard, y_test)

0.7239263803680981

O resultado o Voting classifier com parametro 'hard' é igual ao criado por mim no item acima

In [17]:
update_score_dict('Votação Simples (hard)', voting_predict_hard, voting_hard)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Ada Logistic,0.807175,0.753086,0.72619,0.739394,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(LogisticRegression(C=1.0, class_weight=None, ..."
1,Bagging SVM,0.784753,0.719512,0.702381,0.710843,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=300, break_ties=False, cache_size=200, ..."
2,Ada Decision Tree,0.807175,0.759494,0.714286,0.736196,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(DecisionTreeClassifier(ccp_alpha=0.0, class_w..."
3,Bagging Neural,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","(MLPClassifier(activation='relu', alpha=0.0001..."
4,Votação Simples (hard),0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


## Soft

In [18]:
voting_soft = VotingClassifier(estimators=estimators, voting='soft')

In [19]:
voting_soft.fit(x_train, y_train)
voting_predict_soft = voting_soft.predict(x_test)

In [20]:
update_score_dict('Votação Simples (soft)', voting_predict_soft, voting_soft)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Ada Logistic,0.807175,0.753086,0.72619,0.739394,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(LogisticRegression(C=1.0, class_weight=None, ..."
1,Bagging SVM,0.784753,0.719512,0.702381,0.710843,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=300, break_ties=False, cache_size=200, ..."
2,Ada Decision Tree,0.807175,0.759494,0.714286,0.736196,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(DecisionTreeClassifier(ccp_alpha=0.0, class_w..."
3,Bagging Neural,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","(MLPClassifier(activation='relu', alpha=0.0001..."
4,Votação Simples (hard),0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
5,Votação Simples (soft),0.802691,0.738095,0.738095,0.738095,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


O resultado do classifier com 'soft' foi um pouco melhor

In [21]:
# Ordered by F1
f1_list = sorted(list(metrics_dict.values()), key=lambda k: k['F1']) 
f1_ordered = pd.DataFrame(f1_list)
f1_ordered

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Bagging SVM,0.784753,0.719512,0.702381,0.710843,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=300, break_ties=False, cache_size=200, ..."
1,Votação Simples (hard),0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
2,Ada Decision Tree,0.807175,0.759494,0.714286,0.736196,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(DecisionTreeClassifier(ccp_alpha=0.0, class_w..."
3,Votação Simples (soft),0.802691,0.738095,0.738095,0.738095,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
4,Ada Logistic,0.807175,0.753086,0.72619,0.739394,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(LogisticRegression(C=1.0, class_weight=None, ..."
5,Bagging Neural,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","(MLPClassifier(activation='relu', alpha=0.0001..."


In [22]:
# Ordered by Accuracy
accuracy_list = sorted(list(metrics_dict.values()), key=lambda k: k['accuracy']) 
accuracy_ordered = pd.DataFrame(accuracy_list)
accuracy_ordered

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Bagging SVM,0.784753,0.719512,0.702381,0.710843,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=300, break_ties=False, cache_size=200, ..."
1,Votação Simples (hard),0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
2,Votação Simples (soft),0.802691,0.738095,0.738095,0.738095,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
3,Ada Logistic,0.807175,0.753086,0.72619,0.739394,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(LogisticRegression(C=1.0, class_weight=None, ..."
4,Ada Decision Tree,0.807175,0.759494,0.714286,0.736196,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(DecisionTreeClassifier(ccp_alpha=0.0, class_w..."
5,Bagging Neural,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","(MLPClassifier(activation='relu', alpha=0.0001..."


Mas ainda não melhor dentre todos os classificadores

# Votação com pesos (naive)

In [23]:
# colocando pesos maiores nos melhores classificadores baseados no F1
voting_weighted_naive = VotingClassifier(estimators=estimators, weights=[1.5, 0.5, 1, 2], voting='hard')

In [24]:
voting_weighted_naive.fit(x_train, y_train)
voting_weighted_naive_predict = voting_weighted_naive.predict(x_test)

In [25]:
update_score_dict('Votação com pesos naive (hard)', voting_weighted_naive_predict, voting_weighted_naive)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Ada Logistic,0.807175,0.753086,0.72619,0.739394,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(LogisticRegression(C=1.0, class_weight=None, ..."
1,Bagging SVM,0.784753,0.719512,0.702381,0.710843,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=300, break_ties=False, cache_size=200, ..."
2,Ada Decision Tree,0.807175,0.759494,0.714286,0.736196,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(DecisionTreeClassifier(ccp_alpha=0.0, class_w..."
3,Bagging Neural,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","(MLPClassifier(activation='relu', alpha=0.0001..."
4,Votação Simples (hard),0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
5,Votação Simples (soft),0.802691,0.738095,0.738095,0.738095,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
6,Votação com pesos naive (hard),0.802691,0.743902,0.72619,0.73494,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


# Votação com pesos (normalized)

In [27]:
data = [
    metrics_dict['Ada Logistic']['F1'],
    metrics_dict['Bagging SVM']['F1'], 
    metrics_dict['Ada Decision Tree']['F1'], 
    metrics_dict['Bagging Neural']['F1']
]
data = np.expand_dims(data, 1)

In [28]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(data)    
ret = scaler.transform(data)
ret = np.squeeze(ret)
ret += 1
ret

array([1.90145648, 1.        , 1.80049471, 2.        ])

In [29]:
# colocando pesos maiores nos melhores classificadores baseados no F1 (com normalizacao)
voting_weighted_normalized = VotingClassifier(estimators=estimators, weights=ret, voting='hard')

In [30]:
voting_weighted_normalized.fit(x_train, y_train)
voting_weighted_normalized_predict = voting_weighted_normalized.predict(x_test)

In [31]:
update_score_dict('Votação com pesos normalizado (hard)', voting_weighted_normalized_predict, voting_weighted_normalized)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Ada Logistic,0.807175,0.753086,0.72619,0.739394,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(LogisticRegression(C=1.0, class_weight=None, ..."
1,Bagging SVM,0.784753,0.719512,0.702381,0.710843,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=300, break_ties=False, cache_size=200, ..."
2,Ada Decision Tree,0.807175,0.759494,0.714286,0.736196,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(DecisionTreeClassifier(ccp_alpha=0.0, class_w..."
3,Bagging Neural,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","(MLPClassifier(activation='relu', alpha=0.0001..."
4,Votação Simples (hard),0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
5,Votação Simples (soft),0.802691,0.738095,0.738095,0.738095,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
6,Votação com pesos naive (hard),0.802691,0.743902,0.72619,0.73494,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
7,Votação com pesos normalizado (hard),0.802691,0.738095,0.738095,0.738095,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


# Score

In [32]:
# Ordered by F1
f1_list = sorted(list(metrics_dict.values()), key=lambda k: k['F1']) 
f1_ordered = pd.DataFrame(f1_list)
f1_ordered

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Bagging SVM,0.784753,0.719512,0.702381,0.710843,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=300, break_ties=False, cache_size=200, ..."
1,Votação Simples (hard),0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
2,Votação com pesos naive (hard),0.802691,0.743902,0.72619,0.73494,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
3,Ada Decision Tree,0.807175,0.759494,0.714286,0.736196,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(DecisionTreeClassifier(ccp_alpha=0.0, class_w..."
4,Votação Simples (soft),0.802691,0.738095,0.738095,0.738095,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
5,Votação com pesos normalizado (hard),0.802691,0.738095,0.738095,0.738095,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
6,Ada Logistic,0.807175,0.753086,0.72619,0.739394,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(LogisticRegression(C=1.0, class_weight=None, ..."
7,Bagging Neural,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","(MLPClassifier(activation='relu', alpha=0.0001..."
