In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import metrics

# Base

In [2]:
# Importando a base
train = pd.read_csv('train_treated.csv', index_col=0)

In [3]:
train.sample(5)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
50,0,3,1,18.0,1,0,17.8,0,0,1
546,0,1,0,64.0,0,0,26.0,0,0,1
87,0,3,0,16.0,1,3,34.375,0,0,1
162,1,2,1,40.0,0,0,15.75,0,0,1
185,1,3,1,4.0,0,2,22.025,0,0,1


## Dividindo entre treino e teste

In [4]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train.drop(columns=['Survived']), train['Survived'], test_size=0.25, random_state=0)

# Criando dicionario de scores

In [29]:
# Importando os melhores modelos pós gridsearch
import pickle
filename = 'gridsearch_scores'
infile = open(filename,'rb')
metrics_dict = pickle.load(infile)
infile.close()

In [30]:
metrics_dict.keys()

dict_keys(['Logistic GS', 'SVM GS', 'Decision GS', 'Neural GS'])

In [31]:
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS,0.784753,0.714286,0.714286,0.714286,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","SVC(C=492.1132712266245, break_ties=False, cac..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."


In [32]:
def update_dict(name, predict, classifier, dictionary):
    dictionary[name] = {
        'label': name, 
        'accuracy': metrics.accuracy_score(predict, y_test), 
        'recall': metrics.recall_score(predict, y_test), 
        'precision': metrics.precision_score(predict, y_test), 
        'F1': metrics.f1_score(predict, y_test),
        'predict': predict,
        'classifier': classifier
    }
    return dictionary
    
def update_score_dict(name, predict, classifier):
    update_dict(name, predict, classifier, metrics_dict)

# Métodos de Combinação

## Votação Maioria Simples/Uniforme (Implementado)

In [33]:
def SimpleEnsemble():
    ensemble_predict = []
    for i in range(len(x_test)):
        votes = [0, 0]
        for metric_key in metrics_dict.keys():
            predict = metrics_dict[metric_key]['predict']
            index = predict[i] if isinstance(predict[i], int) else int(predict[i].round())
            votes[index] += 1
        ensemble_predict.append(votes.index(max(votes)))
    return ensemble_predict

In [34]:
simpleEnsemble_predict = SimpleEnsemble()

In [35]:
update_score_dict('Simple Voting', simpleEnsemble_predict, None)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS,0.784753,0.714286,0.714286,0.714286,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","SVC(C=492.1132712266245, break_ties=False, cac..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",


## Classificador de Voting 

In [36]:
from sklearn.ensemble import VotingClassifier
estimators=[
    ('Logistic', metrics_dict['Logistic GS']['classifier']),
    ('SVM', metrics_dict['SVM GS']['classifier']), 
    ('Decision', metrics_dict['Decision GS']['classifier']), 
    ('Neural', metrics_dict['Neural GS']['classifier'])
]

### Hard

In [37]:
voting_hard = VotingClassifier(estimators=estimators, voting='hard')
voting_hard.fit(x_train, y_train)
voting_hard_predict = voting_hard.predict(x_test)

In [38]:
update_score_dict('VC Hard', voting_hard_predict, voting_hard)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS,0.784753,0.714286,0.714286,0.714286,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","SVC(C=492.1132712266245, break_ties=False, cac..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
5,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",


### Soft

In [39]:
voting_soft = VotingClassifier(estimators=estimators, voting='soft')
voting_soft.fit(x_train, y_train)
voting_soft_predict = voting_soft.predict(x_test)

In [40]:
update_score_dict('VC Soft', voting_soft_predict, voting_soft)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS,0.784753,0.714286,0.714286,0.714286,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","SVC(C=492.1132712266245, break_ties=False, cac..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
5,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
6,VC Soft,0.811659,0.7625,0.72619,0.743902,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",


### Pesos Normalizados

In [45]:
# colocando pesos normalizados nos melhores classificadores baseados no F1

In [41]:
data = [
    metrics_dict['Logistic GS']['F1'],
    metrics_dict['SVM GS']['F1'], 
    metrics_dict['Decision GS']['F1'], 
    metrics_dict['Neural GS']['F1']
]

In [42]:
from sklearn.preprocessing import MinMaxScaler
data = np.expand_dims(data, 1)
scaler = MinMaxScaler()
scaler.fit(data)    
ret = scaler.transform(data)
ret = np.squeeze(ret)
ret += 1
ret

array([1.09440559, 1.        , 2.        , 2.        ])

#### Hard

In [44]:
voting_hard_weighted = VotingClassifier(estimators=estimators, weights=ret, voting='hard')
voting_hard_weighted.fit(x_train, y_train)
voting_hard_weighted_predict = voting_hard_weighted.predict(x_test)

In [45]:
update_score_dict('VC Hard Weighted', voting_hard_weighted_predict, voting_hard_weighted)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS,0.784753,0.714286,0.714286,0.714286,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","SVC(C=492.1132712266245, break_ties=False, cac..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
5,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
6,VC Soft,0.811659,0.7625,0.72619,0.743902,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
7,VC Hard Weighted,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


#### Soft

In [47]:
voting_soft_weighted = VotingClassifier(estimators=estimators, weights=ret, voting='soft')
voting_soft_weighted.fit(x_train, y_train)
voting_soft_weighted_predict = voting_soft_weighted.predict(x_test)

In [48]:
update_score_dict('VC Soft Weighted', voting_soft_weighted_predict, voting_soft_weighted)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS,0.784753,0.714286,0.714286,0.714286,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","SVC(C=492.1132712266245, break_ties=False, cac..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
5,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
6,VC Soft,0.811659,0.7625,0.72619,0.743902,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
7,VC Hard Weighted,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
8,VC Soft Weighted,0.816143,0.772152,0.72619,0.748466,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


## Scores

In [54]:
# Ordered by F1
newlist = sorted(list(metrics_dict.values()), key=lambda k: k['F1']) 
f1_ordered = pd.DataFrame(newlist)
f1_ordered

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,SVM GS,0.784753,0.714286,0.714286,0.714286,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","SVC(C=492.1132712266245, break_ties=False, cac..."
1,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
2,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
3,VC Hard Weighted,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
4,VC Soft,0.811659,0.7625,0.72619,0.743902,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
5,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
6,VC Soft Weighted,0.816143,0.772152,0.72619,0.748466,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
7,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
8,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."


## Teste sem SVM, utilizando o melhor método

In [56]:
estimators_no_SVM = [
    ('Logistic', metrics_dict['Logistic GS']['classifier']),
    ('Decision', metrics_dict['Decision GS']['classifier']), 
    ('Neural', metrics_dict['Neural GS']['classifier'])
]


In [59]:
data_no_SVM = [
    metrics_dict['Logistic GS']['F1'],
    metrics_dict['Decision GS']['F1'], 
    metrics_dict['Neural GS']['F1']
]
data_no_SVM = np.expand_dims(data_no_SVM, 1)
scaler.fit(data_no_SVM)    
ret_no_SVM = scaler.transform(data_no_SVM)
ret_no_SVM = np.squeeze(ret_no_SVM)
ret_no_SVM += 1
ret_no_SVM

array([1., 2., 2.])

#### Soft

In [60]:
voting_soft_weighted = VotingClassifier(estimators=estimators_no_SVM, weights=ret_no_SVM, voting='soft')
voting_soft_weighted.fit(x_train, y_train)
voting_soft_weighted_predict = voting_soft_weighted.predict(x_test)

In [61]:
update_score_dict('VC Soft Weighted w/o SVM', voting_soft_weighted_predict, voting_soft_weighted)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS,0.784753,0.714286,0.714286,0.714286,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","SVC(C=492.1132712266245, break_ties=False, cac..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
5,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
6,VC Soft,0.811659,0.7625,0.72619,0.743902,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
7,VC Hard Weighted,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
8,VC Soft Weighted,0.816143,0.772152,0.72619,0.748466,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
9,VC Soft Weighted w/o SVM,0.820628,0.775,0.738095,0.756098,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
