In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import metrics

# Base

In [2]:
# Importando a base
train = pd.read_csv('train_treated.csv', index_col=0)

In [3]:
train.sample(5)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
808,0,3,1,18.0,0,0,7.775,0,0,1
373,0,3,0,19.0,0,0,8.05,0,0,1
417,1,2,1,34.0,1,1,32.5,0,0,1
411,0,3,0,29.699118,0,0,7.8958,0,0,1
509,0,3,0,28.0,0,0,22.525,0,0,1


## Dividindo entre treino e teste

In [4]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train.drop(columns=['Survived']), train['Survived'], test_size=0.25, random_state=0)

# Criando dicionario de scores

In [5]:
# Importando os melhores modelos pós amostragem
import pickle
filename = 'amostragem_scores'
infile = open(filename,'rb')
metrics_dict = pickle.load(infile)
infile.close()

In [6]:
metrics_dict.keys()

dict_keys(['Logistic GS', 'SVM GS Bagging', 'Decision GS', 'Neural GS'])

In [7]:
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS Bagging,0.789238,0.728395,0.702381,0.715152,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=492.1132712266245, break_ties=False, ca..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."


In [8]:
def update_dict(name, predict, classifier, dictionary):
    dictionary[name] = {
        'label': name, 
        'accuracy': metrics.accuracy_score(predict, y_test), 
        'recall': metrics.recall_score(predict, y_test), 
        'precision': metrics.precision_score(predict, y_test), 
        'F1': metrics.f1_score(predict, y_test),
        'predict': predict,
        'classifier': classifier
    }
    return dictionary
    
def update_score_dict(name, predict, classifier):
    update_dict(name, predict, classifier, metrics_dict)

# Métodos de Combinação

## Votação Maioria Simples/Uniforme (Implementado)

In [9]:
def SimpleEnsemble():
    ensemble_predict = []
    for i in range(len(x_test)):
        votes = [0, 0]
        for metric_key in metrics_dict.keys():
            predict = metrics_dict[metric_key]['predict']
            index = predict[i] if isinstance(predict[i], int) else int(predict[i].round())
            votes[index] += 1
        ensemble_predict.append(votes.index(max(votes)))
    return ensemble_predict

In [10]:
simpleEnsemble_predict = SimpleEnsemble()

In [11]:
update_score_dict('Simple Voting', simpleEnsemble_predict, None)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS Bagging,0.789238,0.728395,0.702381,0.715152,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=492.1132712266245, break_ties=False, ca..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",


## Classificador de Voting 

In [12]:
from sklearn.ensemble import VotingClassifier
estimators=[
    ('Logistic', metrics_dict['Logistic GS']['classifier']),
    ('SVM', metrics_dict['SVM GS Bagging']['classifier']), 
    ('Decision', metrics_dict['Decision GS']['classifier']), 
    ('Neural', metrics_dict['Neural GS']['classifier'])
]

### Hard

In [13]:
voting_hard = VotingClassifier(estimators=estimators, voting='hard')
voting_hard.fit(x_train, y_train)
voting_hard_predict = voting_hard.predict(x_test)

In [14]:
update_score_dict('VC Hard', voting_hard_predict, voting_hard)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS Bagging,0.789238,0.728395,0.702381,0.715152,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=492.1132712266245, break_ties=False, ca..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
5,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


### Soft

In [15]:
voting_soft = VotingClassifier(estimators=estimators, voting='soft')
voting_soft.fit(x_train, y_train)
voting_soft_predict = voting_soft.predict(x_test)

In [16]:
update_score_dict('VC Soft', voting_soft_predict, voting_soft)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS Bagging,0.789238,0.728395,0.702381,0.715152,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=492.1132712266245, break_ties=False, ca..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
5,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
6,VC Soft,0.811659,0.769231,0.714286,0.740741,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


### Pesos Normalizados

In [17]:
# colocando pesos normalizados nos melhores classificadores baseados no F1

In [18]:
data = [
    metrics_dict['Logistic GS']['F1'],
    metrics_dict['SVM GS Bagging']['F1'], 
    metrics_dict['Decision GS']['F1'], 
    metrics_dict['Neural GS']['F1']
]

In [19]:
from sklearn.preprocessing import MinMaxScaler
data = np.expand_dims(data, 1)
scaler = MinMaxScaler()
scaler.fit(data)    
ret = scaler.transform(data)
ret = np.squeeze(ret)
ret += 1
ret

array([1.07373691, 1.        , 2.        , 2.        ])

#### Hard

In [20]:
voting_hard_weighted = VotingClassifier(estimators=estimators, weights=ret, voting='hard')
voting_hard_weighted.fit(x_train, y_train)
voting_hard_weighted_predict = voting_hard_weighted.predict(x_test)

In [21]:
update_score_dict('VC Hard Weighted', voting_hard_weighted_predict, voting_hard_weighted)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS Bagging,0.789238,0.728395,0.702381,0.715152,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=492.1132712266245, break_ties=False, ca..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
5,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
6,VC Soft,0.811659,0.769231,0.714286,0.740741,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
7,VC Hard Weighted,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


#### Soft

In [22]:
voting_soft_weighted = VotingClassifier(estimators=estimators, weights=ret, voting='soft')
voting_soft_weighted.fit(x_train, y_train)
voting_soft_weighted_predict = voting_soft_weighted.predict(x_test)

In [23]:
update_score_dict('VC Soft Weighted', voting_soft_weighted_predict, voting_soft_weighted)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS Bagging,0.789238,0.728395,0.702381,0.715152,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=492.1132712266245, break_ties=False, ca..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
5,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
6,VC Soft,0.811659,0.769231,0.714286,0.740741,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
7,VC Hard Weighted,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
8,VC Soft Weighted,0.816143,0.772152,0.72619,0.748466,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


## Sem SVM, utilizando o melhor método

In [24]:
estimators_no_SVM = [
    ('Logistic', metrics_dict['Logistic GS']['classifier']),
    ('Decision', metrics_dict['Decision GS']['classifier']), 
    ('Neural', metrics_dict['Neural GS']['classifier'])
]

In [25]:
data_no_SVM = [
    metrics_dict['Logistic GS']['F1'],
    metrics_dict['Decision GS']['F1'], 
    metrics_dict['Neural GS']['F1']
]
data_no_SVM = np.expand_dims(data_no_SVM, 1)
scaler.fit(data_no_SVM)    
ret_no_SVM = scaler.transform(data_no_SVM)
ret_no_SVM = np.squeeze(ret_no_SVM)
ret_no_SVM += 1
ret_no_SVM

array([1., 2., 2.])

#### Soft

In [26]:
voting_soft_weighted = VotingClassifier(estimators=estimators_no_SVM, weights=ret_no_SVM, voting='soft')
voting_soft_weighted.fit(x_train, y_train)
voting_soft_weighted_predict = voting_soft_weighted.predict(x_test)

In [27]:
update_score_dict('VC Soft Weighted w/o SVM', voting_soft_weighted_predict, voting_soft_weighted)
pd.DataFrame(metrics_dict.values())

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
1,SVM GS Bagging,0.789238,0.728395,0.702381,0.715152,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=492.1132712266245, break_ties=False, ca..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
5,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
6,VC Soft,0.811659,0.769231,0.714286,0.740741,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
7,VC Hard Weighted,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
8,VC Soft Weighted,0.816143,0.772152,0.72619,0.748466,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
9,VC Soft Weighted w/o SVM,0.820628,0.775,0.738095,0.756098,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


## Scores

In [28]:
# Ordered by F1
newlist = sorted(list(metrics_dict.values()), key=lambda k: k['F1']) 
f1_ordered = pd.DataFrame(newlist)
f1_ordered

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,SVM GS Bagging,0.789238,0.728395,0.702381,0.715152,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=492.1132712266245, break_ties=False, ca..."
1,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
2,VC Hard,0.798206,0.746835,0.702381,0.723926,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
3,VC Soft,0.811659,0.769231,0.714286,0.740741,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
4,VC Hard Weighted,0.807175,0.746988,0.738095,0.742515,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
5,Simple Voting,0.816143,0.779221,0.714286,0.745342,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...",
6,VC Soft Weighted,0.816143,0.772152,0.72619,0.748466,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
7,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
8,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
9,VC Soft Weighted w/o SVM,0.820628,0.775,0.738095,0.756098,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."


## Best ensemble score vs models

In [33]:
for item in ['VC Hard', 'VC Soft','VC Hard Weighted', 'Simple Voting', 'VC Soft Weighted']:
    metrics_dict.pop(item)

In [34]:
# Ordered by F1
newlist = sorted(list(metrics_dict.values()), key=lambda k: k['F1']) 
f1_ordered = pd.DataFrame(newlist)
f1_ordered

Unnamed: 0,label,accuracy,recall,precision,F1,predict,classifier
0,SVM GS Bagging,0.789238,0.728395,0.702381,0.715152,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, ...","(SVC(C=492.1132712266245, break_ties=False, ca..."
1,Logistic GS,0.802691,0.777778,0.666667,0.717949,"[0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","LogisticRegression(C=0.045227288910538066, cla..."
2,Decision GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","DecisionTreeClassifier(ccp_alpha=0.0, class_we..."
3,Neural GS,0.820628,0.782051,0.72619,0.753086,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","MLPClassifier(activation='relu', alpha=0.01, b..."
4,VC Soft Weighted w/o SVM,0.820628,0.775,0.738095,0.756098,"[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, ...","VotingClassifier(estimators=[('Logistic',\n ..."
