# Titanic - Modelo

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.tree import DecisionTreeClassifier, plot_tree

from feature_engine import categorical_encoders as ce

plt.style.use("ggplot")

## Lê os dados
Nessa etapa, são importados os dados tratados durante a EDA.

In [2]:
data_train = pd.read_csv("data/train-tratado.csv")

data_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,difFare,SibSp_I,Parch_I
0,0,3,male,22.000000,1,0,7.2500,U,S,-0.8000,L,L
1,1,1,female,38.000000,1,0,71.2833,C,C,10.9958,L,L
2,1,3,female,26.000000,0,0,7.9250,U,S,-0.1250,L,L
3,1,1,female,35.000000,1,0,53.1000,C,S,-7.1875,L,L
4,0,3,male,35.000000,0,0,8.0500,U,S,0.0000,L,L
...,...,...,...,...,...,...,...,...,...,...,...,...
779,0,3,female,39.000000,0,5,29.1250,U,Q,21.0750,L,H
780,1,1,female,19.000000,0,0,30.0000,B,S,-30.2875,L,L
781,0,3,female,30.626179,1,2,23.4500,U,S,15.4000,L,M
782,1,1,male,26.000000,0,0,30.0000,C,C,-30.2875,L,L


In [3]:
data_teste = pd.read_csv("data/teste-tratado.csv")
data_teste = data_teste.drop("Fare", axis = 1)

data_teste

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked,difFare,SibSp_I,Parch_I
0,892,3,male,34.500000,0,0,U,Q,-0.2208,L,L
1,893,3,female,47.000000,1,0,U,S,-1.0500,L,L
2,894,2,male,62.000000,0,0,U,Q,-9.0625,L,L
3,895,3,male,27.000000,0,0,U,S,0.6125,L,L
4,896,3,female,22.000000,1,1,U,S,4.2375,L,L
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,male,29.858135,0,0,U,S,0.0000,L,L
414,1306,1,female,39.000000,0,0,C,C,48.6125,L,L
415,1307,3,male,38.500000,0,0,U,S,-0.8000,L,L
416,1308,3,male,29.858135,0,0,U,S,0.0000,L,L


In [4]:
data_train.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked', 'difFare', 'SibSp_I', 'Parch_I'],
      dtype='object')

In [5]:
atributos_categoricos = ["Pclass", "Sex", "Parch_I", "SibSp_I", "Cabin", "Embarked"]

for atributo in atributos_categoricos:
    data_train[atributo] = data_train[atributo].astype("O")
    data_teste[atributo] = data_teste[atributo].astype("O")

In [6]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 784 entries, 0 to 783
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  784 non-null    int64  
 1   Pclass    784 non-null    object 
 2   Sex       784 non-null    object 
 3   Age       784 non-null    float64
 4   SibSp     784 non-null    int64  
 5   Parch     784 non-null    int64  
 6   Fare      784 non-null    float64
 7   Cabin     784 non-null    object 
 8   Embarked  784 non-null    object 
 9   difFare   784 non-null    float64
 10  SibSp_I   784 non-null    object 
 11  Parch_I   784 non-null    object 
dtypes: float64(3), int64(3), object(6)
memory usage: 73.6+ KB


In [7]:
atributos_continuos = ["Age", "difFare"]

## Separação no conjunto de treino para testar o modelo

In [8]:
X_train = data_train[atributos_categoricos + atributos_continuos]
y_train = data_train["Survived"]

## Aplicando o `frequency enconder` nos atributos categóricos
O frequency encoder foi escolhido pois é eficiente com modelos lineares e modelos de árvore.

In [9]:
encoder = ce.CountFrequencyCategoricalEncoder(encoding_method = "frequency", variables = atributos_categoricos)

encoder

CountFrequencyCategoricalEncoder(encoding_method='frequency',
                                 variables=['Pclass', 'Sex', 'Parch_I',
                                            'SibSp_I', 'Cabin', 'Embarked'])

In [10]:
encoder.fit(X_train)

CountFrequencyCategoricalEncoder(encoding_method='frequency',
                                 variables=['Pclass', 'Sex', 'Parch_I',
                                            'SibSp_I', 'Cabin', 'Embarked'])

In [11]:
# Dados com enconder

train_t = encoder.transform(X_train)

In [12]:
train_t.sort_index()

Unnamed: 0,Pclass,Sex,Parch_I,SibSp_I,Cabin,Embarked,Age,difFare
0,0.516582,0.626276,0.885204,0.915816,0.741071,0.727041,22.000000,-0.8000
1,0.272959,0.373724,0.885204,0.915816,0.075255,0.197704,38.000000,10.9958
2,0.516582,0.373724,0.885204,0.915816,0.741071,0.727041,26.000000,-0.1250
3,0.272959,0.373724,0.885204,0.915816,0.075255,0.727041,35.000000,-7.1875
4,0.516582,0.626276,0.885204,0.915816,0.741071,0.727041,35.000000,0.0000
...,...,...,...,...,...,...,...,...
779,0.516582,0.373724,0.012755,0.915816,0.741071,0.075255,39.000000,21.0750
780,0.272959,0.373724,0.885204,0.915816,0.058673,0.727041,19.000000,-30.2875
781,0.516582,0.373724,0.102041,0.915816,0.741071,0.727041,30.626179,15.4000
782,0.272959,0.626276,0.885204,0.915816,0.075255,0.197704,26.000000,-30.2875


In [13]:
encoder.encoder_dict_

{'Pclass': {3: 0.5165816326530612,
  1: 0.2729591836734694,
  2: 0.21045918367346939},
 'Sex': {'male': 0.6262755102040817, 'female': 0.3737244897959184},
 'Parch_I': {'L': 0.8852040816326531,
  'M': 0.10204081632653061,
  'H': 0.012755102040816327},
 'SibSp_I': {'L': 0.9158163265306123,
  'M': 0.05229591836734694,
  'H': 0.03188775510204082},
 'Cabin': {'U': 0.7410714285714286,
  'C': 0.07525510204081633,
  'B': 0.058673469387755105,
  'D': 0.042091836734693876,
  'E': 0.04081632653061224,
  'A': 0.01913265306122449,
  'F': 0.016581632653061226,
  'G': 0.00510204081632653,
  'T': 0.0012755102040816326},
 'Embarked': {'S': 0.7270408163265306,
  'C': 0.19770408163265307,
  'Q': 0.07525510204081633}}

## Random Forest
Esse modelo foi escolhido pois lida facilmente com outliers, e possui se associa features importantes.
O Grid Search, foi utilizado em conjunto com cross validation para otimizar os hiperparâmetros.

In [14]:
rnd_forest = RandomForestClassifier(random_state = 2)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4, 5, 6, 7, 8, 10],
    'criterion' : ['gini', 'entropy']
}

CV_rndf = GridSearchCV(estimator = rnd_forest, param_grid = param_grid, cv = 5)

In [15]:
CV_rndf.fit(train_t, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8, 10],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [16]:
CV_rndf.best_params_

{'criterion': 'gini',
 'max_depth': 6,
 'max_features': 'log2',
 'n_estimators': 500}

Com isso obtemos os melhores parâmetros para o modelo.

## Aplicando o melhor modelo encontrando em todo o conjunto de treino
Nesse segmento, o encoding foi aplicado e o modelo teve o fit feito com os dados de treino, depois foram geradas as predições para a submissão.

In [17]:
# Separa o conjunto de dados

X_train = data_train[atributos_categoricos + atributos_continuos]
X_test = data_teste[atributos_categoricos + atributos_continuos]
y_train = data_train["Survived"]

In [18]:
# Aplica o enconder

encoder = ce.CountFrequencyCategoricalEncoder(encoding_method = "frequency", variables = atributos_categoricos)
encoder.fit(X_train)

# Dados com enconder

train_t = encoder.transform(X_train)
test_t = encoder.transform(X_test)

In [19]:
rnd_forest = RandomForestClassifier(random_state = 2, n_estimators = 500, criterion = "gini", max_depth = 6, max_features = "log2")
rnd_forest = rnd_forest.fit(train_t, y_train)

rnd_forest

RandomForestClassifier(max_depth=6, max_features='log2', n_estimators=500,
                       random_state=2)

In [20]:
result = permutation_importance(rnd_forest, train_t, y_train, n_repeats = 5, random_state = 2)

In [21]:
feature_importance = result.importances_mean
features = list(X_train.columns)

for i in range(len(features)):
    print("{}: {}".format(features[i], feature_importance[i]))

Pclass: 0.056887755102040784
Sex: 0.1923469387755102
Parch_I: 0.0045918367346938545
SibSp_I: 0.0033163265306122235
Cabin: 0.033418367346938746
Embarked: 0.003826530612244872
Age: 0.08877551020408163
difFare: 0.03724489795918366


In [22]:
np.where(np.isnan(test_t.to_numpy()))

(array([], dtype=int64), array([], dtype=int64))

In [23]:
predictions = rnd_forest.predict(test_t.fillna(0))


In [24]:
data_teste_pred = data_teste.copy()
data_teste_pred["Survived"] = predictions

data_teste_pred

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked,difFare,SibSp_I,Parch_I,Survived
0,892,3,male,34.500000,0,0,U,Q,-0.2208,L,L,0
1,893,3,female,47.000000,1,0,U,S,-1.0500,L,L,0
2,894,2,male,62.000000,0,0,U,Q,-9.0625,L,L,0
3,895,3,male,27.000000,0,0,U,S,0.6125,L,L,0
4,896,3,female,22.000000,1,1,U,S,4.2375,L,L,1
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,male,29.858135,0,0,U,S,0.0000,L,L,0
414,1306,1,female,39.000000,0,0,C,C,48.6125,L,L,1
415,1307,3,male,38.500000,0,0,U,S,-0.8000,L,L,0
416,1308,3,male,29.858135,0,0,U,S,0.0000,L,L,0


In [25]:
kaggle_file = data_teste_pred[["PassengerId", "Survived"]]

kaggle_file.to_csv("data/kaggle.csv", index = False)

Conseguimos obter uma acurácia de $78\%$ com esse modelo.