# Titanic - Modelo

texto texto texto

In [219]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

from feature_engine import categorical_encoders as ce

plt.style.use("ggplot")

## Lê os dados

In [281]:
data_train = pd.read_csv("data/train-tratado.csv")
# Removendo o atributo fare dada a sua alta correlação com o atributo difFare
data_train = data_train.drop("Fare", axis = 1)

data_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked,difFare
0,0,3,male,22.000000,1,0,U,S,-0.8000
1,1,1,female,38.000000,1,0,C,C,10.9958
2,1,3,female,26.000000,0,0,U,S,-0.1250
3,1,1,female,35.000000,1,0,C,S,-7.1875
4,0,3,male,35.000000,0,0,U,S,0.0000
...,...,...,...,...,...,...,...,...,...
779,0,3,female,39.000000,0,5,U,Q,21.0750
780,1,1,female,19.000000,0,0,B,S,-30.2875
781,0,3,female,30.626179,1,2,U,S,15.4000
782,1,1,male,26.000000,0,0,C,C,-30.2875


In [282]:
data_teste = pd.read_csv("data/teste-tratado.csv")
data_teste = data_teste.drop("Fare", axis = 1)

data_teste

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked,difFare
0,892,3,male,34.500000,0,0,U,Q,-0.2208
1,893,3,female,47.000000,1,0,U,S,-1.0500
2,894,2,male,62.000000,0,0,U,Q,-9.0625
3,895,3,male,27.000000,0,0,U,S,0.6125
4,896,3,female,22.000000,1,1,U,S,4.2375
...,...,...,...,...,...,...,...,...,...
413,1305,3,male,29.858135,0,0,U,S,0.0000
414,1306,1,female,39.000000,0,0,C,C,48.6125
415,1307,3,male,38.500000,0,0,U,S,-0.8000
416,1308,3,male,29.858135,0,0,U,S,0.0000


In [283]:
data_train.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin',
       'Embarked', 'difFare'],
      dtype='object')

In [284]:
atributos_categoricos = ["Pclass", "Sex", "Parch", "SibSp", "Cabin", "Embarked"]

for atributo in atributos_categoricos:
    data_train[atributo] = data_train[atributo].astype("O")
    data_teste[atributo] = data_teste[atributo].astype("O")

In [285]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 784 entries, 0 to 783
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  784 non-null    int64  
 1   Pclass    784 non-null    object 
 2   Sex       784 non-null    object 
 3   Age       784 non-null    float64
 4   SibSp     784 non-null    object 
 5   Parch     784 non-null    object 
 6   Cabin     784 non-null    object 
 7   Embarked  784 non-null    object 
 8   difFare   784 non-null    float64
dtypes: float64(2), int64(1), object(6)
memory usage: 55.2+ KB


In [286]:
atributos_continuos = ["Age", "Fare", "difFare"]

## Separação no conjunto de treino para testar o modelo

In [287]:
X_train = data_train.drop("Survived", axis = 1)
y_train = data_train["Survived"]

## Aplicando o `frequency enconder` nos atributos categóricos

In [288]:
encoder = ce.CountFrequencyCategoricalEncoder(encoding_method = "frequency", variables = atributos_categoricos)

encoder

CountFrequencyCategoricalEncoder(encoding_method='frequency',
                                 variables=['Pclass', 'Sex', 'Parch', 'SibSp',
                                            'Cabin', 'Embarked'])

In [289]:
encoder.fit(X_train)

CountFrequencyCategoricalEncoder(encoding_method='frequency',
                                 variables=['Pclass', 'Sex', 'Parch', 'SibSp',
                                            'Cabin', 'Embarked'])

In [290]:
# Dados com enconder

train_t = encoder.transform(X_train)

In [291]:
train_t.sort_index()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked,difFare
0,0.516582,0.626276,22.000000,0.256378,0.739796,0.741071,0.727041,-0.8000
1,0.272959,0.373724,38.000000,0.256378,0.739796,0.075255,0.197704,10.9958
2,0.516582,0.373724,26.000000,0.659439,0.739796,0.741071,0.727041,-0.1250
3,0.272959,0.373724,35.000000,0.256378,0.739796,0.075255,0.727041,-7.1875
4,0.516582,0.626276,35.000000,0.659439,0.739796,0.741071,0.727041,0.0000
...,...,...,...,...,...,...,...,...
779,0.516582,0.373724,39.000000,0.659439,0.006378,0.741071,0.075255,21.0750
780,0.272959,0.373724,19.000000,0.659439,0.739796,0.058673,0.727041,-30.2875
781,0.516582,0.373724,30.626179,0.256378,0.095663,0.741071,0.727041,15.4000
782,0.272959,0.626276,26.000000,0.659439,0.739796,0.075255,0.197704,-30.2875


In [292]:
encoder.encoder_dict_

{'Pclass': {3: 0.5165816326530612,
  1: 0.2729591836734694,
  2: 0.21045918367346939},
 'Sex': {'male': 0.6262755102040817, 'female': 0.3737244897959184},
 'Parch': {0: 0.7397959183673469,
  1: 0.14540816326530612,
  2: 0.09566326530612244,
  5: 0.006377551020408163,
  3: 0.006377551020408163,
  4: 0.00510204081632653,
  6: 0.0012755102040816326},
 'SibSp': {0: 0.6594387755102041,
  1: 0.25637755102040816,
  2: 0.03443877551020408,
  4: 0.02295918367346939,
  3: 0.017857142857142856,
  5: 0.006377551020408163,
  8: 0.002551020408163265},
 'Cabin': {'U': 0.7410714285714286,
  'C': 0.07525510204081633,
  'B': 0.058673469387755105,
  'D': 0.042091836734693876,
  'E': 0.04081632653061224,
  'A': 0.01913265306122449,
  'F': 0.016581632653061226,
  'G': 0.00510204081632653,
  'T': 0.0012755102040816326},
 'Embarked': {'S': 0.7270408163265306,
  'C': 0.19770408163265307,
  'Q': 0.07525510204081633}}

## Naive Bayes Heterogêneo

## Decision Tree

## Random Forest

In [293]:
rnd_forest = RandomForestClassifier(random_state = 2)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4, 5, 6, 7, 8, 10],
    'criterion' :['gini', 'entropy']
}

CV_rndf = GridSearchCV(estimator = rnd_forest, param_grid = param_grid, cv = 5)

In [294]:
CV_rndf.fit(train_t, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8, 10],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [295]:
CV_rndf.best_params_

{'criterion': 'entropy',
 'max_depth': 6,
 'max_features': 'log2',
 'n_estimators': 200}

## Aplicando o melhor modelo encontrando em todo o conjunto de treino

In [296]:
# Separa o conjunto de dados

X_train = data_train.drop("Survived", axis = 1)
X_test = data_teste.drop("PassengerId", axis = 1)
y_train = data_train["Survived"]

In [297]:
# Aplica o enconder

encoder = ce.CountFrequencyCategoricalEncoder(encoding_method = "frequency", variables = atributos_categoricos)
encoder.fit(X_train)

# Dados com enconder

train_t = encoder.transform(X_train)
test_t = encoder.transform(X_test)

In [298]:
rnd_forest = RandomForestClassifier(random_state = 2, n_estimators = 200, criterion = "entropy", max_depth = 6, max_features = "log2")
rnd_forest = rnd_forest.fit(train_t, y_train)

rnd_forest

RandomForestClassifier(criterion='entropy', max_depth=6, max_features='log2',
                       n_estimators=200, random_state=2)

In [299]:
np.where(np.isnan(test_t.to_numpy()))

(array([342, 365]), array([4, 4]))

In [300]:
test_t.iloc[342]

Pclass       0.516582
Sex          0.626276
Age         29.858135
SibSp        0.256378
Parch             NaN
Cabin        0.741071
Embarked     0.727041
difFare     61.500000
Name: 342, dtype: float64

In [301]:
predictions = rnd_forest.predict(test_t.fillna(0))


In [302]:
data_teste_pred = data_teste.copy()
data_teste_pred["Survived"] = predictions

data_teste_pred

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked,difFare,Survived
0,892,3,male,34.500000,0,0,U,Q,-0.2208,0
1,893,3,female,47.000000,1,0,U,S,-1.0500,0
2,894,2,male,62.000000,0,0,U,Q,-9.0625,0
3,895,3,male,27.000000,0,0,U,S,0.6125,0
4,896,3,female,22.000000,1,1,U,S,4.2375,0
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,male,29.858135,0,0,U,S,0.0000,0
414,1306,1,female,39.000000,0,0,C,C,48.6125,1
415,1307,3,male,38.500000,0,0,U,S,-0.8000,0
416,1308,3,male,29.858135,0,0,U,S,0.0000,0


In [303]:
kaggle_file = data_teste_pred[["PassengerId", "Survived"]]

kaggle_file.to_csv("data/kaggle.csv", index = False)