# Titanic Disaster - Kaggle Challenge

## Carregar dados

In [1]:
import random

import numpy
import pandas

# Seed every global random source so a fresh "Restart & Run All" is reproducible.
seed = 42
random.seed(seed)
# Estimators created without an explicit random_state fall back to NumPy's
# global generator, which random.seed() does not touch.
numpy.random.seed(seed)

train = pandas.read_csv('./train.csv')
test = pandas.read_csv('./test.csv')
# Kept aside: the submission file pairs each prediction with its passenger id.
test_ids = test['PassengerId']

# Split the label off the feature frame explicitly, so the target can never
# reach a model as an input even if a later preprocessing step changes.
train_y = train['Survived']
train_x = train.drop(columns=['Survived'])

## Limpar e transformar os dados

### Definir utilitários para selecionar features por tipo

In [2]:
from pandas import DataFrame
from sklearn.base import BaseEstimator, TransformerMixin


class FilterAttributes(BaseEstimator, TransformerMixin):
    """Transformer that discards a fixed set of columns from a DataFrame.

    The list of columns is assigned during ``fit`` and applied in
    ``transform``; columns absent from the input are skipped silently.
    """

    def __init__(self):
        # Filled in by ``fit``; ``None`` until the transformer has been fitted.
        self.columns_to_remove = None

    def fit(self, x: DataFrame, y: DataFrame = None):
        """Store the names of the columns to discard and return ``self``.

        Neither ``x`` nor ``y`` is inspected: the list is hard-coded.
        """
        self.columns_to_remove = [
            'PassengerId',
            'Cabin',
            'Ticket',
            'Name',
            'Survived',
        ]
        return self

    def transform(self, x: DataFrame, y: DataFrame = None):
        """Return a new frame without the stored columns."""
        unwanted = self.columns_to_remove
        # errors='ignore' lets the same step run on frames that lack some of
        # the listed columns.
        return x.drop(columns=unwanted, errors='ignore')


In [3]:
from abc import ABC, abstractmethod


class SelectAttributes(BaseEstimator, TransformerMixin, ABC):
    """Base transformer that keeps only the columns matching one dtype selector.

    The constructor is abstract, so concrete subclasses must provide their own
    ``__init__`` and pass the selector they want to this one.
    """

    @abstractmethod
    def __init__(self, attr_type: str):
        # Selector forwarded to ``DataFrame.select_dtypes(include=...)`` in fit.
        self.selected_type = attr_type
        # Column labels found by ``fit``; stays ``None`` until then.
        self.selected_columns = None

    def fit(self, x: DataFrame, y: DataFrame = None):
        """Remember which columns of ``x`` match the configured selector."""
        matching = x.select_dtypes(include=self.selected_type)
        self.selected_columns = matching.columns
        return self

    def transform(self, x: DataFrame, y: DataFrame = None):
        """Return only the columns remembered by ``fit``."""
        columns = self.selected_columns
        return x[columns]

In [4]:
class NumericAttributes(SelectAttributes):
    """Column selector configured with the ``'number'`` dtype selector."""

    def __init__(self):
        super().__init__(attr_type='number')


class CategoricalAttributes(SelectAttributes):
    """Column selector configured with the ``'object'`` dtype selector."""

    def __init__(self):
        super().__init__(attr_type='object')

In [5]:
from scipy.sparse import spmatrix


class DenseTransformer(TransformerMixin):
    """Pipeline step that turns a (possibly sparse) matrix into a dense ndarray."""

    def fit(self, x, y=None, **fit_params):
        """Stateless step: nothing is learned, ``self`` is returned as is."""
        return self

    def transform(self, x: spmatrix, y=None, **fit_params):
        """Return ``x`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()`` because the latter
        yields a ``numpy.matrix``, a type NumPy no longer recommends.
        NOTE(review): newer scikit-learn releases are reported to reject
        ``numpy.matrix`` input outright - confirm against the installed version.

        Input that is already dense is passed through ``numpy.asarray``
        rather than failing on a missing ``toarray`` attribute.
        """
        import numpy as np  # local import: this cell does not otherwise need numpy

        if hasattr(x, 'toarray'):
            return x.toarray()
        return np.asarray(x)

### Definir pipelines para extração, imputação e unificação de features

In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion

# Numeric branch: pick numeric columns, fill gaps with the median, standardise.
pipe_extract_number = Pipeline([
    ('attr_number', NumericAttributes()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: pick object columns, fill gaps with the most frequent
# value, then one-hot encode.
pipe_extract_categorical = Pipeline([
    ('attr_categorical', CategoricalAttributes()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': a category that was absent from the fitting
    # data (a CV fold or the test set) is encoded as all zeros instead of
    # raising during transform.
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Run both branches on the same input and concatenate their outputs column-wise.
pipe_features_unify = FeatureUnion([
    ('join_number', pipe_extract_number),
    ('join_categorical', pipe_extract_categorical),
])

# Full preprocessing: drop unused columns first, then build the feature matrix.
pipe_preprocessing = Pipeline([
    ('selected_attributes', FilterAttributes()),
    ('join_features', pipe_features_unify)
])

In [7]:
from sklearn.tree import DecisionTreeClassifier

def build_final_pipe(classifier):
    """Assemble preprocessing, densification and ``classifier`` into one Pipeline.

    Each call gets its own unfitted copy of ``pipe_preprocessing``. Without
    the copy, every pipeline built here would share one preprocessing object,
    and fitting any of them would silently refit the step used by the others.

    :param classifier: estimator placed in the final ``'classifier'`` step.
    :return: a new, unfitted ``Pipeline``.
    """
    from sklearn.base import clone

    return Pipeline([
        ('preprocessing', clone(pipe_preprocessing)),
        ('to_dense', DenseTransformer()),
        ('classifier', classifier),
    ])

In [8]:
import warnings

warnings.filterwarnings('ignore')

### Descobrir melhor combinação de parâmetros para o classificador árvore de decisão

In [9]:

from sklearn.model_selection import GridSearchCV

# Best: ({'max_depth': 7, 'max_leaf_nodes': 16, 'min_samples_leaf': 6}, 0.8305442219571905)
# Keys must start with the pipeline step name ('classifier'); with any other
# prefix GridSearchCV cannot route the values to the decision tree.
grid_params = {
    'classifier__max_depth': [None] + list(range(1, 20, 2)),
    # min_samples_leaf has no "unlimited" setting, so None is not a candidate.
    'classifier__min_samples_leaf': list(range(1, 50, 5)),
    # A tree needs at least two leaves, so the original first value (1) is
    # dropped; the remaining values keep the original spacing (6, 11, 16, ...).
    'classifier__max_leaf_nodes': [None] + list(range(6, 50, 5)),
}

# Fixed random_state so repeated searches rank the candidates identically.
pipe_tree = build_final_pipe(DecisionTreeClassifier(random_state=seed))

# 12 parallel jobs (the machine's thread count) to speed up the hundreds of fits
grid_model = GridSearchCV(pipe_tree, param_grid=grid_params, n_jobs=12)
# The search is disabled because it is slow; the best result is recorded above.
# Re-enabling it needs numpy.mean and cross_val_score to be imported first, and
# the mean is stored under its own name so the function is not overwritten.
# grid_model.fit(train_x, train_y)
# nested_cv_mean = mean(cross_val_score(grid_model, train_x, train_y))
# nested_cv_mean, grid_model.best_params_, grid_model.best_score_

### Descobrir melhor combinação de parâmetros para o classificador KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Best: ({'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 11, 'p': 1, 'weights': 'uniform'}, 0.815956311593748)
grid_params = {
    'classifier__n_neighbors': list(range(1, 30, 2)),
    # Minkowski power: 1 is Manhattan distance, 2 is Euclidean distance.
    # NOTE(review): the original grid also tried None, which is not a usable
    # power for this metric - confirm against the installed scikit-learn.
    'classifier__p': [1, 2],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    # 'precomputed' was removed: it expects a square distance matrix as input,
    # while this pipeline feeds a feature matrix, so those fits could only fail.
    'classifier__metric': ['minkowski'],
}

pipe_knn = build_final_pipe(KNeighborsClassifier())

# 12 parallel jobs (the machine's thread count) to speed up the hundreds of fits
grid_model = GridSearchCV(pipe_knn, param_grid=grid_params, n_jobs=12)
# The search is disabled because it is slow; the best result is recorded above.
# Re-enabling it needs numpy.mean and cross_val_score to be imported first, and
# the mean is stored under its own name so the function is not overwritten.
# grid_model.fit(train_x, train_y)
# nested_cv_mean = mean(cross_val_score(grid_model, train_x, train_y))
# nested_cv_mean, grid_model.best_params_, grid_model.best_score_

### Executar validação cruzada

In [11]:
from numpy import mean
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Voting ensemble built from the best hyper-parameters found by the grid searches.
voting_classifier = VotingClassifier([
    ('knn', KNeighborsClassifier(algorithm='auto', n_neighbors=11, p=1, weights='uniform')),
    ('naive_bayes', GaussianNB()),
    ('decision_tree', DecisionTreeClassifier(max_depth=7, max_leaf_nodes=16, min_samples_leaf=6, random_state=seed))
])
final_pipe = build_final_pipe(voting_classifier)
# Stored under its own name: assigning the result to `cross_val_score` would
# replace the imported function with a float and break any later call to it.
cv_mean_accuracy = mean(cross_val_score(final_pipe, train_x, train_y))
cv_mean_accuracy

0.8238152030632101

### Executar na base de teste


In [15]:
# Rebuild the ensemble so the submission model is trained from scratch on the
# whole training set.
voting_classifier = VotingClassifier([
    ('knn', KNeighborsClassifier(algorithm='auto', n_neighbors=11, p=1, weights='uniform')),
    ('naive_bayes', GaussianNB()),
    ('decision_tree', DecisionTreeClassifier(max_depth=7, max_leaf_nodes=16, min_samples_leaf=6, random_state=seed))
])
final_pipe = build_final_pipe(voting_classifier)
# fit, not fit_transform: only the fitted model is needed here, and the
# transformed training matrix that fit_transform returns was being thrown away.
final_pipe.fit(train_x, train_y)
submission_prediction = final_pipe.predict(test)

# One row per test passenger: its id and the predicted survival label.
results_dataframe = pandas.DataFrame({'PassengerId': test_ids, 'Survived': submission_prediction})
results_dataframe.to_csv('submission.csv', index=False)