In [None]:
import pandas as pd
# Load the training CSV into a DataFrame and display the dtype of each column.
data = pd.read_csv(filepath_or_buffer='datasets/train.csv')
data.dtypes

In [None]:
# Target vector: the 'Survived' column of the loaded data.
y = data['Survived']
y

In [None]:
# Feature frame: every column of `data` except the target column.
X = data.drop(columns='Survived')
X.columns

In [None]:
# Print, for every feature column, its number of distinct values and its dtype.
# Series.nunique(dropna=False) treats all missing values as ONE distinct value.
# Building a Python set from the column instead can count every NaN as a
# separate element (NaN != NaN), inflating the count for columns with gaps.
for column in X.columns:
    print(f"{column:>12}: {X[column].nunique(dropna=False):4} {X[column].dtype}")

In [None]:
# Columns to leave out of the feature frame.
indesejadas = ['PassengerId', 'Name', 'Ticket', 'Cabin']
Xdrop = X.drop(columns=indesejadas)
Xdrop.columns

In [None]:
# Keep only the numeric-dtype columns.
Xnum = Xdrop.select_dtypes(include='number')
Xnum.columns

In [None]:
# Report how many missing values each numeric column contains.
for column in Xnum.columns:
    print(f"{column:>12}: {Xnum[column].isna().sum()}")

In [None]:
from sklearn.impute import SimpleImputer
# Replace missing numeric values with the per-column median.
imputer = SimpleImputer(strategy='median')
imputer.fit(Xnum)
XnumLimpo = imputer.transform(Xnum)
XnumLimpo

In [None]:
# Keep only the object-dtype columns.
Xcat = Xdrop.select_dtypes(include='object')
Xcat.columns

In [None]:
# Report how many missing values each object-dtype column contains.
for column in Xcat.columns:
    print(f"{column:>12}: {Xcat[column].isna().sum()}")

In [None]:
from sklearn.impute import SimpleImputer
# Replace missing categorical values with the most frequent value of each column.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(Xcat)
XcatLimpo = imputer.transform(Xcat)
XcatLimpo

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the imputed categorical values.
encoder = OneHotEncoder()
encoder.fit(XcatLimpo)
XcatHot = encoder.transform(XcatLimpo)
XcatHot

In [None]:
import numpy as np 
# Place the imputed numeric block and the densified one-hot block side by side.
Xtratado = np.hstack((XnumLimpo, XcatHot.toarray()))
Xtratado.shape

In [None]:
import pandas as pd
# Load both CSV files and flag, per train column, whether test has it too.
train = pd.read_csv(filepath_or_buffer='datasets/train.csv')
test = pd.read_csv(filepath_or_buffer='datasets/test.csv')
train.columns.isin(test.columns)

In [None]:
# Columns of `train` that do not appear in `test` (negated membership mask).
train.columns[~train.columns.isin(test.columns)]

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosDesejados(BaseEstimator, TransformerMixin):
    """Transformer that removes a configurable set of columns from a DataFrame.

    Parameters
    ----------
    colunasIndesejadas : sequence of column labels, optional
        Columns removed by :meth:`transform`. Defaults to
        ``('PassengerId', 'Name', 'Ticket', 'Cabin')``, the set that used to be
        hard-coded inside ``fit``.
    """

    def __init__(self, colunasIndesejadas=('PassengerId', 'Name', 'Ticket', 'Cabin')):
        # Stored untouched, with an immutable default, so the value is a regular
        # constructor parameter and is available even before fit() is called.
        self.colunasIndesejadas = colunasIndesejadas

    def fit(self, X, y=None):
        """Nothing is learned from ``X`` or ``y``; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X.drop`` applied to the configured columns.

        ``DataFrame.drop`` raises ``KeyError`` when a listed column is absent.
        """
        # list(...) so that a tuple is read as several labels, not as one label.
        return X.drop(columns=list(self.colunasIndesejadas))

# Apply the column-dropping transformer to the feature frame.
atributosDesejados = AtributosDesejados()
Xdrop = atributosDesejados.fit(X).transform(X)
Xdrop.columns

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns found to be numeric at fit time."""

    def fit(self, X, y=None):
        """Record the labels of the numeric-dtype columns of ``X``; ``y`` is unused."""
        numericas = X.select_dtypes(include='number')
        self.colunasNumericas = numericas.columns
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to the columns recorded by ``fit``."""
        selecionadas = X[self.colunasNumericas]
        return selecionadas

# Select the numeric columns of the reduced feature frame.
atributosNumericos = AtributosNumericos()
Xnum = atributosNumericos.fit(Xdrop).transform(Xdrop)
Xnum.columns

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: select numeric columns, impute with the median, standardise.
pipenum = Pipeline(
    steps=[
        ('atributos_numericos', AtributosNumericos()),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

XnumLimpo = pipenum.fit_transform(Xnum)
XnumLimpo

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns found to be object-dtype at fit time."""

    def fit(self, X, y=None):
        """Record the labels of the object-dtype columns of ``X``; ``y`` is unused."""
        categoricas = X.select_dtypes(include='object')
        self.colunasCategoricas = categoricas.columns
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to the columns recorded by ``fit``."""
        selecionadas = X[self.colunasCategoricas]
        return selecionadas

# Select the object-dtype columns of the reduced feature frame.
atributosCategoricos = AtributosCategoricos()
Xcat = atributosCategoricos.fit(Xdrop).transform(Xdrop)
Xcat.columns

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Categorical branch: select object columns, impute with the most frequent
# value, then one-hot encode.
pipecat = Pipeline([
    ('atributos_categoricos', AtributosCategoricos()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': a category that was not present when the encoder
    # was fitted (e.g. in another cross-validation fold or in a separate file)
    # is encoded as all zeros instead of making transform() raise.
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

XcatLimpo = pipecat.fit_transform(Xcat)
XcatLimpo

In [None]:
from sklearn.pipeline import FeatureUnion

# Run the numeric and categorical branches on the same input and join their outputs.
uneCaracteristicas = FeatureUnion(
    transformer_list=[
        ('pipenum', pipenum),
        ('pipecat', pipecat),
    ]
)

Xtratado = uneCaracteristicas.fit_transform(Xdrop)
Xtratado

In [None]:
from sklearn.pipeline import Pipeline

# Full preprocessing: drop the unwanted columns, then apply the feature union.
preproc = Pipeline(
    steps=[
        ('atributos_desejados', AtributosDesejados()),
        ('unir_caracteristicas', uneCaracteristicas),
    ]
)

Xtratado = preproc.fit_transform(X)
Xtratado

In [None]:
from sklearn.tree import DecisionTreeClassifier

# End-to-end model: preprocessing followed by a decision tree.
pipetotal = Pipeline([
    ('preproc', preproc),
    # Fixed seed so the fitted tree, and every score computed from it below,
    # is the same on each run of the notebook.
    ('arvore', DecisionTreeClassifier(random_state=42))
])

In [None]:
from sklearn.metrics import accuracy_score

# Fit on X, y and predict the very same rows: the value shown is accuracy on
# the training data, not on held-out data.
ypred = pipetotal.fit(X, y).predict(X)
accuracy_score(y, ypred)

In [None]:
from sklearn.model_selection import cross_validate
import numpy as np

# Cross-validate the whole pipeline and show the raw results plus the mean test score.
scores = cross_validate(pipetotal, X, y)
scores, scores['test_score'].mean()

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the 'arvore' step of the pipeline.
parametros = {
    'arvore__max_depth': [None, *range(1, 20, 2)],
    'arvore__criterion': ['gini', 'entropy'],
}

modelo = GridSearchCV(estimator=pipetotal, param_grid=parametros)

# The grid search itself is the estimator handed to cross_validate, so the
# parameter search is repeated inside each outer fold.
scores = cross_validate(modelo, X, y)
scores, scores['test_score'].mean()