### Imports usados no projeto

In [None]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

### Coleta e tratamento dos dados

In [None]:
# Load the training CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer='database/train.csv')

In [None]:
# Split the frame into the target column (y) and every other column (X).
y = data['Transported']
X = data.drop(columns=['Transported'])

In [None]:
# Read the train and test CSV files, in that order, into two DataFrames.
train, test = map(pd.read_csv, ('database/train.csv', 'database/test.csv'))

In [None]:
class AtributosDesejados(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame."""

    def fit(self, X, y=None):
        """Record the column names to discard; X and y are not inspected."""
        self.colunasIndesejadas = ['PassengerId', 'Name', 'Cabin']
        return self

    def transform(self, X, y=None):
        """Return X without the columns recorded by ``fit``."""
        return X.drop(columns=self.colunasIndesejadas)

In [None]:
class AtributosNumericos(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the numeric columns seen during ``fit``."""

    def fit(self, X, y=None):
        """Remember the names of the numeric-dtype columns of X."""
        somente_numericas = X.select_dtypes(include='number')
        self.colunasNumericas = somente_numericas.columns
        return self

    def transform(self, X, y=None):
        """Return the columns of X whose names were remembered by ``fit``."""
        return X.loc[:, self.colunasNumericas]

In [None]:
# Numeric branch: select numeric columns, fill missing values with the
# median, then standardize.
pipenum = Pipeline(steps=[
    ('atributos_numericos', AtributosNumericos()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [None]:
class AtributosCategoricos(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the object-dtype columns seen during ``fit``."""

    def fit(self, X, y=None):
        """Remember the names of the object-dtype columns of X."""
        somente_categoricas = X.select_dtypes(include='object')
        self.colunasCategoricas = somente_categoricas.columns
        return self

    def transform(self, X, y=None):
        """Return the columns of X whose names were remembered by ``fit``."""
        return X.loc[:, self.colunasCategoricas]

In [None]:
# Categorical branch: select object-dtype columns, fill missing values with
# the most frequent value, then one-hot encode.
pipecat = Pipeline([
    ('atributos_categoricos', AtributosCategoricos()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' encodes a category that was not seen during fit
    # as an all-zero row instead of raising at transform time (which the
    # default 'error' setting would do on a CV fold or on the test set).
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Run the numeric and categorical branches on the same input and
# concatenate their outputs.
uneCaracteristicas = FeatureUnion(transformer_list=[
    ('pipenum', pipenum),
    ('pipecat', pipecat),
])

In [None]:
# Full preprocessing: drop the unwanted columns first, then build the
# combined numeric + categorical feature matrix.
preproc = Pipeline(steps=[
    ('atributos_desejados', AtributosDesejados()),
    ('unir_caracteristicas', uneCaracteristicas),
])

### Usando somente o DecisionTreeClassifier

In [None]:
# End-to-end model: preprocessing followed by a decision tree with default
# settings.
pipetotal = Pipeline(steps=[
    ('preproc', preproc),
    ('arvore', DecisionTreeClassifier()),
])

In [None]:
# Fit on the training data and score on those same rows, so the value below
# is training accuracy rather than a held-out estimate.
ypred = pipetotal.fit(X, y).predict(X)
accuracy_score(y_true=y, y_pred=ypred)

In [None]:
# Cross-validate the pipeline; show the raw results together with the mean
# of the per-fold test scores.
scores = cross_validate(estimator=pipetotal, X=X, y=y)
scores, scores['test_score'].mean()

In [None]:
ypred = pipetotal.predict(test)

# Use the sample submission file as a template and set its 'Transported'
# column to the predictions before writing the result.
submission = pd.read_csv('database/sample_submission.csv').assign(Transported=ypred)
submission.to_csv('results/tree_submission.csv', index=False)

### Usando o DecisionTreeClassifier e GridSearchCV

In [None]:
# Search space: tree depth (None plus the odd values 1..19) and the
# imputation strategy of the numeric branch.
parametros = {
    'arvore__max_depth': [None, *range(1, 20, 2)],
    'preproc__unir_caracteristicas__pipenum__imputer__strategy': ['mean', 'median'],
}

modelo = GridSearchCV(estimator=pipetotal, param_grid=parametros)

# The grid search itself is the estimator being cross-validated here.
scores = cross_validate(estimator=modelo, X=X, y=y)
scores, scores['test_score'].mean()

In [None]:
# Fit the grid search on the training data and evaluate to the best
# pipeline it found.
modelo.fit(X, y).best_estimator_

In [None]:
ypred = modelo.predict(test)

# Use the sample submission file as a template and set its 'Transported'
# column to the predictions before writing the result.
submission = pd.read_csv('database/sample_submission.csv').assign(Transported=ypred)
submission.to_csv('results/tree_gscv_submission.csv', index=False)

### Usando um novo DecisionTreeClassifier e GridSearchCV

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fresh pipeline with a new, unfitted decision tree.
pipetotal = Pipeline(steps=[
    ('preproc', preproc),
    ('arvore', DecisionTreeClassifier()),
])

# Search space: tree depth, minimum samples required to split a node, and
# the imputation strategy of the numeric branch.
parametros = {
    'arvore__max_depth': [None, *range(1, 20, 2)],
    'arvore__min_samples_split': [2, 5, 10],
    'preproc__unir_caracteristicas__pipenum__imputer__strategy': [
        'mean',
        'median',
        'most_frequent',
    ],
}

# The number of CV folds is passed explicitly.
modelo = GridSearchCV(estimator=pipetotal, param_grid=parametros, cv=5)

# Fit the search on the training data, then predict the test set.
ypred = modelo.fit(X, y).predict(test)

# Use the sample submission file as a template and set its 'Transported'
# column to the predictions before writing the result.
submission = pd.read_csv('database/sample_submission.csv').assign(Transported=ypred)
submission.to_csv('results/tree_gscv_new_submission.csv', index=False)

### Usando LogisticRegression e GridSearchCV

In [None]:
# Same preprocessing as before, with logistic regression as the final step.
pipetotal = Pipeline(steps=[
    ('preproc', preproc),
    ('logreg', LogisticRegression()),
])

# Search space: regularization parameter C, solver iteration limit, and the
# imputation strategy of the numeric branch.
parametros = {
    'logreg__C': [0.01, 0.1, 1, 10, 100],
    'logreg__max_iter': [100, 200, 300],
    'preproc__unir_caracteristicas__pipenum__imputer__strategy': [
        'mean',
        'median',
        'most_frequent',
    ],
}

modelo = GridSearchCV(estimator=pipetotal, param_grid=parametros)

# Fit the search on the training data, then predict the test set.
ypred = modelo.fit(X, y).predict(test)

# Use the sample submission file as a template and set its 'Transported'
# column to the predictions before writing the result.
submission = pd.read_csv('database/sample_submission.csv').assign(Transported=ypred)
submission.to_csv('results/logreg_gscv_submission.csv', index=False)

## Resultados

Esses foram os resultados obtidos:

![Resultados](image/image.png)