### Imports usados no projeto

In [1]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

### Coleta e tratamento dos dados

In [2]:
data = pd.read_csv('database/train.csv')
data.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [3]:
X, y = data.drop('Transported', axis=1), data['Transported']

In [28]:
X.shape, y.shape, set(y)

((8693, 13), (8693,), {False, True})

In [5]:
y

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [6]:
X.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')

In [7]:
for column in X.columns:
    print(f"{column:>12}: {len(set(X[column])):4} {X[column].dtype}")

 PassengerId: 8693 object
  HomePlanet:    4 object
   CryoSleep:    3 object
       Cabin: 6561 object
 Destination:    4 object
         Age:  259 float64
         VIP:    3 object
 RoomService: 1454 float64
   FoodCourt: 1690 float64
ShoppingMall: 1323 float64
         Spa: 1510 float64
      VRDeck: 1494 float64
        Name: 8474 object


In [8]:
train = pd.read_csv('database/train.csv')
test = pd.read_csv('database/test.csv')

In [9]:
train.columns[~train.columns.isin(test.columns)]

Index(['Transported'], dtype='object')

In [10]:
class AtributosDesejados(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasIndesejadas = ['PassengerId', 'Name', 'Cabin']
        return self
    def transform(self, X, y=None):
        return X.drop(self.colunasIndesejadas, axis=1)

In [11]:
class AtributosNumericos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasNumericas = X.select_dtypes(include='number').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasNumericas]

In [12]:
pipenum = Pipeline([
    ('atributos_numericos', AtributosNumericos()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [13]:
class AtributosCategoricos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasCategoricas = X.select_dtypes(include='object').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasCategoricas]

In [14]:
pipecat = Pipeline([
    ('atributos_categoricos', AtributosCategoricos()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

In [15]:
uneCaracteristicas = FeatureUnion([
    ('pipenum', pipenum),
    ('pipecat', pipecat)
])

In [16]:
preproc = Pipeline([
    ('atributos_desejados', AtributosDesejados()),
    ('unir_caracteristicas', uneCaracteristicas)
])

### Usando somente o DecisionTreeClassifier

In [17]:
pipetotal = Pipeline([
    ('preproc', preproc),
    ('arvore', DecisionTreeClassifier())
])

In [18]:
pipetotal.fit(X, y)
ypred = pipetotal.predict(X)
accuracy_score(y, ypred)

0.9293684573795008

In [19]:
scores = cross_validate(pipetotal, X, y)
scores, np.mean(scores['test_score'])

({'fit_time': array([0.19258189, 0.21857476, 0.18917775, 0.20647979, 0.2060926 ]),
  'score_time': array([0.00998616, 0.01341081, 0.01099038, 0.01673317, 0.01423454]),
  'test_score': array([0.73548016, 0.74813111, 0.73490512, 0.74453395, 0.73820483])},
 np.float64(0.7402510337872579))

In [20]:
ypred = pipetotal.predict(test)

submission = pd.read_csv('database/sample_submission.csv')
submission['Transported'] = ypred
submission.to_csv('results/tree_submission.csv', index=False)

### Usando o DecisionTreeClassifier e GridSearchCV

In [21]:
parametros = {
    'arvore__max_depth': [None] + list(range(1,20,2)), 
    'preproc__unir_caracteristicas__pipenum__imputer__strategy': ['mean', 'median']
}

modelo = GridSearchCV(pipetotal, param_grid=parametros)

scores = cross_validate(modelo, X, y)
scores, np.mean(scores['test_score'])

({'fit_time': array([10.21145725,  9.90243173,  9.30400944,  9.14928293,  9.42926598]),
  'score_time': array([0.00710559, 0.00795698, 0.0113492 , 0.00760007, 0.0089426 ]),
  'test_score': array([0.77285796, 0.77343301, 0.77975848, 0.78250863, 0.78135788])},
 np.float64(0.777983193388526))

In [22]:
modelo.fit(X,y)
modelo.best_estimator_

In [23]:
ypred = modelo.predict(test)

submission = pd.read_csv('database/sample_submission.csv')
submission['Transported'] = ypred
submission.to_csv('results/tree_gscv_submission.csv', index=False)

### Usando um novo DecisionTreeClassifier e GridSearchCV

In [24]:
from sklearn.tree import DecisionTreeClassifier

pipetotal = Pipeline([
    ('preproc', preproc),
    ('arvore', DecisionTreeClassifier())
])

parametros = {
    'arvore__max_depth': [None] + list(range(1, 20, 2)),
    'arvore__min_samples_split': [2, 5, 10],
    'preproc__unir_caracteristicas__pipenum__imputer__strategy': ['mean', 'median', 'most_frequent']
}

modelo = GridSearchCV(pipetotal, param_grid=parametros, cv=5)

scores = cross_validate(modelo, X, y)
scores, np.mean(scores['test_score'])

({'fit_time': array([41.93894172, 42.57581306, 40.77125144, 41.01903677, 43.60525179]),
  'score_time': array([0.01016903, 0.00997806, 0.00975132, 0.0111599 , 0.01160502]),
  'test_score': array([0.77343301, 0.77285796, 0.77688327, 0.78135788, 0.78135788])},
 np.float64(0.7771780006630532))

In [25]:
modelo.fit(X,y)
ypred = modelo.predict(test)

submission = pd.read_csv('database/sample_submission.csv')
submission['Transported'] = ypred
submission.to_csv('results/tree_gscv_new_submission.csv', index=False)

### Usando LogisticRegression e GridSearchCV

In [26]:
pipetotal = Pipeline([
    ('preproc', preproc),
    ('logreg', LogisticRegression())
])

parametros = {
    'logreg__C': [0.01, 0.1, 1, 10, 100],  
    'logreg__max_iter': [100, 200, 300], 
    'preproc__unir_caracteristicas__pipenum__imputer__strategy': ['mean', 'median', 'most_frequent']
}


modelo = GridSearchCV(pipetotal, param_grid=parametros)  

scores = cross_validate(modelo, X, y)
scores, np.mean(scores['test_score'])

({'fit_time': array([11.0956738 , 10.92000723, 10.85327244, 10.85670376, 11.14686275]),
  'score_time': array([0.00617766, 0.00800657, 0.00696659, 0.00805736, 0.00900531]),
  'test_score': array([0.78550891, 0.78550891, 0.77975848, 0.78193326, 0.79459148])},
 np.float64(0.7854602098609639))

In [27]:
modelo.fit(X,y)

ypred = modelo.predict(test)

submission = pd.read_csv('database/sample_submission.csv')
submission['Transported'] = ypred
submission.to_csv('results/logreg_gscv_submission.csv', index=False)

## Resultados

Esses foram os resultados obtidos

![Resultados](image/image.png)