In [198]:
import pandas as pd
# Mount Google Drive before touching anything below /content/drive: on a fresh
# Colab runtime the reads below fail unless the mount has already happened.
# Mounting again when Drive is already attached only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

train = pd.read_csv('/content/drive/MyDrive/Datasets/titanic/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Datasets/titanic/test.csv')

# Features: the columns that `train` shares with `test`.
X = train[list(test.columns)]
# Target: the column(s) present only in `train`. When there is exactly one such
# column it is squeezed to a 1-D Series, the shape scikit-learn estimators
# expect for `y`; with several columns the DataFrame is kept as is.
y = train[train.columns[~train.columns.isin(test.columns)]].squeeze(axis='columns')


In [199]:
from google.colab import drive
# Attach Google Drive at /content/drive; the CSV files read elsewhere in this
# notebook live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [200]:
from sklearn.base import BaseEstimator, TransformerMixin

def extraiPronome(nome):
    """Extract the title from a ``'Surname, Title. Given names'`` string.

    The title is the text between the first comma and the following period,
    with surrounding whitespace removed (``'Braund, Mr. Owen'`` -> ``'Mr'``).

    Parameters
    ----------
    nome : str
        Full name in ``'Surname, Title. Given names'`` form.

    Returns
    -------
    str
        The extracted title. If ``nome`` contains no comma an empty string is
        returned instead of raising ``IndexError``. If the comma-delimited
        part contains no period, that whole part (stripped) is returned.
        Non-string values (for example a missing-value marker) are returned
        unchanged so that downstream imputation can deal with them.
    """
    # Missing values reach this function as non-strings (e.g. float NaN);
    # pass them through rather than failing on `.split`.
    if not isinstance(nome, str):
        return nome
    partes = nome.split(',')
    # A name without a comma has no "title" section to parse.
    if len(partes) < 2:
        return ''
    return partes[1].split('.')[0].strip()

class AtributosDesejados(BaseEstimator, TransformerMixin):
    """Transformer that removes unwanted columns from a DataFrame.

    Parameters
    ----------
    excluirName : bool, default=True
        When true, the ``Name`` column is dropped together with the other
        unwanted columns. When false, ``Name`` is kept and each of its values
        is replaced by the result of ``extraiPronome``.
    """

    def __init__(self, excluirName=True):
        self.excluirName = excluirName

    def fit(self, X, y=None):
        """Build the list of columns to drop; ``X`` and ``y`` are not inspected."""
        descartar = ['PassengerId', 'Ticket', 'Cabin']
        if self.excluirName:
            descartar += ['Name']
        self.colunasIndesejadas = descartar
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the columns chosen in ``fit``.

        If ``Name`` was not dropped, it is rewritten with ``extraiPronome``.
        """
        reduzido = X.drop(columns=self.colunasIndesejadas)
        if 'Name' in self.colunasIndesejadas:
            return reduzido
        reduzido['Name'] = reduzido['Name'].apply(extraiPronome)
        return reduzido


In [201]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the numeric columns of a DataFrame."""

    def fit(self, X, y=None):
        """Record the labels of the columns ``select_dtypes(include='number')`` picks."""
        somente_numericas = X.select_dtypes(include='number')
        self.colunasNumericas = somente_numericas.columns
        return self

    def transform(self, X, y=None):
        """Return the columns recorded in ``fit`` as a NumPy array."""
        selecao = X[self.colunasNumericas]
        return selecao.to_numpy()


In [202]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the object-typed columns of a DataFrame."""

    def fit(self, X, y=None):
        """Record the labels of the columns ``select_dtypes(include='object')`` picks."""
        somente_objetos = X.select_dtypes(include='object')
        self.colunasCategoricas = somente_objetos.columns
        return self

    def transform(self, X, y=None):
        """Return the columns recorded in ``fit`` as a NumPy array."""
        selecao = X[self.colunasCategoricas]
        return selecao.to_numpy()


In [203]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

# Preprocessing: numeric and object-typed columns go through separate
# sub-pipelines, and FeatureUnion joins their outputs side by side.
trataAtributos = Pipeline(steps=[
    ('unecaracteristicas', FeatureUnion(transformer_list=[
        # Numeric branch: select -> fill gaps with the median -> standardize.
        ('pipenum', Pipeline(steps=[
            ('atributos_numericos', AtributosNumericos()),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])),
        # Categorical branch: select -> fill gaps with the mode -> one-hot
        # encode, ignoring categories that were not seen during fit.
        ('pipecat', Pipeline(steps=[
            ('atributos_categoricos', AtributosCategoricos()),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ])),
    ])),
])


In [204]:

# NOTE(review): `modelo` is only created and fitted in later cells of this
# notebook, so on a fresh kernel this cell has to be run after them.
modelo.best_params_

{'atributosDesejados__excluirName': False,
 'classificador__eta': 0.06,
 'classificador__gamma': 0.2,
 'classificador__max_depth': 5,
 'classificador__n_estimators': 70,
 'classificador__reg_lambda': 1}

In [205]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
import numpy as np

# Full pipeline: column selection -> preprocessing -> XGBClassifier.
pipetotal = Pipeline(steps=[
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    ('classificador', XGBClassifier()),
])

# Search space: only `excluirName` varies; every classifier hyper-parameter
# is pinned to a single value.
parametros = {
    'atributosDesejados__excluirName': [True, False],
    'classificador__max_depth': [5],
    'classificador__n_estimators': [70],
    'classificador__reg_lambda': [1],
    'classificador__gamma': [0.2],
    'classificador__eta': [0.06],
}
modelo = GridSearchCV(estimator=pipetotal, param_grid=parametros)

# Nested evaluation: the grid search itself is cross-validated with a
# repeated 20-fold split (3 repeats, fixed seed), using all available cores.
# NOTE(review): GridSearchCV is given no `scoring`, while this outer loop
# reports 'roc_auc_ovo_weighted' - confirm the inner selection metric is intended.
scores = cross_validate(
    estimator=modelo,
    X=X,
    y=y,
    scoring='roc_auc_ovo_weighted',
    cv=RepeatedKFold(n_splits=20, n_repeats=3, random_state=10),
    n_jobs=-1,
)
# Per-fold scores, their mean and their standard deviation.
scores['test_score'], scores['test_score'].mean(), scores['test_score'].std()

(array([0.86298077, 0.84051724, 0.96230159, 0.91666667, 0.88888889,
        0.91563786, 0.87931034, 0.85084034, 0.89915966, 0.89676113,
        0.81719368, 0.86547619, 0.76897321, 0.80991736, 0.87581699,
        0.91190476, 0.87606838, 0.89166667, 0.8125    , 0.97142857,
        0.88492063, 0.82888889, 0.73989899, 0.93849206, 0.83502024,
        0.89888889, 0.88683128, 0.85119048, 0.85666667, 0.86842105,
        0.97777778, 0.85921325, 0.86324786, 0.91666667, 0.97701149,
        0.91735537, 0.81640625, 0.87286325, 0.84161491, 0.88275862,
        0.93686869, 0.856     , 0.90651261, 0.88663968, 0.82444444,
        0.80777778, 0.84670782, 0.88888889, 0.95798319, 0.91777778,
        0.90688259, 0.85379464, 0.77619048, 0.79910714, 0.84151786,
        0.85977011, 0.84597701, 0.8540305 , 0.92094017, 0.85894737]),
 0.8729150612274404,
 0.04984094721669545)

In [None]:
# Fit the grid search on the full training data and predict the test set.
modelo.fit(X, y)
y_pred = modelo.predict(test)

# Build the submission with assign(), which returns a new frame. Writing a
# column into `test[['PassengerId']]` directly is a chained assignment on a
# selection of `test` and triggers pandas' SettingWithCopyWarning.
result = test[['PassengerId']].assign(Survived=y_pred)
result.to_csv('submission.csv', index=False)