# Preprocess

In [39]:
import os
from datetime import datetime
from os import path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

##### Funções

In [40]:
def get_dataset(in_path):
    """
    Load a passenger CSV and append two engineered indicator columns.

    Nothing is written to disk; the enriched frame is only returned.

    Parameters
    ----------
    in_path : str or file-like
        Location of the CSV; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data plus two integer columns:

        - ``Alone``: 1 when both ``SibSp`` and ``Parch`` equal 0, else 0.
        - ``Baby``: 1 when ``Age`` is less than or equal to 6, else 0
          (a missing ``Age`` compares as False and therefore yields 0).
    """
    passengers = pd.read_csv(in_path)

    no_siblings_or_spouse = passengers["SibSp"].eq(0)
    no_parents_or_children = passengers["Parch"].eq(0)
    aged_six_or_less = passengers["Age"].le(6)

    return passengers.assign(
        Alone=(no_siblings_or_spouse & no_parents_or_children).astype(int),
        Baby=aged_six_or_less.astype(int),
    )

##### Variáveis e Constantes

In [41]:
# File names of the two raw inputs.
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"

# Folder layout, relative to the notebook's working directory.
DATA_FOLDER = "data"
RAW_FOLDER = "raw"
PROCESSED_FOLDER = "processed"
OUTPUT_FOLDER = "output"

# Both raw inputs live under DATA_FOLDER/RAW_FOLDER.
input_train, input_test = (
    path.join(DATA_FOLDER, RAW_FOLDER, csv_name)
    for csv_name in (TRAIN_FILE, TEST_FILE)
)

# Preprocessing: every group of columns is routed to its own transformer.
categorical_features = ["Pclass", "Sex"]
numeric_features = ["Age", "Fare"]
num_cat_features = ["Alone", "Baby"]

# Categorical columns: fill gaps with the literal "missing", then one-hot encode.
# handle_unknown="ignore" lets transform cope with categories unseen during fit.
categorical_transformer = Pipeline(
    steps=[
        ("imputer_cat", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numerical columns: median imputation followed by scaling.
# We use MinMaxScaler because of our data distribution.
numeric_transformer = Pipeline(
    steps=[
        ("imputer_num", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

# Numeric 0/1 indicator columns treated as categories: only impute the mode,
# no scaling or encoding.
num_cat_transformer = Pipeline(
    steps=[("imputer_num_cat", SimpleImputer(strategy="most_frequent"))]
)

# Columns not listed in any group are discarded (remainder="drop").
preprocessor = ColumnTransformer(
    transformers=[
        ("categoricals", categorical_transformer, categorical_features),
        ("numericals", numeric_transformer, numeric_features),
        ("num_to_cat", num_cat_transformer, num_cat_features),
    ],
    remainder="drop",
)

##### Ingestão

In [42]:
# Ingest datasets.
# get_dataset takes only the CSV path; passing an extra label argument raises
# TypeError, so each call supplies the path alone.
train = get_dataset(input_train)
test = get_dataset(input_test)

# Split the training frame into predictors (X) and the target column (y).
X = train.drop(columns="Survived")
y = train["Survived"]

##### Regressão Logística

In [43]:
# Variables and parameters

# Cross-validation splitter reused by every grid search in this notebook:
# 5 stratified folds, repeated 10 times, with a fixed seed.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)

# Preprocessing followed by the logistic-regression classifier.
lr_pipeline = Pipeline(
    steps=[("preprocessing", preprocessor), ("lr", LogisticRegression())]
)

# Grid over penalty type and regularisation strength; the solver and the seed
# are single-valued entries, so they are fixed for every candidate.
lr_params = {
    "lr__penalty": ["l1", "l2"],
    "lr__C": [0.01, 0.1, 1, 10, 100],
    "lr__random_state": [42],
    "lr__solver": ["liblinear"],
}

# Both metrics are recorded; the candidate with the best F1 is the one refit.
lr_cv = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_params,
    scoring=["f1", "accuracy"],
    refit="f1",
    cv=rskf,
    n_jobs=-1,
)

In [44]:
# Run the grid search, then show the best cross-validated F1 and its parameters.
lr_cv.fit(X, y)
print(f"Best F1-score: {lr_cv.best_score_:.3f}\n")
print(f"Best parameter set: {lr_cv.best_params_}\n")

# In-sample report: predictions are made on the same rows used for fitting.
lr_train_report = classification_report(y, lr_cv.predict(X))
print(f"Scores: {lr_train_report}")

Best F1-score: 0.726

Best parameter set: {'lr__C': 1, 'lr__penalty': 'l1', 'lr__random_state': 42, 'lr__solver': 'liblinear'}

Scores:               precision    recall  f1-score   support

           0       0.83      0.87      0.85       549
           1       0.77      0.71      0.74       342

    accuracy                           0.81       891
   macro avg       0.80      0.79      0.79       891
weighted avg       0.81      0.81      0.81       891



##### Random Forest

In [45]:
# Variables and parameters

# Preprocessing followed by the random-forest classifier.
rf_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("rf", RandomForestClassifier()),
    ]
)

# Float values for min_samples_leaf / min_samples_split are fractions of the
# number of samples; the seed is a single-valued entry, so it is fixed.
rf_params = {
    "rf__n_estimators": [100, 120, 150],
    "rf__criterion": ["entropy", "gini"],
    "rf__max_depth": [4, 5, 6],
    "rf__min_samples_leaf": [0.05, 0.1, 0.2],
    "rf__min_samples_split": [0.05, 0.1, 0.2],
    "rf__random_state": [42],
}

# Same splitter and scoring as the other searches; the best-F1 candidate is refit.
rf_cv = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_params,
    scoring=["f1", "accuracy"],
    refit="f1",
    cv=rskf,
    n_jobs=-1,
)

In [46]:
# Run the grid search, then show the best cross-validated F1 and its parameters.
rf_cv.fit(X, y)
print(f"Best F1-score: {rf_cv.best_score_:.3f}\n")
print(f"Best parameter set: {rf_cv.best_params_}\n")

# In-sample report: predictions are made on the same rows used for fitting.
rf_train_report = classification_report(y, rf_cv.predict(X))
print(f"Scores: {rf_train_report}")

Best F1-score: 0.710

Best parameter set: {'rf__criterion': 'entropy', 'rf__max_depth': 4, 'rf__min_samples_leaf': 0.2, 'rf__min_samples_split': 0.05, 'rf__n_estimators': 120, 'rf__random_state': 42}

Scores:               precision    recall  f1-score   support

           0       0.81      0.85      0.83       549
           1       0.74      0.68      0.71       342

    accuracy                           0.79       891
   macro avg       0.78      0.77      0.77       891
weighted avg       0.78      0.79      0.78       891



##### AdaBoost

In [47]:
# Variables and parameters

# AdaBoost re-seeds each base estimator from its own random_state, so the
# ensemble itself is seeded; otherwise the random_state=42 given to the base
# estimators below is overridden and the search is not reproducible.
ada_pipeline = Pipeline(
    [("preprocessing", preprocessor), ("ada", AdaBoostClassifier(random_state=42))]
)

# Candidate base estimators reuse hyper-parameter values taken from the
# logistic-regression and random-forest grids explored above.
ada_params = {
    "ada__estimator": [
        LogisticRegression(penalty="l2", C=1, random_state=42, solver="liblinear"),
        DecisionTreeClassifier(
            criterion="gini",
            max_depth=5,
            min_samples_leaf=0.05,
            min_samples_split=0.2,
            random_state=42,
        ),
    ],
    "ada__n_estimators": [30, 40, 50, 70, 120],
    "ada__learning_rate": [1.0, 0.5, 0.3, 0.1],
}

# Same splitter and scoring as the other searches; the best-F1 candidate is refit.
ada_cv = GridSearchCV(
    ada_pipeline,
    ada_params,
    cv=rskf,
    scoring=["f1", "accuracy"],
    refit="f1",
    n_jobs=-1,
)

In [48]:
# Run the grid search, then show the best cross-validated F1 and its parameters.
ada_cv.fit(X, y)
print(f"Best F1-score: {ada_cv.best_score_:.3f}\n")
print(f"Best parameter set: {ada_cv.best_params_}\n")

# In-sample report: predictions are made on the same rows used for fitting.
ada_train_report = classification_report(y, ada_cv.predict(X))
print(f"Scores: {ada_train_report}")

Best F1-score: 0.771

Best parameter set: {'ada__estimator': DecisionTreeClassifier(max_depth=5, min_samples_leaf=0.05,
                       min_samples_split=0.2, random_state=42), 'ada__learning_rate': 0.3, 'ada__n_estimators': 120}

Scores:               precision    recall  f1-score   support

           0       0.89      0.94      0.92       549
           1       0.90      0.82      0.86       342

    accuracy                           0.90       891
   macro avg       0.90      0.88      0.89       891
weighted avg       0.90      0.90      0.89       891



##### Envio e considerações

O melhor modelo treinado foi o AdaBoost usando DecisionTree como modelo base, com parâmetros escolhidos a partir dos valores explorados anteriormente no treino da RF. Note que as métricas acima foram calculadas apenas sobre o dataset de treino; o teste real será feito posteriormente com os envios para o Kaggle.

Enviaremos as predições de todos os modelos treinados para então comparar seus resultados de teste.

Devido ao tempo relativamente curto, não foram testados outros modelos nem mais hiperparâmetros. Além disso, poderia ser feito um melhor desenvolvimento na parte de Feature Engineering. Também poderia ser usado o MLflow para auxiliar nas comparações e no armazenamento de informações.

In [53]:
# Submission frame: passenger ids alongside the random-forest search's predictions.
output = test[["PassengerId"]].assign(Survived=rf_cv.predict(test))
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [54]:
# Timestamp (one-second resolution) in the file name so that separate runs
# produce distinct submission files.
output_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
output_id = "submission_rf_cvs_lp_" + output_time + ".csv"
output_dir = path.join(DATA_FOLDER, OUTPUT_FOLDER)
output_path = path.join(output_dir, output_id)

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(output_dir, exist_ok=True)

# index=False already omits the index column, so no index_label is needed.
output.to_csv(output_path, sep=",", index=False)

##### Resultados

Cada um dos modelos foi enviado, com scores muito próximos, entre 0.76 e 0.77. O AdaBoost foi melhor no treino, mas teve desempenho semelhante ao dos outros modelos no teste.