In [50]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [31]:
df = pd.read_csv('data/train.csv')

In [32]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Tratando os Dados

In [34]:
# Preenchendo idade com a mediana e o porto de embarcação com a moda
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


# Feature Engineering

In [35]:
# Extrai o título do passageiro
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 
                                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
df['Title'] = df['Title'].replace('Mme', 'Mrs')

In [36]:
# Converte sexo para numérico
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)

In [37]:
# Criando uma feature 'FamilySize' que indica o tamanho da familia
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

In [38]:
# Cria variável IsAlone que verifica quem estava sozinho
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

In [39]:
# Converte embacação para numérico
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

In [40]:
# Faixa de tarifa
df['FareBand'] = pd.qcut(df['Fare'], 4, labels=[0, 1, 2, 3])

# Faixa de idade
df['AgeBand'] = pd.cut(df['Age'], 5, labels=[0, 1, 2, 3, 4])

In [41]:
# Codifica o título
le = LabelEncoder()
df['Title'] = le.fit_transform(df['Title'])

In [42]:
features = ['Pclass', 'Sex', 'AgeBand', 'FareBand', 'Embarked', 'FamilySize', 'IsAlone', 'Title']
X = df[features]
y = df['Survived']

In [43]:
X.head()

Unnamed: 0,Pclass,Sex,AgeBand,FareBand,Embarked,FamilySize,IsAlone,Title
0,3,0,1,0,0,2,0,2
1,1,1,2,3,1,2,0,3
2,3,1,1,1,0,1,1,1
3,1,1,2,3,0,2,0,3
4,3,0,2,1,0,1,1,2


# Separando treino e teste

In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


# Treinando uma Random Forest

In [45]:
# treinando uma random forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
# previsões
y_pred = model.predict(X_test)

In [47]:
# avaliando o modelo
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Acurácia: {acc:.4f}")
print("Matriz de Confusão:")
print(cm)
print("\nRelatório de Classificação:")
print(report)

Acurácia: 0.7985
Matriz de Confusão:
[[140  25]
 [ 29  74]]

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       165
           1       0.75      0.72      0.73       103

    accuracy                           0.80       268
   macro avg       0.79      0.78      0.79       268
weighted avg       0.80      0.80      0.80       268



# Refatorando o modelo para logar

In [48]:
# Vamos Refatorar o código para que todo o feature pipeline esteja no pickle do modelo

In [64]:
# Criando transformador customizado para extrair o título
class TitleExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        titles = X['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
        titles = titles.replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                                 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        titles = titles.replace(['Mlle', 'Ms'], 'Miss')
        titles = titles.replace('Mme', 'Mrs')
        return titles.to_frame()

In [56]:
# Criando transformador para features de família
class FamilyFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        family_size = X['SibSp'] + X['Parch'] + 1
        is_alone = (family_size == 1).astype(int)
        return pd.DataFrame({'FamilySize': family_size, 'IsAlone': is_alone})

In [65]:
# Criando título
title_pipeline = Pipeline([
    ('extract_title', TitleExtractor()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [66]:
# Criando FamilySize e IsAlone
family_pipeline = Pipeline([
    ('family', FamilyFeatures())
])

In [67]:
# Pipeline para colunas categóricas simples
categorical_cols = ['Sex', 'Embarked']
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [68]:
# Pipeline para colunas numéricas
numeric_cols = ['Fare', 'Age']
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('discretizer', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile'))
])

In [69]:
# Juntando tudo com ColumnTransformer
preprocessor = ColumnTransformer([
    ('title', title_pipeline, ['Name']),
    ('family', family_pipeline, ['SibSp', 'Parch']),
    ('cat', categorical_pipeline, categorical_cols),
    ('num', numeric_pipeline, numeric_cols),
    ('pclass', 'passthrough', ['Pclass'])  # já é numérica
])

In [70]:
# Pipeline completo com modelo
n_estimators = 100
max_depth = 5
model_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42))
])

# Avaliando Modelo Refatorado

In [93]:
# Dividir treino e validação
df = pd.read_csv('data/train.csv')
y = df['Survived']
X = df.drop(columns=['Survived', 'PassengerId', 'Ticket', 'Cabin'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Treinar modelo
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)



In [95]:
# avaliando o modelo
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Acurácia: {acc:.4f}")
print("Matriz de Confusão:")
print(cm)
print("\nRelatório de Classificação:")
print(report)

Acurácia: 0.8246
Matriz de Confusão:
[[142  15]
 [ 32  79]]

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       157
           1       0.84      0.71      0.77       111

    accuracy                           0.82       268
   macro avg       0.83      0.81      0.81       268
weighted avg       0.83      0.82      0.82       268



# Logando o modelo no Mlflow

In [96]:
# PS: Vou expor as envs aqui por facilidade!
import os
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:9000"
os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin"

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("TitanicPipeline")

<Experiment: artifact_location='s3://mlflow/5', creation_time=1754418132215, experiment_id='5', last_update_time=1754418132215, lifecycle_stage='active', name='TitanicPipeline', tags={}>

In [97]:
with mlflow.start_run():

    # Treinar pipeline
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    # Avaliar
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)

    # Logar parâmetros
    mlflow.log_param("model_type", "RandomForestPipeline")
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)

    # Logar métricas
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", report["weighted avg"]["precision"])
    mlflow.log_metric("recall", report["weighted avg"]["recall"])
    mlflow.log_metric("f1_score", report["weighted avg"]["f1-score"])

    # Logar pipeline completo
    
    
    signature = infer_signature(X_test, y_pred)
    
    mlflow.sklearn.log_model(
        sk_model=model_pipeline,
        name="titanic_model",
        signature=signature
    )

    print("Pipeline completo logado no MLflow com sucesso!")



Pipeline completo logado no MLflow com sucesso!
🏃 View run bright-hen-772 at: http://localhost:5000/#/experiments/5/runs/65ebb038de7f4b158dc1751f223d208d
🧪 View experiment at: http://localhost:5000/#/experiments/5


In [99]:
X_train.iloc[1].to_json()

'{"Pclass":3,"Name":"Mitkoff, Mr. Mito","Sex":"male","Age":null,"SibSp":0,"Parch":0,"Fare":7.8958,"Embarked":"S"}'

In [98]:
y_train

445    1
650    0
172    1
450    0
314    0
      ..
106    1
270    0
860    0
435    1
102    0
Name: Survived, Length: 623, dtype: int64