In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("TitanicExperiment2")

<Experiment: artifact_location='/mlflow/mlruns/525494945829811883', creation_time=1754363404292, experiment_id='525494945829811883', last_update_time=1754363404292, lifecycle_stage='active', name='TitanicExperiment2', tags={}>

In [3]:
df = pd.read_csv('data/train.csv')

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Tratando os Dados

In [6]:
df.drop(columns=['Cabin', 'Ticket', 'Name', 'PassengerId'], inplace=True)

In [7]:
# Preenchendo idade com a mediana e o porto de embarcação com a moda
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


# Feature Engineering

In [8]:
# Criando uma feature 'FamilySize' que indica o tamanho da familia
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

In [9]:
# Codificando colunas categóricas
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])         # male=1, female=0
df['Embarked'] = le.fit_transform(df['Embarked'])  # C=0, Q=1, S=2

In [10]:
# Normalizando as features
scaler = StandardScaler()
X = df.drop(columns=['Survived', 'SibSp', 'Parch'])

In [11]:
X[X.columns]= scaler.fit_transform(X)

In [12]:
X.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,0.827377,0.737695,-0.565736,-0.502445,0.585954,0.05916
1,-1.566107,-1.355574,0.663861,0.786845,-1.942303,0.05916
2,0.827377,-1.355574,-0.258337,-0.488854,0.585954,-0.560975
3,-1.566107,-1.355574,0.433312,0.42073,0.585954,0.05916
4,0.827377,0.737695,0.433312,-0.486337,0.585954,-0.560975


In [13]:
y = df['Survived']

# Separando treino e teste

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [15]:
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

In [16]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 748 to 136
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      623 non-null    float64
 1   Sex         623 non-null    float64
 2   Age         623 non-null    float64
 3   Fare        623 non-null    float64
 4   Embarked    623 non-null    float64
 5   FamilySize  623 non-null    float64
 6   Survived    623 non-null    int64  
dtypes: float64(6), int64(1)
memory usage: 38.9 KB


In [17]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 268 entries, 625 to 387
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      268 non-null    float64
 1   Sex         268 non-null    float64
 2   Age         268 non-null    float64
 3   Fare        268 non-null    float64
 4   Embarked    268 non-null    float64
 5   FamilySize  268 non-null    float64
 6   Survived    268 non-null    int64  
dtypes: float64(6), int64(1)
memory usage: 16.8 KB


# Treinando uma Random Forest

In [18]:
# treinando uma random forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
# previsões
y_pred = model.predict(X_test)

In [20]:
# avaliando o modelo
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Acurácia: {acc:.4f}")
print("Matriz de Confusão:")
print(cm)
print("\nRelatório de Classificação:")
print(report)

Acurácia: 0.7985
Matriz de Confusão:
[[145  20]
 [ 34  69]]

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       165
           1       0.78      0.67      0.72       103

    accuracy                           0.80       268
   macro avg       0.79      0.77      0.78       268
weighted avg       0.80      0.80      0.80       268



# Treinando uma Regressão Logística

In [21]:
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [22]:
y_pred_log = logreg.predict(X_test)

In [23]:
acc = accuracy_score(y_test, y_pred_log)
cm = confusion_matrix(y_test, y_pred_log)
report = classification_report(y_test, y_pred_log)

print(f"Acurácia: {acc:.4f}")
print("Matriz de Confusão:")
print(cm)
print("\nRelatório de Classificação:")
print(report)

Acurácia: 0.7985
Matriz de Confusão:
[[140  25]
 [ 29  74]]

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       165
           1       0.75      0.72      0.73       103

    accuracy                           0.80       268
   macro avg       0.79      0.78      0.79       268
weighted avg       0.80      0.80      0.80       268



# Logando o modelo

In [24]:
# Vamos logar a Random Forest

In [25]:
# Pipeline: scaler + modelo
n_estimators = 100
max_depth = 5

pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    ))
])

In [26]:
with mlflow.start_run():

    # Treinar pipeline
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Avaliar
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)

    # Logar parâmetros
    mlflow.log_param("model_type", "RandomForestPipeline")
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)

    # Logar métricas
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", report["weighted avg"]["precision"])
    mlflow.log_metric("recall", report["weighted avg"]["recall"])
    mlflow.log_metric("f1_score", report["weighted avg"]["f1-score"])

    # Logar pipeline completo
    
    
    signature = infer_signature(X_test, y_pred)
    
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        name="titanic_model",
        signature=signature
    )

    print("Pipeline completo logado no MLflow com sucesso!")

🏃 View run unequaled-fowl-168 at: http://localhost:5000/#/experiments/525494945829811883/runs/2914cb9baf7b404c9fc7392b9705f9a0
🧪 View experiment at: http://localhost:5000/#/experiments/525494945829811883


PermissionError: [Errno 13] Permission denied: '/mlflow'

In [2]:
import mlflow
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("test_minio")

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

model = LogisticRegression()
model.fit(X_train, y_train)

with mlflow.start_run():
    mlflow.sklearn.log_model(model, name="iris_model")
    print("Modelo logado com sucesso!")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


🏃 View run awesome-calf-823 at: http://localhost:5000/#/experiments/902448251954090682/runs/3806ac79dbae47a8974312b0506d72da
🧪 View experiment at: http://localhost:5000/#/experiments/902448251954090682


PermissionError: [Errno 13] Permission denied: '/mlflow'