In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import joblib
import os

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier  # Bônus se quiser testar também
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
bucket_path = 'gs://predictive-maintenance-leopoldo/manutpred.csv'
df_raw = pd.read_csv(bucket_path, storage_options={'token': 'cloud'})

In [4]:
df = df_raw.copy()
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [5]:
drop_columns = ["TWF", "HDF", "PWF", "OSF", "RNF"]
df.drop(drop_columns, axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure
0,1,M14860,M,298.1,308.6,1551,42.8,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0


In [7]:
# divisão das colunas
categorical_columns = ['Type']
numerical_columns = ['Air temperature [K]','Process temperature [K]','Rotational speed [rpm]','Torque [Nm]','Tool wear [min]']
target_column = ['Machine failure']

# colunas que não importam para o modelo
Xdrop_columns = ['UDI', 'Product ID'] 

# Separando features e target
X = df.drop(target_column, axis=1)
X.drop(Xdrop_columns, axis = 1, inplace =True)
y = df[target_column]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
X_train.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
4058,M,302.0,310.9,1456,47.2,54
1221,M,297.0,308.3,1399,46.4,132
6895,M,301.0,311.6,1357,45.6,137
9863,L,298.9,309.8,1411,56.3,84
8711,L,297.1,308.5,1733,28.7,50


In [9]:
y_test.head()

Unnamed: 0,Machine failure
2997,0
4871,0
3858,0
951,0
6463,0


In [10]:
# CATEGORICAL --> criando o encoder
encoder = OneHotEncoder(drop='first', sparse_output=False)

# encoder + df de treino
encoded_type_train = encoder.fit_transform(X_train[categorical_columns])
encoded_train_df = pd.DataFrame(
    encoded_type_train,
    columns=encoder.get_feature_names_out(input_features=categorical_columns),
    index=X_train.index
)

In [11]:
# CATEGORICAL --> encoder + df de teste
encoded_type_test = encoder.transform(X_test[categorical_columns])

encoded_test_df = pd.DataFrame(
    encoded_type_test,
    columns=encoder.get_feature_names_out(input_features=categorical_columns),
    index=X_test.index
)

In [12]:
# NUMERICAL --> Scaler
#scaler = StandardScaler()
scaler = MinMaxScaler()
#scaler = RobustScaler()
X_train_num = scaler.fit_transform(X_train[numerical_columns])
X_test_num = scaler.transform(X_test[numerical_columns])

In [13]:

X_train_processed = pd.DataFrame(
    np.hstack([X_train_num, encoded_train_df]),
    columns=numerical_columns + list(encoder.get_feature_names_out(categorical_columns)),
    index=X_train.index
)

In [14]:
X_test_processed = pd.DataFrame(
    np.hstack([X_test_num, encoded_test_df]),
    columns=numerical_columns + list(encoder.get_feature_names_out(categorical_columns)),
    index=X_test.index
)

In [15]:
X_train_processed.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type_L,Type_M
4058,0.728261,0.641975,0.167637,0.596154,0.213439,0.0,1.0
1221,0.184783,0.320988,0.134459,0.585165,0.521739,0.0,1.0
6895,0.619565,0.728395,0.110012,0.574176,0.541502,0.0,1.0
9863,0.391304,0.506173,0.141444,0.721154,0.332016,1.0,0.0
8711,0.195652,0.345679,0.328871,0.342033,0.197628,1.0,0.0


In [16]:
def clean_column_names(df):
    df.columns = [
        col.replace(' ', '_')
           .replace('[','')
           .replace(']','')
           .replace('(','')
           .replace(')','')
           .replace('/','_per_')  # Caso tenha barras ou outros símbolos no futuro
        for col in df.columns
    ]
    return df

# Aplicando no treino e teste
X_train_processed = clean_column_names(X_train_processed.copy())
X_test_processed = clean_column_names(X_test_processed.copy())

In [17]:
X_test_processed.head()

Unnamed: 0,Air_temperature_K,Process_temperature_K,Rotational_speed_rpm,Torque_Nm,Tool_wear_min,Type_L,Type_M
2997,0.565217,0.506173,0.103027,0.809066,0.604743,1.0,0.0
4871,0.913043,0.82716,0.200815,0.498626,0.533597,1.0,0.0
3858,0.782609,0.703704,0.22759,0.464286,0.826087,1.0,0.0
951,0.032609,0.074074,0.198487,0.43956,0.237154,0.0,0.0
6463,0.565217,0.530864,0.110594,0.777473,0.403162,0.0,0.0


In [18]:
# Aplica SMOTE apenas no conjunto de treino
smote = SMOTE(random_state=42)
X_train_processed, y_train = smote.fit_resample(X_train_processed, y_train)

# (opcional) checar se balanceou
print("Distribuição após SMOTE:")
print(y_train.value_counts())

Distribuição após SMOTE:
Machine failure
0                  7729
1                  7729
Name: count, dtype: int64


In [19]:


models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "LightGBM": LGBMClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
}


# Scorers personalizados
scoring = {
    'accuracy': 'accuracy',
    'f1': make_scorer(f1_score, pos_label=1),
    'precision': make_scorer(precision_score, pos_label=1),
    'recall': make_scorer(recall_score, pos_label=1)
}

#mlflow.set_experiment("manutencao_preditiva_multimodel_cv_v2")

mlflow.set_experiment("manutencao_preditiva_multimodel_cv_v2")

# Inicializa variável para controlar o melhor modelo
best_f1 = -1
best_model_uri = ""
best_model_name = ""

for model_name, model in models.items():
    with mlflow.start_run(run_name=f"{model_name}_CV") as run:

        # Cross-Validation com 5 folds
        results = cross_validate(
            model,
            X_train_processed,
            y_train.values.ravel(),
            cv=5,
            scoring=scoring,
            return_train_score=False
        )

        # Métricas médias
        mean_f1 = results['test_f1'].mean()
        mean_acc = results['test_accuracy'].mean()
        mean_prec = results['test_precision'].mean()
        mean_rec = results['test_recall'].mean()

        # Log de métricas e params
        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("cv_accuracy", mean_acc)
        mlflow.log_metric("cv_f1_score", mean_f1)
        mlflow.log_metric("cv_precision_class_1", mean_prec)
        mlflow.log_metric("cv_recall_class_1", mean_rec)

        # Treina no full train e salva modelo
        model.fit(X_train_processed, y_train.values.ravel())
        mlflow.sklearn.log_model(model, model_name)

        # Salva encoder
        encoder_path = "encoder.pkl"
        joblib.dump(encoder, encoder_path)
        mlflow.log_artifact(encoder_path, artifact_path="preprocessing")
        os.remove(encoder_path)

        # Salva scaler
        scaler_path = "scaler.pkl"
        joblib.dump(scaler, scaler_path)
        mlflow.log_artifact(scaler_path, artifact_path="preprocessing")
        os.remove(scaler_path)

        # Tags
        mlflow.set_tag("preprocessing_scaler", scaler.__class__.__name__)
        mlflow.set_tag("preprocessing_encoder", encoder.__class__.__name__)

        print(f"{model_name} → CV F1: {mean_f1:.4f} | CV Acc: {mean_acc:.4f}")

        # Atualiza melhor modelo
        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_model_uri = f"runs:/{run.info.run_id}/{model_name}"
            best_model_name = model_name

# Registrar o melhor modelo no Registry
mlflow.register_model(
    model_uri=best_model_uri,
    name="manutencao_modelo_final"
)
print(f"✅ Modelo '{best_model_name}' registrado com F1={best_f1:.4f}")




RandomForest → CV F1: 0.9779 | CV Acc: 0.9777




LogisticRegression → CV F1: 0.8221 | CV Acc: 0.8224




GradientBoosting → CV F1: 0.9441 | CV Acc: 0.9432




SVM → CV F1: 0.9214 | CV Acc: 0.9193




KNN → CV F1: 0.9600 | CV Acc: 0.9586
[LightGBM] [Info] Number of positive: 6183, number of negative: 6183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002692 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1278
[LightGBM] [Info] Number of data points in the train set: 12366, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 6183, number of negative: 6183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1277
[LightGBM] [Info] Number of data points in the train set: 12366, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 6183, number of negative: 6183
[LightGBM] [Info] Auto



[LightGBM] [Info] Number of positive: 7729, number of negative: 7729
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000673 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1279
[LightGBM] [Info] Number of data points in the train set: 15458, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




LightGBM → CV F1: 0.9777 | CV Acc: 0.9776


Successfully registered model 'manutencao_modelo_final'.


XGBoost → CV F1: 0.9808 | CV Acc: 0.9807
✅ Modelo 'XGBoost' registrado com F1=0.9808


Created version '1' of model 'manutencao_modelo_final'.


![XGBoost Imagem](xgboost.png)