In [25]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import joblib

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier  # Bônus se quiser testar também
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
bucket_path = 'gs://predictive-maintenance-leopoldo/manutpred.csv'
df_raw = pd.read_csv(bucket_path, storage_options={'token': 'cloud'})

In [4]:
df = df_raw.copy()
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [5]:
drop_columns = ["TWF", "HDF", "PWF", "OSF", "RNF"]
df.drop(drop_columns, axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure
0,1,M14860,M,298.1,308.6,1551,42.8,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0


In [7]:
# divisão das colunas
categorical_columns = ['Type']
numerical_columns = ['Air temperature [K]','Process temperature [K]','Rotational speed [rpm]','Torque [Nm]','Tool wear [min]']
target_column = ['Machine failure']

# colunas que não importam para o modelo
Xdrop_columns = ['UDI', 'Product ID'] 

# Separando features e target
X = df.drop(target_column, axis=1)
X.drop(Xdrop_columns, axis = 1, inplace =True)
y = df[target_column]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
X_train.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
4058,M,302.0,310.9,1456,47.2,54
1221,M,297.0,308.3,1399,46.4,132
6895,M,301.0,311.6,1357,45.6,137
9863,L,298.9,309.8,1411,56.3,84
8711,L,297.1,308.5,1733,28.7,50


In [9]:
y_test.head()

Unnamed: 0,Machine failure
2997,0
4871,0
3858,0
951,0
6463,0


In [10]:
# CATEGORICAL --> criando o encoder
encoder = OneHotEncoder(drop='first', sparse_output=False)

# encoder + df de treino
encoded_type_train = encoder.fit_transform(X_train[categorical_columns])
encoded_train_df = pd.DataFrame(
    encoded_type_train,
    columns=encoder.get_feature_names_out(input_features=categorical_columns),
    index=X_train.index
)

In [11]:
# CATEGORICAL --> encoder + df de teste
encoded_type_test = encoder.transform(X_test[categorical_columns])

encoded_test_df = pd.DataFrame(
    encoded_type_test,
    columns=encoder.get_feature_names_out(input_features=categorical_columns),
    index=X_test.index
)

In [12]:
# NUMERICAL --> Scaler
scaler = MinMaxScaler()
X_train_num = scaler.fit_transform(X_train[numerical_columns])
X_test_num = scaler.transform(X_test[numerical_columns])

In [13]:

X_train_processed = pd.DataFrame(
    np.hstack([X_train_num, encoded_train_df]),
    columns=numerical_columns + list(encoder.get_feature_names_out(categorical_columns)),
    index=X_train.index
)

In [14]:
X_test_processed = pd.DataFrame(
    np.hstack([X_test_num, encoded_test_df]),
    columns=numerical_columns + list(encoder.get_feature_names_out(categorical_columns)),
    index=X_test.index
)

In [15]:
X_train_processed.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type_L,Type_M
4058,0.998914,0.604282,-0.460607,0.718305,-0.843997,0.0,1.0
1221,-1.505194,-1.15326,-0.775574,0.638456,0.382263,0.0,1.0
6895,0.498092,1.077466,-1.007654,0.558607,0.46087,0.0,1.0
9863,-0.553633,-0.139294,-0.709265,1.626586,-0.372359,1.0,0.0
8711,-1.455112,-1.018064,1.070019,-1.128202,-0.906882,1.0,0.0


In [16]:
def clean_column_names(df):
    df.columns = [
        col.replace(' ', '_')
           .replace('[','')
           .replace(']','')
           .replace('(','')
           .replace(')','')
           .replace('/','_per_')  # Caso tenha barras ou outros símbolos no futuro
        for col in df.columns
    ]
    return df

# Aplicando no treino e teste
X_train_processed = clean_column_names(X_train_processed.copy())
X_test_processed = clean_column_names(X_test_processed.copy())

In [17]:
X_test_processed.head()

Unnamed: 0,Air_temperature_K,Process_temperature_K,Rotational_speed_rpm,Torque_Nm,Tool_wear_min,Type_L,Type_M
2997,0.247681,-0.139294,-1.073963,2.265378,0.71241,1.0,0.0
4871,1.85031,1.618248,-0.145641,0.009646,0.429427,1.0,0.0
3858,1.249324,0.94227,0.108543,-0.239882,1.592802,1.0,0.0
951,-2.206344,-2.505215,-0.167744,-0.419543,-0.74967,0.0,0.0
6463,0.247681,-0.004098,-1.002129,2.035812,-0.089376,0.0,0.0


In [18]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train_processed, y_train)

preds = model.predict(X_test_processed)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1932
           1       0.88      0.53      0.66        68

    accuracy                           0.98      2000
   macro avg       0.93      0.76      0.83      2000
weighted avg       0.98      0.98      0.98      2000



In [20]:
"""
mlflow.set_experiment("manutencao_preditiva_multimodel_v2")

models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "LightGBM": LGBMClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
}

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train_processed, y_train.values.ravel())  # Corrigindo .ravel() caso dê warning

        preds = model.predict(X_test_processed)

        acc = accuracy_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        precision = precision_score(y_test, preds, pos_label=1)
        recall = recall_score(y_test, preds, pos_label=1)

        # Log dos parâmetros
        mlflow.log_param("model_type", model_name)

        # Log das métricas
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("precision_class_1", precision)
        mlflow.log_metric("recall_class_1", recall)

        # Salvar o modelo
        mlflow.sklearn.log_model(model, model_name)

        print(f"Modelo: {model_name} - Acc: {acc:.4f} - F1: {f1:.4f} - Prec_1: {precision:.4f} - Rec_1: {recall:.4f}")
"""

2025/06/25 00:13:11 INFO mlflow.tracking.fluent: Experiment with name 'manutencao_preditiva_multimodel_v2' does not exist. Creating a new experiment.


Modelo: RandomForest - Acc: 0.9815 - F1: 0.6606 - Prec_1: 0.8780 - Rec_1: 0.5294




Modelo: LogisticRegression - Acc: 0.9675 - F1: 0.1772 - Prec_1: 0.6364 - Rec_1: 0.1029




Modelo: GradientBoosting - Acc: 0.9860 - F1: 0.7667 - Prec_1: 0.8846 - Rec_1: 0.6765




Modelo: SVM - Acc: 0.9715 - F1: 0.3133 - Prec_1: 0.8667 - Rec_1: 0.1912




Modelo: KNN - Acc: 0.9735 - F1: 0.4176 - Prec_1: 0.8261 - Rec_1: 0.2794
[LightGBM] [Info] Number of positive: 271, number of negative: 7729
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 928
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.033875 -> initscore=-3.350616
[LightGBM] [Info] Start training from score -3.350616




Modelo: LightGBM - Acc: 0.9875 - F1: 0.7899 - Prec_1: 0.9216 - Rec_1: 0.6912




Modelo: XGBoost - Acc: 0.9885 - F1: 0.8160 - Prec_1: 0.8947 - Rec_1: 0.7500


In [None]:
### primeiro o grid search penas do gradient booosting

In [24]:


models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "LightGBM": LGBMClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
}


# Scorers personalizados
scoring = {
    'accuracy': 'accuracy',
    'f1': make_scorer(f1_score, pos_label=1),
    'precision': make_scorer(precision_score, pos_label=1),
    'recall': make_scorer(recall_score, pos_label=1)
}

mlflow.set_experiment("manutencao_preditiva_multimodel_cv_v2")

for model_name, model in models.items():
    with mlflow.start_run(run_name=f"{model_name}_CV"):

        # Cross-Validation com 5 folds
        results = cross_validate(
            model,
            X_train_processed,
            y_train.values.ravel(),
            cv=5,
            scoring=scoring,
            return_train_score=False
        )

        # Log de métricas médias
        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("cv_accuracy", results['test_accuracy'].mean())
        mlflow.log_metric("cv_f1_score", results['test_f1'].mean())
        mlflow.log_metric("cv_precision_class_1", results['test_precision'].mean())
        mlflow.log_metric("cv_recall_class_1", results['test_recall'].mean())

        # Treina no full train e salva o modelo final
        model.fit(X_train_processed, y_train.values.ravel())
        mlflow.sklearn.log_model(model, model_name)



        # Salvar o encoder
        encoder_path = f"encoder.pkl"
        joblib.dump(encoder, encoder_path)
        mlflow.log_artifact(encoder_path, artifact_path="preprocessing")


        # Salvar o scaler
        scaler_path = f"scaler.pkl"
        joblib.dump(scaler, scaler_path)
        mlflow.log_artifact(scaler_path, artifact_path="preprocessing")

        # Tags explicativas
        mlflow.set_tag("preprocessing_scaler", scaler.__class__.__name__)
        mlflow.set_tag("preprocessing_encoder", encoder.__class__.__name__)
        
        print(f"{model_name} → CV F1: {results['test_f1'].mean():.4f} | CV Acc: {results['test_accuracy'].mean():.4f}")




RandomForest → CV F1: 0.6584 | CV Acc: 0.9818




LogisticRegression → CV F1: 0.2960 | CV Acc: 0.9704




GradientBoosting → CV F1: 0.7150 | CV Acc: 0.9838




SVM → CV F1: 0.4087 | CV Acc: 0.9741




KNN → CV F1: 0.4391 | CV Acc: 0.9739
[LightGBM] [Info] Number of positive: 217, number of negative: 6183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000431 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 922
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.033906 -> initscore=-3.349662
[LightGBM] [Info] Start training from score -3.349662
[LightGBM] [Info] Number of positive: 217, number of negative: 6183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000417 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 922
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.033906 -> initscore=-3.349662
[LightGBM] [Info] Start training from sco



LightGBM → CV F1: 0.7388 | CV Acc: 0.9844




XGBoost → CV F1: 0.7358 | CV Acc: 0.9836
