In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append("../utils")
import toolbox_ML as tb
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [3]:
df=pd.read_csv('../data_sample/ai4i2020.csv')
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [4]:
# rename dataset columns
df.rename(columns = {'Product ID':'Product_ID',
                    'Air temperature [K]':'Air_temperature',
                     'Process temperature [K]':'Process_temperature',
                     'Rotational speed [rpm]':'Rotational_speed',
                     'Torque [Nm]':'Torque',
                     'Tool wear [min]':'Tool_wear',
                     'Machine failure':'Machine_failure',},
          inplace = True)

In [None]:
df=pd.read_csv('../data_sample/ai4i2020.csv')
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [5]:
# Codificar 'Type'
df['Type'] = df['Type'].map({'L': 1, 'M': 2, 'H': 3})


In [13]:
df.drop(columns=['Product_ID','UDI','TWF','HDF','PWF','OSF','RNF'], axis=1, inplace=True)

In [15]:
# Separar features y target
X = df.drop(columns=['Machine_failure'])
y = df['Machine_failure']

In [16]:
# Aplicar SMOTE para balancear el dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [17]:
# División en train y test
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)


In [18]:
# Modelos a evaluar
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    'SVM': SVC(class_weight='balanced', probability=True, random_state=42)
}


In [19]:
# Evaluación de modelos
for name, model in models.items():
    print(f'\nEntrenando modelo: {name}')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))



Entrenando modelo: Random Forest
[[1866   67]
 [  33 1899]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      1933
           1       0.97      0.98      0.97      1932

    accuracy                           0.97      3865
   macro avg       0.97      0.97      0.97      3865
weighted avg       0.97      0.97      0.97      3865


Entrenando modelo: Logistic Regression
[[1630  303]
 [ 353 1579]]
              precision    recall  f1-score   support

           0       0.82      0.84      0.83      1933
           1       0.84      0.82      0.83      1932

    accuracy                           0.83      3865
   macro avg       0.83      0.83      0.83      3865
weighted avg       0.83      0.83      0.83      3865


Entrenando modelo: SVM
[[1546  387]
 [ 272 1660]]
              precision    recall  f1-score   support

           0       0.85      0.80      0.82      1933
           1       0.81      0.86      0.83      1932

   