In [1]:
# librerías auxiliares
import numpy as np
import pandas as pd

# modulos de sklearn
from sklearn.model_selection import StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_class_weight

# metricas de desempeño
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

Importamos los datasets

In [11]:
# Load the two datasets: one for the binary classifier, one for the multiclass classifier.
data_estimador1 = pd.read_csv(filepath_or_buffer="dataset_clasificador_binario.csv")
data_estimador2 = pd.read_csv(filepath_or_buffer="dataset_clasificador_multiclase.csv")

# Validación de los mejores modelos de clasificación binaria realizando selección de características

La siguiente tabla muestra la mejor combinación de hiperparámetros encontrada en la fase de validación para cada modelo.

|                        | Hiperparametros |
|------------------------|-----------------|
| SVC                    | kernel = 'linear', gamma = 0.01, C = 0.001                |
| Gradient Boosting tree | n_estimators=5, min_samples_split=4                |


In [None]:
def evaluar_modelo_SFS(estimador, X, Y, num_features, tipo_busqueda, dataframe, columns_name, n_splits=2, random_state=None):
  """Cross-validate a binary classifier combined with sequential feature selection (SFS).

  For every value in ``num_features`` the data is split with a stratified
  K-fold. On each fold the training part is randomly under-sampled, an SFS
  picks ``n_feature`` columns, the model is fitted on those columns, and
  recall and ROC-AUC are measured on the fold's test part. The mean of each
  metric is written into ``dataframe``, one row per ``n_feature``.

  Parameters
  ----------
  estimador : str
      Model to validate: ``'svc'``, ``'gradient boosting tree'`` or
      ``'red neuronal'``.
  X, Y : array
      Features and binary target. Both are indexed with the integer index
      arrays produced by the K-fold splitter.
  num_features : iterable of int
      Numbers of features the SFS must select; one result row is produced
      for each.
  tipo_busqueda : str
      Passed as ``direction`` to ``SequentialFeatureSelector``.
  dataframe : pandas.DataFrame
      Modified in place. Rows ``0..len(num_features)-1`` receive the columns
      ``'direccion'``, ``'# caracteristicas'``, ``'error de test (recall)'``
      and ``'error de test (auc)'``. Despite the names, the last two hold
      scores (recall and AUC), not error rates.
  columns_name : array
      Feature names, indexed with the boolean support mask of the selector.
  n_splits : int, default 2
      Number of stratified folds.
  random_state : int or None, default None
      Seed forwarded to the under-sampler, the SVC and the gradient boosting
      model. ``None`` keeps the previous unseeded behaviour.

  Raises
  ------
  ValueError
      If ``estimador`` is not one of the supported names.
  """
  modelos_soportados = ('svc', 'gradient boosting tree', 'red neuronal')
  # Fail early: previously an unknown name fell through to the metric calls
  # with placeholder predictions and produced an unrelated error there.
  if estimador not in modelos_soportados:
    raise ValueError(
        "Unknown estimador %r; expected one of %s" % (estimador, ", ".join(modelos_soportados)))

  kf = StratifiedKFold(n_splits=n_splits)
  for idx, n_feature in enumerate(num_features):
    print("Tipo busqueda", tipo_busqueda, " # feature", n_feature)
    errores_test_recall = []
    errores_test_auc = []
    for count, (train_index, test_index) in enumerate(kf.split(X, Y)):
      print("split", count)
      X_train, X_test = X[train_index], X[test_index]
      y_train, y_test = Y[train_index], Y[test_index]

      # Balance only the training samples; the test fold keeps its original distribution.
      nm = RandomUnderSampler(sampling_strategy=0.5, random_state=random_state)
      X_train, y_train = nm.fit_resample(X_train, y_train)

      # Balanced class weights, keyed by the actual label instead of by position.
      clases = np.unique(y_train)
      pesos = compute_class_weight(class_weight="balanced", classes=clases, y=y_train)
      peso_por_clase = dict(zip(clases.tolist(), pesos.tolist()))

      # Build the model with the best hyperparameters found during validation.
      fit_params = {}
      if estimador == 'svc':
        modelo = SVC(kernel='linear', gamma=0.01, C=0.001, probability=True,
                     class_weight=peso_por_clase, random_state=random_state)
      elif estimador == 'gradient boosting tree':
        modelo = GradientBoostingClassifier(n_estimators=5, min_samples_split=4,
                                            random_state=random_state)
        # Gradient boosting takes per-sample weights rather than class weights.
        fit_params['sample_weight'] = np.array([peso_por_clase[etiqueta] for etiqueta in y_train.tolist()])
      else:  # 'red neuronal'
        modelo = MLPClassifier(activation='logistic', solver='sgd', learning_rate_init=1,
                               learning_rate='constant', batch_size=50,
                               hidden_layer_sizes=tuple(3*[16]), random_state=1,
                               n_iter_no_change=500, max_iter=2000)

      # Feature selection on the balanced training data. Note that the
      # selector is fitted without the sample weights used in the final fit.
      sfs = SequentialFeatureSelector(modelo, n_features_to_select=n_feature,
                                      direction=tipo_busqueda, scoring='recall', cv=5)
      sfs.fit(X_train, y_train)
      print(columns_name[sfs.get_support()])
      X_train = sfs.transform(X_train)
      X_test = sfs.transform(X_test)

      modelo.fit(X_train, y_train, **fit_params)
      y_test_pred = modelo.predict(X_test)
      y_test_pred_prob = modelo.predict_proba(X_test)[:, 1]

      # Score the fold on its untouched test part.
      errores_test_recall.append(recall_score(y_true=y_test, y_pred=y_test_pred))
      errores_test_auc.append(roc_auc_score(y_true=y_test, y_score=y_test_pred_prob))

    # Store the fold-averaged scores for this number of features.
    dataframe.loc[idx,'direccion'] = tipo_busqueda
    dataframe.loc[idx,'# caracteristicas'] = n_feature
    dataframe.loc[idx,'error de test (recall)'] = np.mean(errores_test_recall)
    dataframe.loc[idx,'error de test (auc)'] = np.mean(errores_test_auc)

In [12]:
# Split the binary dataset into the target vector and the feature matrix.
y_estimador1 = data_estimador1['Target'].values
y_estimator1 = y_estimador1  # second name bound by the original cell; kept so the namespace is unchanged
x_estimador1 = data_estimador1.drop(columns=['Target']).values

## Validamos la máquina de soporte vectorial junto con el algoritmo SFS para determinar cuáles son las características óptimas para entrenar el modelo

In [None]:
# Forward SFS validation of the SVC for 3 to 7 selected features; scores are written into `df`.
df = pd.DataFrame()
caracteristicas = list(range(3, 8))
busqueda = 'forward'
evaluar_modelo_SFS(
    estimador='svc',
    X=x_estimador1,
    Y=y_estimador1,
    num_features=caracteristicas,
    tipo_busqueda=busqueda,
    dataframe=df,
    columns_name=data_estimador1.drop(columns=['Target']).columns.values,
)

Tipo busqueda forward  # feature 3
split 0
['Rotational speed [rpm]' 'Torque [Nm]' 'H']
split 1
['Rotational speed [rpm]' 'Torque [Nm]' 'M']
split 2
['Torque [Nm]' 'Tool wear [min]' 'H']
split 3
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]']
split 4
['Rotational speed [rpm]' 'Torque [Nm]' 'H']
Tipo busqueda forward  # feature 4
split 0
['Rotational speed [rpm]' 'Torque [Nm]' 'H' 'L']
split 1
['Rotational speed [rpm]' 'Torque [Nm]' 'L' 'M']
split 2
['Rotational speed [rpm]' 'Torque [Nm]' 'H' 'M']
split 3
['Rotational speed [rpm]' 'Torque [Nm]' 'H' 'L']
split 4
['Rotational speed [rpm]' 'Torque [Nm]' 'L' 'M']
Tipo busqueda forward  # feature 5
split 0
['Process temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]' 'H' 'L']
split 1
['Process temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]' 'H' 'L']
split 2
['Rotational speed [rpm]' 'Torque [Nm]' 'H' 'L' 'M']
split 3
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]' 'H' 'L']
split 4
['Process temperatur

Mostramos los 5 mejores resultados después de realizar la validación

In [None]:
# Top five configurations ranked by mean recall (highest first).
df.sort_values(by='error de test (recall)', ascending=False).iloc[:5]

Unnamed: 0,direccion,# caracteristicas,error de test (recall),error de test (auc)
2,forward,5.0,0.778788,0.787136
1,forward,4.0,0.763636,0.77808
3,forward,6.0,0.760606,0.768845
0,forward,3.0,0.757576,0.770763
4,forward,7.0,0.736364,0.785826


## Validamos el gradient boosting tree junto con el algoritmo SFS para determinar cuáles son las características óptimas para entrenar el modelo

In [None]:
# Forward SFS validation of the gradient boosting model for 3 to 7 selected features; scores are written into `df_tree`.
df_tree = pd.DataFrame()
caracteristicas = list(range(3, 8))
busqueda = 'forward'
evaluar_modelo_SFS(
    estimador='gradient boosting tree',
    X=x_estimador1,
    Y=y_estimador1,
    num_features=caracteristicas,
    tipo_busqueda=busqueda,
    dataframe=df_tree,
    columns_name=data_estimador1.drop(columns=['Target']).columns.values,
)

Tipo busqueda forward  # feature 3
split 0
['Rotational speed [rpm]' 'H' 'M']
split 1
['Torque [Nm]' 'Tool wear [min]' 'L']
split 2
['Torque [Nm]' 'Tool wear [min]' 'H']
split 3
['Process temperature [K]' 'Rotational speed [rpm]' 'Tool wear [min]']
split 4
['Process temperature [K]' 'Rotational speed [rpm]' 'L']
Tipo busqueda forward  # feature 4
split 0
['Torque [Nm]' 'Tool wear [min]' 'H' 'M']
split 1
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]' 'H']
split 2
['Torque [Nm]' 'Tool wear [min]' 'H' 'L']
split 3
['Process temperature [K]' 'Rotational speed [rpm]' 'Tool wear [min]' 'H']
split 4
['Torque [Nm]' 'Tool wear [min]' 'H' 'M']
Tipo busqueda forward  # feature 5
split 0
['Process temperature [K]' 'Rotational speed [rpm]' 'H' 'L' 'M']
split 1
['Torque [Nm]' 'Tool wear [min]' 'H' 'L' 'M']
split 2
['Torque [Nm]' 'Tool wear [min]' 'H' 'L' 'M']
split 3
['Torque [Nm]' 'Tool wear [min]' 'H' 'L' 'M']
split 4
['Process temperature [K]' 'Rotational speed [rpm]' 'H' 'L' 'M']
Tipo

Mostramos los 5 mejores resultados después de realizar la validación

In [None]:
# Top five configurations ranked by mean recall (highest first).
df_tree.sort_values(by='error de test (recall)', ascending=False).iloc[:5]

Unnamed: 0,direccion,# caracteristicas,error de test (recall),error de test (auc)
1,forward,4.0,0.79697,0.878513
4,forward,7.0,0.727273,0.897301
3,forward,6.0,0.69697,0.884789
0,forward,3.0,0.672727,0.804014
2,forward,5.0,0.633333,0.83616


## Desempeño final de los mejores modelos de clasificación binaria en el conjunto de test, tras haber realizado la selección de características

In [8]:
# Binary models used for the final hold-out evaluation.
svc = SVC(
    C=0.001,
    kernel='linear',
    gamma=0.01,
    probability=True,
)
gbt = GradientBoostingClassifier(
    min_samples_split=4,
    n_estimators=5,
)

In [15]:
def evaluar_modelo(data_tuple, X, Y, dataframe, idx, nombre_modelo, columns_name):
  """Evaluate one binary model on a hold-out split after forward feature selection.

  The data is split 70/30 (stratified, fixed seed), the training part is
  randomly under-sampled, a forward ``SequentialFeatureSelector`` chooses the
  requested number of columns, and the one-vs-rest wrapped model is fitted on
  them. Recall and ROC-AUC on the test part are written into ``dataframe``.

  Parameters
  ----------
  data_tuple : tuple
      ``(estimator, n_features_to_select)``.
  X, Y : array
      Features and binary target.
  dataframe : pandas.DataFrame
      Modified in place: row ``idx`` is filled and then relabelled to
      ``nombre_modelo``.
  idx : hashable
      Temporary row label used while filling the row.
  nombre_modelo : str
      Final row label.
  columns_name : array
      Feature names, indexed with the selector's boolean support mask.
  """
  print(idx, nombre_modelo)
  modelo_base, n_feature = data_tuple

  # Stratified hold-out split.
  X_tr, X_te, y_tr, y_te = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)

  # Balance the training samples only.
  X_tr, y_tr = RandomUnderSampler().fit_resample(X_tr, y_tr)

  # Wrap the base model so one estimator is built per class.
  clasificador = OneVsRestClassifier(modelo_base)

  selector = SequentialFeatureSelector(
      clasificador,
      n_features_to_select=n_feature,
      direction='forward',
      scoring='recall',
      cv=5,
  )
  selector.fit(X_tr, y_tr)
  elegidas = columns_name[selector.get_support()]
  print(elegidas)
  X_tr, X_te = selector.transform(X_tr), selector.transform(X_te)

  # Train on the selected columns.
  clasificador.fit(X_tr, y_tr)

  # Fill the result row cell by cell, in a fixed column order.
  fila = {
      '# caracteristicas': n_feature,
      'tipo de seleccion': 'forward',
      'caracteristicas seleccionadas': ", ".join(elegidas),
      'error de test (recall)': recall_score(y_true=y_te, y_pred=clasificador.predict(X_te)),
      'error de test (auc)': roc_auc_score(y_true=y_te, y_score=clasificador.predict_proba(X_te)[:, 1]),
  }
  for columna, valor in fila.items():
    dataframe.loc[idx, columna] = valor
  dataframe.rename(index={idx: nombre_modelo}, inplace=True)

In [16]:
# Final hold-out evaluation of each binary model with its chosen number of features.
result = pd.DataFrame()
t = [(svc, 5), (gbt, 4)]
for i, model in enumerate(t):
  evaluar_modelo(
      data_tuple=model,
      X=x_estimador1,
      Y=y_estimador1,
      dataframe=result,
      idx=i,
      nombre_modelo=type(model[0]).__name__,
      columns_name=data_estimador1.drop(columns=['Target']).columns.values,
  )

0 SVC
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]'
 'Tool wear [min]' 'H']
1 GradientBoostingClassifier
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]'
 'Tool wear [min]']


In [17]:
# Display the hold-out scores collected for the binary models.
result

Unnamed: 0,# caracteristicas,tipo de seleccion,caracteristicas seleccionadas,error de test (recall),error de test (auc)
SVC,5.0,forward,"Air temperature [K], Rotational speed [rpm], T...",0.878788,0.833239
GradientBoostingClassifier,4.0,forward,"Air temperature [K], Rotational speed [rpm], T...",0.949495,0.942088


# Validación de los mejores modelos de clasificación multiclase realizando selección de características

La siguiente tabla muestra la mejor combinación de hiperparámetros encontrada en la fase de validación para cada modelo.

|                        | Hiperparametros |
|------------------------|-----------------|
| SVC                    | kernel = 'linear', gamma = 0.01, C =10                |
| Gradient Boosting tree | n_estimators=50, min_samples_split=4                |

In [18]:
def evaluar_modelo_SFS_multiclase(estimador, X, Y, num_features, tipo_busqueda, dataframe, columns_name, n_splits=5, random_state=None):
  """Cross-validate a one-vs-rest multiclass model combined with sequential feature selection.

  For every search direction and every value in ``num_features`` the data is
  split with a stratified K-fold. On each fold the training part is
  over-sampled with SMOTE, an SFS picks ``n_feature`` columns, the
  one-vs-rest model is fitted on them, and weighted recall, OvR ROC-AUC and
  weighted F1 are measured on the fold's test part. The mean of each metric
  is written into ``dataframe``, one row per (direction, ``n_feature``) pair.

  Parameters
  ----------
  estimador : estimator
      Unfitted base classifier; it is wrapped in ``OneVsRestClassifier``.
  X, Y : array
      Features and multiclass target. Both are indexed with the integer
      index arrays produced by the K-fold splitter.
  num_features : iterable of int
      Numbers of features the SFS must select.
  tipo_busqueda : str or iterable of str
      Search direction(s) passed as ``direction`` to the selector. A single
      string is treated as one direction.
  dataframe : pandas.DataFrame
      Modified in place. Receives the columns ``'direccion'``,
      ``'# caracteristicas'``, ``'error de test (recall)'``,
      ``'error de test (auc)'`` and ``'error de test (f1)'``. Despite the
      names, the last three hold scores, not error rates.
  columns_name : array
      Feature names, indexed with the selector's boolean support mask.
  n_splits : int, default 5
      Number of stratified folds.
  random_state : int or None, default None
      Seed forwarded to SMOTE. ``None`` keeps the previous unseeded behaviour.
  """
  # A bare string would otherwise be iterated character by character.
  if isinstance(tipo_busqueda, str):
    tipo_busqueda = [tipo_busqueda]

  kf = StratifiedKFold(n_splits=n_splits)
  # Wrap exactly once. Wrapping inside the fold loop re-wrapped the previous
  # wrapper, nesting OneVsRestClassifier one level deeper on every fold.
  modelo_ovr = OneVsRestClassifier(estimador)

  idx = 0
  for direccion in tipo_busqueda:
    for n_feature in num_features:
      print("Tipo busqueda", direccion, " # feature", n_feature)
      errores_test_recall = []
      errores_test_auc = []
      errores_test_f1 = []
      for count, (train_index, test_index) in enumerate(kf.split(X, Y)):
        print("split", count)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        # Balance only the training samples; the test fold is left untouched.
        smote = SMOTE(k_neighbors=3, random_state=random_state)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        sfs = SequentialFeatureSelector(modelo_ovr, n_features_to_select=n_feature,
                                        direction=direccion, scoring='roc_auc_ovr', cv=5)
        sfs.fit(X_train, y_train)
        print(columns_name[sfs.get_support()])
        X_train = sfs.transform(X_train)
        X_test = sfs.transform(X_test)
        modelo_ovr.fit(X_train, y_train)

        # Predict once and reuse the result for every metric.
        y_test_pred = modelo_ovr.predict(X_test)
        y_test_prob = modelo_ovr.predict_proba(X_test)

        errores_test_recall.append(recall_score(y_true=y_test, y_pred=y_test_pred, average='weighted'))
        errores_test_auc.append(roc_auc_score(y_true=y_test, y_score=y_test_prob, multi_class='ovr'))
        errores_test_f1.append(f1_score(y_true=y_test, y_pred=y_test_pred, average='weighted'))

      # Store the fold-averaged scores for this configuration.
      dataframe.loc[idx,'direccion'] = direccion
      dataframe.loc[idx,'# caracteristicas'] = n_feature
      dataframe.loc[idx,'error de test (recall)'] = np.mean(errores_test_recall)
      dataframe.loc[idx,'error de test (auc)'] = np.mean(errores_test_auc)
      dataframe.loc[idx,'error de test (f1)'] = np.mean(errores_test_f1)
      idx += 1

Definimos los modelos con los mejores hiperparámetros encontrados en las fases de validación para cada modelo de clasificación

In [20]:
# Split the multiclass dataset into the target vector and the feature matrix.
y_estimador2 = data_estimador2['Failure Type'].values
x_estimador2 = data_estimador2.drop(columns=['Failure Type']).values

# Base models for the multiclass task.
svc_ovr = SVC(
    C=10,
    kernel='linear',
    gamma=0.01,
    probability=True,
)
gradient_boosting_tree_ovr = GradientBoostingClassifier(
    min_samples_split=4,
    n_estimators=50,
)

## Validamos la máquina de soporte vectorial junto con el algoritmo SFS para determinar cuáles son las características óptimas para entrenar el modelo

In [21]:
# Forward SFS validation of the one-vs-rest SVC for 3 to 7 selected features; scores are written into `df_ovr`.
df_ovr = pd.DataFrame()
caracteristicas_ovr = list(range(3, 8))
busqueda_ovr = ['forward']
evaluar_modelo_SFS_multiclase(
    estimador=svc_ovr,
    X=x_estimador2,
    Y=y_estimador2,
    num_features=caracteristicas_ovr,
    tipo_busqueda=busqueda_ovr,
    dataframe=df_ovr,
    columns_name=data_estimador2.drop(columns=['Failure Type']).columns.values,
)

Tipo busqueda forward  # feature 3
split 0
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]']
split 1
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]']
split 2
['Air temperature [K]' 'Torque [Nm]' 'Tool wear [min]']
split 3
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]']
split 4
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]']
Tipo busqueda forward  # feature 4
split 0
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]'
 'Tool wear [min]']
split 1
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]'
 'Tool wear [min]']
split 2
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]'
 'Tool wear [min]']
split 3
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]'
 'Tool wear [min]']
split 4
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]'
 'Tool wear [min]']
Tipo busqueda forward  # feature 5
split 0
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]'
 'Tool wear [min]' 'M']
split 1
['Air tempe

Mostramos los 5 mejores resultados después de realizar la validación

In [22]:
# Top five configurations ranked by mean weighted recall (highest first).
df_ovr.sort_values(by='error de test (recall)', ascending=False).iloc[:5]

Unnamed: 0,direccion,# caracteristicas,error de test (recall),error de test (auc),error de test (f1)
4,forward,7.0,0.915152,0.992407,0.915374
2,forward,5.0,0.909091,0.986885,0.909737
1,forward,4.0,0.906061,0.984914,0.907246
3,forward,6.0,0.9,0.992714,0.901
0,forward,3.0,0.818182,0.952185,0.810545


## Validamos el gradient boosting tree junto con el algoritmo SFS para determinar cuáles son las características óptimas para entrenar el modelo

In [23]:
# Forward SFS validation of the one-vs-rest gradient boosting model for 3 to 7 selected features; scores are written into `df_ovr_tree`.
df_ovr_tree = pd.DataFrame()
caracteristicas_ovr = list(range(3, 8))
busqueda_ovr = ['forward']
evaluar_modelo_SFS_multiclase(
    estimador=gradient_boosting_tree_ovr,
    X=x_estimador2,
    Y=y_estimador2,
    num_features=caracteristicas_ovr,
    tipo_busqueda=busqueda_ovr,
    dataframe=df_ovr_tree,
    columns_name=data_estimador2.drop(columns=['Failure Type']).columns.values,
)

Tipo busqueda forward  # feature 3
split 0
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]']
split 1
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]']
split 2
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]']
split 3
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]']
split 4
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]']
Tipo busqueda forward  # feature 4
split 0
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]' 'L']
split 1
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]'
 'Tool wear [min]']
split 2
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]'
 'Tool wear [min]']
split 3
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]' 'M']
split 4
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]' 'L']
Tipo busqueda forward  # feature 5
split 0
['Rotational speed [rpm]' 'Torque [Nm]' 'Tool wear [min]' 'L' 'M']
split 1
['Air temperature [K]' 'Rotational speed [rpm]' 'Torque [Nm]'
 'Tool wear [min]'

Mostramos los 5 mejores resultados después de realizar la validación

In [24]:
# Top five configurations ranked by mean weighted recall (highest first).
df_ovr_tree.sort_values(by='error de test (recall)', ascending=False).iloc[:5]

Unnamed: 0,direccion,# caracteristicas,error de test (recall),error de test (auc),error de test (f1)
4,forward,7.0,0.884848,0.983386,0.880566
3,forward,6.0,0.878788,0.980067,0.873409
0,forward,3.0,0.875758,0.963484,0.873013
2,forward,5.0,0.875758,0.982714,0.871654
1,forward,4.0,0.866667,0.977555,0.863229


## Desempeño final de los mejores modelos de clasificación multiclase en el conjunto de test, tras haber realizado la selección de características

In [25]:
def evaluar_modelo_multiclase(data_tuple, X, Y, dataframe, idx, nombre_modelo, columns_name, random_state=None):
  """Evaluate one multiclass model on a hold-out split after forward feature selection.

  The data is split 70/30 (stratified, fixed seed), the training part is
  over-sampled with SMOTE, a forward ``SequentialFeatureSelector`` chooses
  the requested number of columns, and the one-vs-rest wrapped model is
  fitted on them. Weighted recall, OvR ROC-AUC and weighted F1 on the test
  part are written into ``dataframe``.

  Parameters
  ----------
  data_tuple : tuple
      ``(estimator, n_features_to_select)``.
  X, Y : array
      Features and multiclass target.
  dataframe : pandas.DataFrame
      Modified in place: row ``idx`` is filled and then relabelled to
      ``nombre_modelo``.
  idx : hashable
      Temporary row label used while filling the row.
  nombre_modelo : str
      Final row label.
  columns_name : array
      Feature names, indexed with the selector's boolean support mask.
  random_state : int or None, default None
      Seed forwarded to SMOTE. ``None`` keeps the previous unseeded behaviour.
  """
  print(idx, nombre_modelo)
  estimador, n_feature = data_tuple
  # Stratified hold-out split.
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)
  # Balance the training samples only.
  smote = SMOTE(k_neighbors=3, random_state=random_state)
  X_train, y_train = smote.fit_resample(X_train, y_train)
  # Wrap the base model so one estimator is built per class.
  estimador = OneVsRestClassifier(estimador)

  sfs = SequentialFeatureSelector(estimador, n_features_to_select=n_feature, direction='forward', scoring='roc_auc_ovr', cv=5)
  sfs.fit(X_train, y_train)
  seleccionadas = columns_name[sfs.get_support()]
  print(seleccionadas)
  X_train = sfs.transform(X_train)
  X_test = sfs.transform(X_test)

  # Train on the selected columns.
  estimador.fit(X_train, y_train)

  # Predict once and reuse the result for recall and F1.
  y_test_pred = estimador.predict(X_test)
  y_test_prob = estimador.predict_proba(X_test)

  # Record the hold-out scores.
  dataframe.loc[idx,'# caracteristicas'] = n_feature
  dataframe.loc[idx, 'tipo de seleccion'] = 'forward'
  dataframe.loc[idx,'caracteristicas seleccionadas'] = ", ".join(seleccionadas)
  dataframe.loc[idx,'error de test (recall)'] = recall_score(y_true=y_test, y_pred=y_test_pred, average='weighted')
  dataframe.loc[idx,'error de test (auc)'] = roc_auc_score(y_true=y_test, y_score=y_test_prob, multi_class='ovr')
  dataframe.loc[idx,'error de test (f1)'] = f1_score(y_true=y_test, y_pred=y_test_pred, average='weighted')
  dataframe.rename(index={idx: nombre_modelo}, inplace=True)

In [26]:
# Final hold-out evaluation of each multiclass model with its chosen number of features.
result_ovr = pd.DataFrame()
t = [(svc_ovr, 6), (gradient_boosting_tree_ovr, 7)]
for i, model in enumerate(t):
  evaluar_modelo_multiclase(
      data_tuple=model,
      X=x_estimador2,
      Y=y_estimador2,
      dataframe=result_ovr,
      idx=i,
      nombre_modelo=type(model[0]).__name__,
      columns_name=data_estimador2.drop(columns=['Failure Type']).columns.values,
  )

0 SVC
['Air temperature [K]' 'Process temperature [K]' 'Rotational speed [rpm]'
 'Torque [Nm]' 'Tool wear [min]' 'M']
1 GradientBoostingClassifier
['Air temperature [K]' 'Process temperature [K]' 'Rotational speed [rpm]'
 'Torque [Nm]' 'Tool wear [min]' 'H' 'M']


In [27]:
# Display the hold-out scores collected for the multiclass models.
result_ovr

Unnamed: 0,# caracteristicas,tipo de seleccion,caracteristicas seleccionadas,error de test (recall),error de test (auc),error de test (f1)
SVC,6.0,forward,"Air temperature [K], Process temperature [K], ...",0.89899,0.983358,0.898848
GradientBoostingClassifier,7.0,forward,"Air temperature [K], Process temperature [K], ...",0.848485,0.956193,0.847862
