In [None]:
# librerías auxiliares
import numpy as np
import pandas as pd

# modulos de sklearn
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_class_weight

# metricas de desempeño
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

Importamos los datasets

In [None]:
# Load the two datasets used below: one for the binary classifier and one for
# the multiclass classifier. Paths are relative to the notebook's working directory.
data_estimador1 = pd.read_csv(filepath_or_buffer="dataset_clasificador_binario.csv")
data_estimador2 = pd.read_csv(filepath_or_buffer="dataset_clasificador_multiclase.csv")

# Desempeño final de los mejores modelos de clasificación binaria en el conjunto de test sin reducción de la dimensión

La siguiente tabla muestra la mejor combinación de hiperparámetros encontrada en la fase de validación para cada modelo.

|                        | Hiperparámetros |
|------------------------|-----------------|
| SVC                    | kernel = 'linear', gamma = 0.01, C = 0.001                |
| Gradient Boosting tree | n_estimators=5, min_samples_split=4                |
| MLP Classifier | hidden_layers=3, neurons=16|


In [None]:
def evaluar_modelo(estimador, X, Y, dataframe, idx, random_state=42):
  """Train one binary classifier and record its test-set recall and ROC AUC.

  The data is split 70/30 (stratified on ``Y``), the training part is
  under-sampled with ``RandomUnderSampler(sampling_strategy=0.5)``, the chosen
  model is fitted with the hyperparameters selected during validation, and the
  two metrics are written into ``dataframe`` (which is mutated in place).

  Parameters
  ----------
  estimador : str
      One of ``'svc'``, ``'gradient boosting tree'`` or ``'red neuronal'``.
  X : array-like
      Feature matrix.
  Y : array-like
      Binary target. The positive class is taken to be the second class in
      sorted order (column 1 of ``predict_proba``) and ``recall_score`` uses
      its default ``pos_label=1``.
  dataframe : pandas.DataFrame
      Results table. The row ``idx`` is filled and then renamed to ``estimador``.
  idx : hashable
      Temporary row label used while writing the metrics.
  random_state : int, default 42
      Seed for the train/test split, the under-sampler, the SVC probability
      calibration and the gradient boosting model, so that a fresh run of the
      notebook reproduces the same numbers. The MLP keeps its own fixed seed.

  Raises
  ------
  ValueError
      If ``estimador`` is not one of the supported model names.
  """
  estimadores_validos = ('svc', 'gradient boosting tree', 'red neuronal')
  # Fail early with a clear message instead of reaching the metric calls with
  # placeholder predictions.
  if estimador not in estimadores_validos:
    raise ValueError(
        f"Unknown estimator {estimador!r}; expected one of {estimadores_validos}")

  # Split the dataset (stratified so both parts keep the class proportions).
  X_train, X_test, y_train, y_test = train_test_split(
      X, Y, test_size=0.3, random_state=random_state, stratify=Y)

  # Balance the training samples only; the test set is left untouched.
  sampler = RandomUnderSampler(sampling_strategy=0.5, random_state=random_state)
  X_train, y_train = sampler.fit_resample(X_train, y_train)

  # Compute "balanced" weights for each class present after resampling.
  classes = np.unique(y_train)
  weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

  # Build the model with the best hyperparameters found during validation.
  fit_kwargs = {}
  if estimador == 'svc':
    # gamma is kept to mirror the validated configuration; the linear kernel does not use it.
    modelo = SVC(kernel='linear', gamma=0.01, C=0.001, probability=True,
                 class_weight=dict(zip(classes.tolist(), weights.tolist())),
                 random_state=random_state)
  elif estimador == 'gradient boosting tree':
    modelo = GradientBoostingClassifier(n_estimators=5, min_samples_split=4,
                                        random_state=random_state)
    # GradientBoostingClassifier has no class_weight option, so expand the class
    # weights to one weight per sample. `classes` is sorted (np.unique), hence
    # searchsorted maps every label to the position of its weight.
    fit_kwargs['sample_weight'] = weights[np.searchsorted(classes, np.asarray(y_train))]
  else:  # 'red neuronal'
    hidden_layer_sizes = tuple(3*[16])
    modelo = MLPClassifier(activation='logistic', solver='sgd', learning_rate_init=1,
                           learning_rate='constant', batch_size=50,
                           hidden_layer_sizes=hidden_layer_sizes, random_state=1,
                           n_iter_no_change=500, max_iter=2000)

  modelo.fit(X_train, y_train, **fit_kwargs)
  y_test_pred = modelo.predict(X_test)
  y_test_pred_prob = modelo.predict_proba(X_test)[:, 1]

  # Evaluate on the held-out test set. The column names say "error" but the
  # stored values are scores (higher is better); the names are kept unchanged
  # so existing outputs and downstream cells keep working.
  dataframe.loc[idx,'error de test (recall)'] = recall_score(y_true = y_test, y_pred = y_test_pred)
  dataframe.loc[idx,'error de test (auc)'] = roc_auc_score(y_true=y_test, y_score=y_test_pred_prob)
  print(idx, estimador)
  # The caller's table is updated in place: replace the temporary label by the model name.
  dataframe.rename(index={idx: estimador}, inplace=True)

Entrenamos y evaluamos cada modelo

In [None]:
# Results table for the binary classifiers; evaluar_modelo fills one row per model.
df = pd.DataFrame()
# Features are every column except 'Target'; the label is the 'Target' column.
x_estimador1 = data_estimador1.drop(columns=['Target']).values
# The original chained assignment also created a misspelled, unused alias
# (`y_estimator1`); only the correctly named variable is kept.
y_estimador1 = data_estimador1['Target'].values
for i, model in enumerate(('svc', 'gradient boosting tree', 'red neuronal')):
  evaluar_modelo(model, x_estimador1, y_estimador1, df, i)

0 svc
1 gradient boosting tree
2 red neuronal


In [None]:
# Display the recall and ROC AUC stored for each binary classifier.
df

Unnamed: 0,error de test (recall),error de test (auc)
svc,0.838384,0.831982
gradient boosting tree,0.838384,0.944137
red neuronal,0.868687,0.972588


# Desempeño final de los mejores modelos de clasificación multiclase en el conjunto de test sin reducción de la dimensión

La siguiente tabla muestra la mejor combinación de hiperparámetros encontrada en la fase de validación para cada modelo.

|                        | Hiperparámetros |
|------------------------|-----------------|
| SVC                    | kernel = 'linear', gamma = 0.01, C =10                |
| Gradient Boosting tree | n_estimators=50, min_samples_split=4                |
| MLP Classifier | hidden_layers=1, neurons=20|


In [None]:
def evaluar_modelo_multiclase(estimador, X, Y, dataframe, idx, random_state=42):
  """Train one one-vs-rest multiclass classifier and record its test metrics.

  The data is split 70/30 (stratified on ``Y``), the training part is
  over-sampled with ``SMOTE(k_neighbors=3)``, the chosen model is wrapped in
  ``OneVsRestClassifier`` and fitted with the hyperparameters selected during
  validation, and weighted recall plus one-vs-rest ROC AUC on the test set are
  written into ``dataframe`` (which is mutated in place).

  Parameters
  ----------
  estimador : str
      One of ``'svc'``, ``'gradient boosting tree'`` or ``'red neuronal'``.
  X : array-like
      Feature matrix.
  Y : array-like
      Multiclass target.
  dataframe : pandas.DataFrame
      Results table. The row ``idx`` is filled and then renamed to ``estimador``.
  idx : hashable
      Temporary row label used while writing the metrics.
  random_state : int, default 42
      Seed for the train/test split, SMOTE, the SVC probability calibration and
      the gradient boosting model, so that a fresh run of the notebook
      reproduces the same numbers. The MLP keeps its own fixed seed.

  Raises
  ------
  ValueError
      If ``estimador`` is not one of the supported model names.
  """
  estimadores_validos = ('svc', 'gradient boosting tree', 'red neuronal')
  # Fail early with a clear message instead of reaching the metric calls with
  # placeholder predictions.
  if estimador not in estimadores_validos:
    raise ValueError(
        f"Unknown estimator {estimador!r}; expected one of {estimadores_validos}")

  # Split the dataset (stratified so both parts keep the class proportions).
  X_train, X_test, y_train, y_test = train_test_split(
      X, Y, test_size=0.3, random_state=random_state, stratify=Y)

  # Balance the training samples only; the test set is left untouched.
  smote = SMOTE(k_neighbors=3, random_state=random_state)
  X_train, y_train = smote.fit_resample(X_train, y_train)

  # Build the base model with the best hyperparameters found during validation.
  if estimador == 'svc':
    # gamma is kept to mirror the validated configuration; the linear kernel does not use it.
    base = SVC(kernel='linear', gamma=0.01, C=10, probability=True,
               random_state=random_state)
  elif estimador == 'gradient boosting tree':
    base = GradientBoostingClassifier(n_estimators=50, min_samples_split=4,
                                      random_state=random_state)
  else:  # 'red neuronal'
    # One hidden layer of 20 neurons, as reported in the notebook's multiclass
    # hyperparameter table. The previous value, tuple(3*[16]), was the binary
    # model's architecture.
    hidden_layer_sizes = (20,)
    base = MLPClassifier(activation='logistic', solver='sgd', learning_rate_init=1,
                         learning_rate='constant', batch_size=50,
                         hidden_layer_sizes=hidden_layer_sizes, random_state=1,
                         n_iter_no_change=500, max_iter=2000)

  modelo = OneVsRestClassifier(base)
  modelo.fit(X_train, y_train)
  y_test_pred = modelo.predict(X_test)
  # Full probability matrix (one column per class), as required by the
  # multiclass roc_auc_score call below.
  y_test_pred_prob = modelo.predict_proba(X_test)

  # Evaluate on the held-out test set. The column names say "error" but the
  # stored values are scores (higher is better); the names are kept unchanged
  # so existing outputs and downstream cells keep working.
  dataframe.loc[idx,'error de test (recall)'] = recall_score(y_true = y_test, y_pred = y_test_pred, average='weighted')
  dataframe.loc[idx,'error de test (auc)'] = roc_auc_score(y_true=y_test, y_score=y_test_pred_prob, multi_class='ovr')
  print(idx, estimador)
  # The caller's table is updated in place: replace the temporary label by the model name.
  dataframe.rename(index={idx: estimador}, inplace=True)

Entrenamos y evaluamos cada modelo

In [None]:
# Results table for the one-vs-rest classifiers; evaluar_modelo_multiclase
# fills one row per model.
df_ovr = pd.DataFrame()
# Features are every column except 'Failure Type'; the label is that column.
x_estimador2 = data_estimador2.drop(columns=['Failure Type']).values
y_estimador2 = data_estimador2['Failure Type'].values
for i, model in enumerate(('svc', 'gradient boosting tree', 'red neuronal')):
  evaluar_modelo_multiclase(model, x_estimador2, y_estimador2, df_ovr, i)

0 svc
1 gradient boosting tree
2 red neuronal


In [None]:
# Display the weighted recall and one-vs-rest ROC AUC stored for each multiclass classifier.
df_ovr

Unnamed: 0,error de test (recall),error de test (auc)
svc,0.929293,0.984854
gradient boosting tree,0.858586,0.953144
red neuronal,0.919192,0.978778
