# Modelos supervisados

## Preprocesamiento y Balanceo de Clases

Transformamos quality_label a valores numéricos, y ajustamos el balance de clases.

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight


In [17]:
# Leeer el vinotinto.csv

vinotinto_df = pd.read_csv('../data/processed/vinotinto.csv')


# Convertir quality_label a valores numéricos
label_encoder = LabelEncoder()
vinotinto_df['quality_label_encoded'] = label_encoder.fit_transform(vinotinto_df['quality_label'])

# Definir las características (X) y la variable objetivo (y)
X = vinotinto_df.drop(['quality', 'quality_label', 'quality_label_encoded'], axis=1)  # Eliminar 'quality' y 'quality_label'
y = vinotinto_df['quality_label_encoded']

# Balanceo de clases: calculamos los pesos de las clases
class_weights = compute_class_weight('balanced', classes=np.array([0, 1, 2]), y=y)
class_weight_dict = dict(enumerate(class_weights))

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [18]:
print(class_weights)

[2.46195652 0.7078125  0.84672897]


In [19]:
print(class_weight_dict)

{0: np.float64(2.4619565217391304), 1: np.float64(0.7078125), 2: np.float64(0.8467289719626169)}


## MODELOS SUPERVISADORS CON BALANCEO DE CLASES 

Modelo 1: Regresión Logística

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Inicializar y entrenar el modelo
logreg = LogisticRegression(class_weight=class_weight_dict, max_iter=1000)
logreg.fit(X_train, y_train)

# Predecir y evaluar
y_pred = logreg.predict(X_test)
print("Regresión Logística:")
print(classification_report(y_test, y_pred))


Regresión Logística:
              precision    recall  f1-score   support

           0       0.44      0.75      0.55        55
           1       0.68      0.72      0.70       192
           2       0.51      0.36      0.42       161

    accuracy                           0.58       408
   macro avg       0.55      0.61      0.56       408
weighted avg       0.58      0.58      0.57       408



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Modelo 2: Árbol de Decisión

In [21]:
from sklearn.tree import DecisionTreeClassifier

# Inicializar y entrenar el modelo
dtree = DecisionTreeClassifier(class_weight=class_weight_dict)
dtree.fit(X_train, y_train)

# Predecir y evaluar
y_pred = dtree.predict(X_test)
print("Árbol de Decisión:")
print(classification_report(y_test, y_pred))


Árbol de Decisión:
              precision    recall  f1-score   support

           0       0.41      0.42      0.41        55
           1       0.61      0.64      0.62       192
           2       0.40      0.38      0.39       161

    accuracy                           0.50       408
   macro avg       0.47      0.48      0.48       408
weighted avg       0.50      0.50      0.50       408



Modelo 3: Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Inicializar y entrenar el modelo
rf = RandomForestClassifier(class_weight=class_weight_dict, n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predecir y evaluar
y_pred = rf.predict(X_test)
print("Random Forest:")
print(classification_report(y_test, y_pred))


Random Forest:
              precision    recall  f1-score   support

           0       0.71      0.36      0.48        55
           1       0.67      0.73      0.70       192
           2       0.51      0.55      0.53       161

    accuracy                           0.61       408
   macro avg       0.63      0.55      0.57       408
weighted avg       0.62      0.61      0.61       408



Modelo 4: Support Vector Machine (SVM)

In [23]:
from sklearn.svm import SVC

# Inicializar y entrenar el modelo
svm = SVC(class_weight=class_weight_dict, kernel='linear')
svm.fit(X_train, y_train)

# Predecir y evaluar
y_pred = svm.predict(X_test)
print("Support Vector Machine:")
print(classification_report(y_test, y_pred))


Support Vector Machine:
              precision    recall  f1-score   support

           0       0.44      0.78      0.56        55
           1       0.68      0.74      0.71       192
           2       0.53      0.34      0.41       161

    accuracy                           0.59       408
   macro avg       0.55      0.62      0.56       408
weighted avg       0.59      0.59      0.57       408



Modelo 5: K-Nearest Neighbors (KNN)

In [24]:
from sklearn.neighbors import KNeighborsClassifier

# Inicializar y entrenar el modelo
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Predecir y evaluar
y_pred = knn.predict(X_test)
print("K-Nearest Neighbors:")
print(classification_report(y_test, y_pred))


K-Nearest Neighbors:
              precision    recall  f1-score   support

           0       0.37      0.25      0.30        55
           1       0.57      0.74      0.64       192
           2       0.40      0.29      0.34       161

    accuracy                           0.50       408
   macro avg       0.45      0.43      0.43       408
weighted avg       0.47      0.50      0.48       408



## MODELO NO SUPERVISADORS CON BALANCEO DE CLASES 

Modelo No Supervisado (K-Means Clustering)

In [25]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Inicializar y entrenar el modelo K-Means
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)

# Evaluar la calidad del clustering
silhouette = silhouette_score(X, kmeans.labels_)
print("Silhouette Score para K-Means:", silhouette)


Silhouette Score para K-Means: 0.5814772034496786


## MODELOS SUPERVISADORS CON SMOTE

SMOTE: Sobremuestreo de clases minoritarias generando ejemplos sintéticos.

In [26]:
#pip install imbalanced-learn

In [27]:
from imblearn.over_sampling import SMOTE
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Leer el vinotinto.csv
vinotinto_df = pd.read_csv('../data/processed/vinotinto.csv')

# Crear la columna 'quality_label_encoded' si no existe
label_encoder = LabelEncoder()
vinotinto_df['quality_label_encoded'] = label_encoder.fit_transform(vinotinto_df['quality_label'])

# Definir las características (X) y la variable objetivo (y)
X = vinotinto_df.drop(['quality', 'quality_label', 'quality_label_encoded'], axis=1)  # Eliminar 'quality' y 'quality_label'
y = vinotinto_df['quality_label_encoded']

# Balanceo de clases: calculamos los pesos de las clases
class_weights = compute_class_weight('balanced', classes=np.array([0, 1, 2]), y=y)
class_weight_dict = dict(enumerate(class_weights))

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Aplicar SMOTE para balancear las clases
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Ver el balance de clases después de SMOTE
print(f"Antes de SMOTE: {Counter(y_train)}")
print(f"Después de SMOTE: {Counter(y_res)}")


Antes de SMOTE: Counter({1: 448, 2: 374, 0: 129})
Después de SMOTE: Counter({2: 448, 1: 448, 0: 448})


Regresión Logística con Pesos de Clase

In [28]:
from sklearn.linear_model import LogisticRegression

# Inicializar el modelo con pesos de clase
logreg = LogisticRegression(class_weight='balanced', max_iter=1000)

# Entrenar el modelo con los datos balanceados por peso de clase
logreg.fit(X_train, y_train)


# Predecir y evaluar
y_pred = logreg.predict(X_test)
print("Regresión Logística:")
print(classification_report(y_test, y_pred))

Regresión Logística:
              precision    recall  f1-score   support

           0       0.45      0.75      0.56        55
           1       0.68      0.72      0.70       192
           2       0.51      0.36      0.42       161

    accuracy                           0.58       408
   macro avg       0.55      0.61      0.56       408
weighted avg       0.58      0.58      0.57       408



Árbol de Decisión con Pesos de Clase:

In [29]:
from sklearn.tree import DecisionTreeClassifier

# Inicializar el modelo con pesos de clase
dtree = DecisionTreeClassifier(class_weight='balanced')

# Entrenar el modelo con los datos balanceados por peso de clase
dtree.fit(X_train, y_train)


# Predecir y evaluar
y_pred = dtree.predict(X_test)
print("Árbol de Decisión:")
print(classification_report(y_test, y_pred))

Árbol de Decisión:
              precision    recall  f1-score   support

           0       0.40      0.38      0.39        55
           1       0.61      0.65      0.63       192
           2       0.42      0.39      0.40       161

    accuracy                           0.51       408
   macro avg       0.48      0.47      0.48       408
weighted avg       0.51      0.51      0.51       408



SVM con Pesos de Clase:

In [30]:
from sklearn.svm import SVC

# Inicializar el modelo con pesos de clase
svm = SVC(class_weight='balanced')

# Entrenar el modelo con los datos balanceados por peso de clase
svm.fit(X_train, y_train)


# Predecir y evaluar
y_pred = rf.predict(X_test)
print("Random Forest:")
print(classification_report(y_test, y_pred))


Random Forest:
              precision    recall  f1-score   support

           0       0.71      0.36      0.48        55
           1       0.67      0.73      0.70       192
           2       0.51      0.55      0.53       161

    accuracy                           0.61       408
   macro avg       0.63      0.55      0.57       408
weighted avg       0.62      0.61      0.61       408



KNN con Pesos de Clase:

In [31]:
from sklearn.linear_model import LogisticRegression

# Inicializar el modelo con pesos de clase
logreg = LogisticRegression(class_weight='balanced', max_iter=1000)

# Entrenar el modelo con los datos balanceados por peso de clase
logreg.fit(X_train, y_train)



In [32]:
# Predecir y evaluar
y_pred = svm.predict(X_test)
print("Support Vector Machine:")
print(classification_report(y_test, y_pred))

Support Vector Machine:
              precision    recall  f1-score   support

           0       0.20      0.62      0.30        55
           1       0.72      0.33      0.46       192
           2       0.45      0.42      0.43       161

    accuracy                           0.40       408
   macro avg       0.46      0.46      0.40       408
weighted avg       0.54      0.40      0.43       408



# PRUEBAS CON SMOTE

In [33]:
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

# Cargar el dataset
vinotinto_df = pd.read_csv('../data/processed/vinotinto.csv')

# Convertir quality_label a valores numéricos
label_encoder = LabelEncoder()
vinotinto_df['quality_label_encoded'] = label_encoder.fit_transform(vinotinto_df['quality_label'])

# Definir características (X) y objetivo (y)
X = vinotinto_df.drop(['quality', 'quality_label', 'quality_label_encoded'], axis=1)
y = vinotinto_df['quality_label_encoded']

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Aplicar SMOTE para balancear las clases
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Ver el balance de clases después de SMOTE
print(f"Antes de SMOTE: {Counter(y_train)}")
print(f"Después de SMOTE: {Counter(y_res)}")

# Entrenar modelos con SMOTE
models = {
    'Regresión Logística': LogisticRegression(max_iter=1000),
    'Árbol de Decisión': DecisionTreeClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(n_neighbors=3)
}

# Evaluar cada modelo
for name, model in models.items():
    model.fit(X_res, y_res)  # Entrenamiento con los datos balanceados
    y_pred = model.predict(X_test)  # Predicción en el conjunto de prueba
    print(f"\n{name}:")
    print(classification_report(y_test, y_pred))


Antes de SMOTE: Counter({1: 448, 2: 374, 0: 129})
Después de SMOTE: Counter({2: 448, 1: 448, 0: 448})


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Regresión Logística:
              precision    recall  f1-score   support

           0       0.45      0.75      0.56        55
           1       0.68      0.72      0.70       192
           2       0.51      0.36      0.42       161

    accuracy                           0.58       408
   macro avg       0.55      0.61      0.56       408
weighted avg       0.58      0.58      0.57       408


Árbol de Decisión:
              precision    recall  f1-score   support

           0       0.37      0.44      0.40        55
           1       0.68      0.62      0.65       192
           2       0.46      0.48      0.47       161

    accuracy                           0.54       408
   macro avg       0.50      0.51      0.51       408
weighted avg       0.55      0.54      0.55       408


SVM:
              precision    recall  f1-score   support

           0       0.22      0.65      0.33        55
           1       0.71      0.34      0.46       192
           2       0.48    

## Prueba con Pesos de Clase (Balanced Class Weight)

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd

# Cargar el dataset
vinotinto_df = pd.read_csv('../data/processed/vinotinto.csv')

# Convertir quality_label a valores numéricos
label_encoder = LabelEncoder()
vinotinto_df['quality_label_encoded'] = label_encoder.fit_transform(vinotinto_df['quality_label'])

# Definir características (X) y objetivo (y)
X = vinotinto_df.drop(['quality', 'quality_label', 'quality_label_encoded'], axis=1)
y = vinotinto_df['quality_label_encoded']

# Balanceo de clases: calculamos los pesos de las clases
class_weights = compute_class_weight('balanced', classes=np.array([0, 1, 2]), y=y)
class_weight_dict = dict(enumerate(class_weights))

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Entrenar modelos con pesos de clase
models = {
    'Regresión Logística': LogisticRegression(class_weight=class_weight_dict, max_iter=1000),
    'Árbol de Decisión': DecisionTreeClassifier(class_weight='balanced'),
    'SVM': SVC(class_weight='balanced'),
    'KNN': KNeighborsClassifier(n_neighbors=5)  # KNN no usa class_weight, por lo que lo dejamos sin cambiar
}

# Evaluar cada modelo
for name, model in models.items():
    model.fit(X_train, y_train)  # Entrenamiento con pesos de clase
    y_pred = model.predict(X_test)  # Predicción en el conjunto de prueba
    print(f"\n{name} con Pesos de Clase:")
    print(classification_report(y_test, y_pred))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Regresión Logística con Pesos de Clase:
              precision    recall  f1-score   support

           0       0.44      0.75      0.55        55
           1       0.68      0.72      0.70       192
           2       0.51      0.36      0.42       161

    accuracy                           0.58       408
   macro avg       0.55      0.61      0.56       408
weighted avg       0.58      0.58      0.57       408


Árbol de Decisión con Pesos de Clase:
              precision    recall  f1-score   support

           0       0.40      0.38      0.39        55
           1       0.62      0.66      0.64       192
           2       0.41      0.39      0.40       161

    accuracy                           0.51       408
   macro avg       0.48      0.48      0.48       408
weighted avg       0.51      0.51      0.51       408


SVM con Pesos de Clase:
              precision    recall  f1-score   support

           0       0.20      0.62      0.30        55
           1       0.72 

Optimización de Hiperparámetros (GridSearchCV)

Optimización para Regresión Logística:

In [35]:
from sklearn.linear_model import LogisticRegression

# Inicializar el modelo con pesos de clase
logreg = LogisticRegression(class_weight='balanced', max_iter=1000)

# Entrenar el modelo con los datos balanceados por peso de clase
logreg.fit(X_train, y_train)


Optimización para Árbol de Decisión:

In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV


# Definir los parámetros a probar
param_grid = {'max_depth': [3, 5, 10, None], 'min_samples_split': [2, 10, 20]}

# Inicializar el modelo
dtree = DecisionTreeClassifier(class_weight='balanced')

# Aplicar GridSearchCV
grid_search_dtree = GridSearchCV(dtree, param_grid, cv=5, scoring='accuracy')
grid_search_dtree.fit(X_res, y_res)

# Mostrar los mejores parámetros
print("Mejores parámetros para Árbol de Decisión:", grid_search_dtree.best_params_)


Mejores parámetros para Árbol de Decisión: {'max_depth': None, 'min_samples_split': 2}


Optimización para SVM:

In [37]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


# Definir los parámetros a probar
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}

# Inicializar el modelo
svm = SVC(class_weight='balanced')

# Aplicar GridSearchCV
grid_search_svm = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')
grid_search_svm.fit(X_res, y_res)

# Mostrar los mejores parámetros
print("Mejores parámetros para SVM:", grid_search_svm.best_params_)


Mejores parámetros para SVM: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}


Optimización para KNN:

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


# Definir los parámetros a probar
param_grid = {'n_neighbors': [3, 5, 7, 10], 'weights': ['uniform', 'distance']}

# Inicializar el modelo
knn = KNeighborsClassifier()

# Aplicar GridSearchCV
grid_search_knn = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid_search_knn.fit(X_res, y_res)

# Mostrar los mejores parámetros
print("Mejores parámetros para KNN:", grid_search_knn.best_params_)


Mejores parámetros para KNN: {'n_neighbors': 3, 'weights': 'distance'}


Evaluación Final de los Modelos con los Mejores Hiperparámetros

SMOTE: Balancea las clases generando ejemplos sintéticos para las clases minoritarias.

Optimización de Hiperparámetros: Utiliza GridSearchCV para ajustar los parámetros de los modelos.

Evaluación Final: Evalúa el rendimiento de los modelos optimizados y balanceados.

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Definir los parámetros a probar para la regresión logística
param_grid_logreg = {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear']}

# Inicializar el modelo
logreg = LogisticRegression(class_weight='balanced', max_iter=1000)

# Aplicar GridSearchCV
grid_search_logreg = GridSearchCV(logreg, param_grid_logreg, cv=5, scoring='accuracy')
grid_search_logreg.fit(X_res, y_res)

# Mostrar los mejores parámetros
print("Mejores parámetros para Regresión Logística:", grid_search_logreg.best_params_)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mejores parámetros para Regresión Logística: {'C': 10, 'solver': 'lbfgs'}


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
# Usar el mejor modelo encontrado en GridSearchCV
best_logreg = grid_search_logreg.best_estimator_
best_dtree = grid_search_dtree.best_estimator_
best_svm = grid_search_svm.best_estimator_
best_knn = grid_search_knn.best_estimator_

# Entrenar con los datos balanceados por SMOTE
best_logreg.fit(X_res, y_res)
best_dtree.fit(X_res, y_res)
best_svm.fit(X_res, y_res)
best_knn.fit(X_res, y_res)

# Predecir en el conjunto de prueba
y_pred_logreg = best_logreg.predict(X_test)
y_pred_dtree = best_dtree.predict(X_test)
y_pred_svm = best_svm.predict(X_test)
y_pred_knn = best_knn.predict(X_test)

# Evaluar el rendimiento
from sklearn.metrics import classification_report

print("Evaluación Regresión Logística:")
print(classification_report(y_test, y_pred_logreg))

print("Evaluación Árbol de Decisión:")
print(classification_report(y_test, y_pred_dtree))

print("Evaluación SVM:")
print(classification_report(y_test, y_pred_svm))

print("Evaluación KNN:")
print(classification_report(y_test, y_pred_knn))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluación Regresión Logística:
              precision    recall  f1-score   support

           0       0.47      0.78      0.59        55
           1       0.68      0.71      0.69       192
           2       0.52      0.38      0.44       161

    accuracy                           0.59       408
   macro avg       0.56      0.62      0.57       408
weighted avg       0.59      0.59      0.58       408

Evaluación Árbol de Decisión:
              precision    recall  f1-score   support

           0       0.35      0.42      0.38        55
           1       0.68      0.59      0.63       192
           2       0.45      0.49      0.47       161

    accuracy                           0.53       408
   macro avg       0.49      0.50      0.50       408
weighted avg       0.55      0.53      0.54       408

Evaluación SVM:
              precision    recall  f1-score   support

           0       0.36      0.51      0.42        55
           1       0.63      0.58      0.60       1

_________________________________________

In [41]:
# Cargar las librerías necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from collections import Counter

# Cargar el dataset
vinotinto_df = pd.read_csv('../data/processed/vinotinto.csv')

# Convertir quality_label a valores numéricos
label_encoder = LabelEncoder()
vinotinto_df['quality_label_encoded'] = label_encoder.fit_transform(vinotinto_df['quality_label'])

# Definir características (X) y variable objetivo (y)
X = vinotinto_df.drop(['quality', 'quality_label', 'quality_label_encoded'], axis=1)
y = vinotinto_df['quality_label_encoded']

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Balanceo de clases con SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Ver el balance de clases después de SMOTE
print(f"Antes de SMOTE: {Counter(y_train)}")
print(f"Después de SMOTE: {Counter(y_res)}")

# Entrenar los modelos
models = {
    'Regresión Logística': LogisticRegression(max_iter=1000),
    'Árbol de Decisión': DecisionTreeClassifier(class_weight='balanced'),
    'Random Forest': RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42),
    'SVM': SVC(class_weight='balanced'),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# Evaluar cada modelo
for name, model in models.items():
    model.fit(X_res, y_res)  # Entrenamiento con los datos balanceados por SMOTE
    y_pred = model.predict(X_test)  # Predicción en el conjunto de prueba
    print(f"\n{name} con SMOTE:")
    print(classification_report(y_test, y_pred))



Antes de SMOTE: Counter({1: 448, 2: 374, 0: 129})
Después de SMOTE: Counter({2: 448, 1: 448, 0: 448})


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Regresión Logística con SMOTE:
              precision    recall  f1-score   support

           0       0.45      0.75      0.56        55
           1       0.68      0.72      0.70       192
           2       0.51      0.36      0.42       161

    accuracy                           0.58       408
   macro avg       0.55      0.61      0.56       408
weighted avg       0.58      0.58      0.57       408


Árbol de Decisión con SMOTE:
              precision    recall  f1-score   support

           0       0.38      0.40      0.39        55
           1       0.67      0.61      0.64       192
           2       0.47      0.51      0.49       161

    accuracy                           0.54       408
   macro avg       0.51      0.51      0.51       408
weighted avg       0.55      0.54      0.55       408


Random Forest con SMOTE:
              precision    recall  f1-score   support

           0       0.54      0.64      0.58        55
           1       0.72      0.70      0.

_____________________________

In [42]:
# Cargar las librerías necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from collections import Counter

# Cargar el dataset
vinotinto_df = pd.read_csv('../data/processed/vinotinto.csv')

# Convertir quality_label a valores numéricos
label_encoder = LabelEncoder()
vinotinto_df['quality_label_encoded'] = label_encoder.fit_transform(vinotinto_df['quality_label'])

# Definir características (X) y variable objetivo (y)
X = vinotinto_df.drop(['quality', 'quality_label', 'quality_label_encoded'], axis=1)
y = vinotinto_df['quality_label_encoded']

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Balanceo de clases con SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Ver el balance de clases después de SMOTE
print(f"Antes de SMOTE: {Counter(y_train)}")
print(f"Después de SMOTE: {Counter(y_res)}")

# Entrenar los modelos
models = {
    'Regresión Logística': LogisticRegression(max_iter=1000),
    'Árbol de Decisión': DecisionTreeClassifier(class_weight='balanced'),
    'Random Forest': RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42),
    'SVM': SVC(class_weight='balanced'),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# Evaluar cada modelo
for name, model in models.items():
    model.fit(X_res, y_res)  # Entrenamiento con los datos balanceados por SMOTE
    y_pred = model.predict(X_test)  # Predicción en el conjunto de prueba
    print(f"\n{name} con SMOTE:")
    print(classification_report(y_test, y_pred))



Antes de SMOTE: Counter({1: 448, 2: 374, 0: 129})
Después de SMOTE: Counter({2: 448, 1: 448, 0: 448})


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Regresión Logística con SMOTE:
              precision    recall  f1-score   support

           0       0.45      0.75      0.56        55
           1       0.68      0.72      0.70       192
           2       0.51      0.36      0.42       161

    accuracy                           0.58       408
   macro avg       0.55      0.61      0.56       408
weighted avg       0.58      0.58      0.57       408


Árbol de Decisión con SMOTE:
              precision    recall  f1-score   support

           0       0.36      0.44      0.39        55
           1       0.66      0.61      0.63       192
           2       0.45      0.45      0.45       161

    accuracy                           0.52       408
   macro avg       0.49      0.50      0.49       408
weighted avg       0.54      0.52      0.53       408


Random Forest con SMOTE:
              precision    recall  f1-score   support

           0       0.54      0.64      0.58        55
           1       0.72      0.70      0.

In [43]:
from sklearn.model_selection import GridSearchCV

# Definir los parámetros a probar para cada modelo

# Regresión Logística
param_grid_logreg = {'C': [0.1, 1, 10, 20, 100], 'solver': ['lbfgs', 'liblinear']}
logreg = LogisticRegression(class_weight='balanced', max_iter=10000)
grid_search_logreg = GridSearchCV(logreg, param_grid_logreg, cv=5, scoring='accuracy')
grid_search_logreg.fit(X_res, y_res)
print("Mejores parámetros para Regresión Logística:", grid_search_logreg.best_params_)

# Árbol de Decisión
param_grid_dtree = {'max_depth': [3, 5, 10, 20, None], 'min_samples_split': [2, 10, 20, 40]}
dtree = DecisionTreeClassifier(class_weight='balanced')
grid_search_dtree = GridSearchCV(dtree, param_grid_dtree, cv=5, scoring='accuracy')
grid_search_dtree.fit(X_res, y_res)
print("Mejores parámetros para Árbol de Decisión:", grid_search_dtree.best_params_)

# Random Forest
param_grid_rf = {'n_estimators': [50, 100, 200, 500, 1000], 'max_depth': [3, 5, 10, 20, None]}
rf = RandomForestClassifier(class_weight='balanced')
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_res, y_res)
print("Mejores parámetros para Random Forest:", grid_search_rf.best_params_)

# SVM
param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
svm = SVC(class_weight='balanced')
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='accuracy')
grid_search_svm.fit(X_res, y_res)
print("Mejores parámetros para SVM:", grid_search_svm.best_params_)

# KNN
param_grid_knn = {'n_neighbors': [3, 5, 7, 10], 'weights': ['uniform', 'distance']}
knn = KNeighborsClassifier()
grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='accuracy')
grid_search_knn.fit(X_res, y_res)
print("Mejores parámetros para KNN:", grid_search_knn.best_params_)


Mejores parámetros para Regresión Logística: {'C': 100, 'solver': 'lbfgs'}
Mejores parámetros para Árbol de Decisión: {'max_depth': 20, 'min_samples_split': 2}
Mejores parámetros para Random Forest: {'max_depth': None, 'n_estimators': 500}
Mejores parámetros para SVM: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Mejores parámetros para KNN: {'n_neighbors': 3, 'weights': 'distance'}


In [44]:
import xgboost as xgb
from sklearn.metrics import classification_report

# Calcular el valor de scale_pos_weight
# Para desbalance, el valor se ajusta a la relación entre el número de muestras de la clase mayoritaria y la clase minoritaria
scale_pos_weight = len(y_res) / (3 * np.bincount(y_res))

# Inicializar y entrenar el modelo XGBoost con scale_pos_weight
xgb_model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_res, y_res)

# Predecir en el conjunto de prueba
y_pred_xgb = xgb_model.predict(X_test)

# Evaluar el rendimiento
print("Evaluación XGBoost:")
print(classification_report(y_test, y_pred_xgb))



Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluación XGBoost:
              precision    recall  f1-score   support

           0       0.48      0.56      0.52        55
           1       0.70      0.68      0.69       192
           2       0.49      0.48      0.48       161

    accuracy                           0.59       408
   macro avg       0.56      0.57      0.57       408
weighted avg       0.59      0.59      0.59       408



In [45]:
# Usar los mejores modelos encontrados en GridSearchCV
best_logreg = grid_search_logreg.best_estimator_
best_dtree = grid_search_dtree.best_estimator_
best_rf = grid_search_rf.best_estimator_
best_svm = grid_search_svm.best_estimator_
best_knn = grid_search_knn.best_estimator_

# Entrenar con los datos balanceados por SMOTE
best_logreg.fit(X_res, y_res)
best_dtree.fit(X_res, y_res)
best_rf.fit(X_res, y_res)
best_svm.fit(X_res, y_res)
best_knn.fit(X_res, y_res)

# Predecir en el conjunto de prueba
y_pred_logreg = best_logreg.predict(X_test)
y_pred_dtree = best_dtree.predict(X_test)
y_pred_rf = best_rf.predict(X_test)
y_pred_svm = best_svm.predict(X_test)
y_pred_knn = best_knn.predict(X_test)

# Evaluar el rendimiento de cada modelo
from sklearn.metrics import classification_report

print("Evaluación Regresión Logística:")
print(classification_report(y_test, y_pred_logreg))

print("Evaluación Árbol de Decisión:")
print(classification_report(y_test, y_pred_dtree))

print("Evaluación Random Forest:")
print(classification_report(y_test, y_pred_rf))

print("Evaluación SVM:")
print(classification_report(y_test, y_pred_svm))

print("Evaluación KNN:")
print(classification_report(y_test, y_pred_knn))


Evaluación Regresión Logística:
              precision    recall  f1-score   support

           0       0.49      0.78      0.60        55
           1       0.68      0.71      0.70       192
           2       0.54      0.40      0.46       161

    accuracy                           0.60       408
   macro avg       0.57      0.63      0.59       408
weighted avg       0.60      0.60      0.59       408

Evaluación Árbol de Decisión:
              precision    recall  f1-score   support

           0       0.37      0.40      0.39        55
           1       0.67      0.63      0.65       192
           2       0.47      0.49      0.48       161

    accuracy                           0.54       408
   macro avg       0.50      0.51      0.51       408
weighted avg       0.55      0.54      0.55       408

Evaluación Random Forest:
              precision    recall  f1-score   support

           0       0.49      0.65      0.56        55
           1       0.73      0.70      0.