In [2]:
# Primero empezaré com el análisis de los datos
import pandas as pd
import numpy as np

df = pd.read_csv('../Data/winequality-red.csv')
print(df.head()) #Para ver si está todo bien cargado

  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

In [3]:
print(df.info()) #Analizo los valores nulos

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 1 columns):
 #   Column                                                                                                                                                                   Non-Null Count  Dtype 
---  ------                                                                                                                                                                   --------------  ----- 
 0   fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"  1599 non-null   object
dtypes: object(1)
memory usage: 12.6+ KB
None


In [8]:
print(df.isnull().sum())

fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"    0
dtype: int64


In [5]:
print(df.dtypes) #Para ver los tipos de datos que tenemos

fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"    object
dtype: object


In [6]:
#Por lo que puedo ver están bien los datos y no tenemos valores nulos.. veremos duplicados
print(df.duplicated().sum())

240


In [9]:
#No tenemos duplicados, por ahora todo perfecto.
#Ahora vamos a ver si tenemos outliers
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

fig = px.box(df, y='quality')
fig.show()

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'] but received: quality

In [None]:
#Haré gráficos para ver como se comportan los datos
fig = px.histogram(df, x='pH')
fig.show()

In [None]:
#Haré una correlación para ver cuales son los más influyentes para 'quality'
correlation_matrix = df.corr(method = 'pearson')
plt.figure(figsize = (12, 10))
sns.heatmap(correlation_matrix, annot = True, cmap = 'coolwarm', center = 0)
plt.title('Mapa de Calor de la Correlación de Pearson')
plt.show()

In [None]:
#Empezaremos el preprocesamiento manteniendo todas las columnas excepto ID y analizaré el rendimiento
#Si veo que es muy bajo veo para eliminar algunas columnas
df = df.drop(['Id'], axis = 1)
print(df.head())

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer

X = df.drop(columns=['quality']) #Con esto selecciono todas las columnas, excepto quality
y = df['quality'] #Esta es mi variable objetivo

#Ahora divido el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Divido las características numéricas y categóricas
num_columns = X.select_dtypes(include=['float64', 'int64']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_columns),
    ]
)

#Ahora hago el pipeline completo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

#Ajusto ahora el pipeline a los datos de entrenamiento
pipeline.fit(X_train, y_train)

In [None]:
#Ahora empiezo con los modelos predictivos


linear_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio LinearRegression: {mse}')
print(f'R-cuadrado LinearRegression: {r2}')

#Voy con otro modelo ahora
#Random Forest
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators = 100, random_state=42))
])

rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio Random Forest: {mse}')
print(f'R-cuadrado Random Forest: {r2}')

In [None]:
#En esto tenemos que el RandomForest es el mejor para este caso porque presenta unos mejores
#resultados, pero trataremos de mejorarlo
#KNN

knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5))
])

knn_pipeline.fit(X_train, y_train)
y_pred = knn_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio KNN: {mse}')
print(f'R-cuadrado KNN: {r2}')

In [None]:
#Sigue siendo Random Forest el mejor
#Utilizaré validación cruzada para seleccionar los mehores hiperparámetros
from sklearn.model_selection import cross_val_score

# Probar diferentes valores de K
k_values = list(range(1, 21))
mse_scores = []

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    mse_scores.append(scores.mean())

# Seleccionar el K con el menor MSE
best_k = k_values[mse_scores.index(max(mse_scores))]
print(f'Mejor valor de K: {best_k}')

In [None]:
#Ahora voy a probar el modelo con la mejor k
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=15))
])

knn_pipeline.fit(X_train, y_train)
y_pred = knn_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio KNN: {mse}')
print(f'R-cuadrado KNN: {r2}')

In [None]:
#Haré el árbol de decisión 
from sklearn.tree import DecisionTreeRegressor

tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

tree_pipeline.fit(X_train, y_train)
y_pred = tree_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio Decision Tree: {mse}')
print(f'R-cuadrado Decision Tree: {r2}')

In [None]:
#Sigue siendo RandomForest el mejor
#Trataré de mejorar el modelo eliminando algunas columnas primeramente 
df2 = df.drop(columns = ['residual sugar', 'free sulfur dioxide', 'pH'])

#Ahora hago todo de nuevo
X = df2.drop(columns=['quality']) #Con esto selecciono todas las columnas, excepto quality
y = df2['quality'] #Esta es mi variable objetivo

#Ahora divido el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Divido las características numéricas y categóricas
num_columns = X.select_dtypes(include=['float64', 'int64']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_columns),
    ]
)

#Ahora hago el pipeline completo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

#Ajusto ahora el pipeline a los datos de entrenamiento
pipeline.fit(X_train, y_train)

#Ahora empiezo con los modelos predictivos
from sklearn.metrics import mean_squared_error, r2_score

linear_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio LinearRegression: {mse}')
print(f'R-cuadrado LinearRegression: {r2}')

#Voy con otro modelo ahora
#Random Forest
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators = 100, random_state=42))
])

rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio Random Forest: {mse}')
print(f'R-cuadrado Random Forest: {r2}')

#KNN

knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5))
])

knn_pipeline.fit(X_train, y_train)
y_pred = knn_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio KNN: {mse}')
print(f'R-cuadrado KNN: {r2}')

# Probar diferentes valores de K
k_values = list(range(1, 21))
mse_scores = []

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    mse_scores.append(scores.mean())

# Seleccionar el K con el menor MSE
best_k = k_values[mse_scores.index(max(mse_scores))]
print(f'Mejor valor de K: {best_k}')

In [None]:
#Ahora voy a probar el modelo con la mejor k
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=8))
])

knn_pipeline.fit(X_train, y_train)
y_pred = knn_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio KNN: {mse}')
print(f'R-cuadrado KNN: {r2}')

#Haré el árbol de decisión 
from sklearn.tree import DecisionTreeRegressor

tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

tree_pipeline.fit(X_train, y_train)
y_pred = tree_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio Decision Tree: {mse}')
print(f'R-cuadrado Decision Tree: {r2}')

In [None]:
#No consigo mejorar mucho haciendo esto, probaré eliminando outliers
fig = px.box(df2, y='fixed acidity')
fig.show()

In [None]:
df2 = df2[(df2['fixed acidity'] >= 4.6) & (df2['fixed acidity'] <= 12)]
fig = px.box(df2, y='volatile acidity')
fig.show()

In [None]:
df2 = df2[(df2['volatile acidity'] >= 0.12) & (df2['volatile acidity'] <= 1.005)]
fig = px.box(df2, y='citric acid')
fig.show()

In [None]:
df2 = df2[(df2['citric acid'] >= 0) & (df2['citric acid'] <= 0.76)]
fig = px.box(df2, y='chlorides')
fig.show()

In [None]:
#Estos son demasiados outliers, eliminaré demasiado datos si los elimino
#Eliminaré solo los que están muy fuera
df2 = df2[(df2['chlorides'] >= 0.012) & (df2['chlorides'] < 0.332)]
fig = px.box(df2, y='total sulfur dioxide')
fig.show()

In [None]:
#Eliminaré solo los que están muy fuera
df2 = df2[(df2['total sulfur dioxide'] >= 6) & (df2['total sulfur dioxide'] < 165)]
fig = px.box(df2, y='density')
fig.show()

In [None]:
#Eliminaré solo los que están muy fuera
df2 = df2[(df2['density'] >= 0.99154) & (df2['density'] < 1.001)]
fig = px.box(df2, y='sulphates')
fig.show()

In [None]:
#Eliminaré solo los que están muy fuera
df2 = df2[(df2['sulphates'] >= 0.33) & (df2['sulphates'] < 1.36)]
fig = px.box(df2, y='alcohol')
fig.show()

In [None]:
df2 = df2[(df2['alcohol'] >= 8.5) & (df2['alcohol'] <= 13.4)]
#Ahora vuelvo a hacer la prediccion
X = df2.drop(columns=['quality']) #Con esto selecciono todas las columnas, excepto quality
y = df2['quality'] #Esta es mi variable objetivo

#Ahora divido el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Divido las características numéricas y categóricas
num_columns = X.select_dtypes(include=['float64', 'int64']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_columns),
    ]
)

#Ahora hago el pipeline completo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

#Ajusto ahora el pipeline a los datos de entrenamiento
pipeline.fit(X_train, y_train)

#Ahora empiezo con los modelos predictivos
from sklearn.metrics import mean_squared_error, r2_score

linear_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio LinearRegression: {mse}')
print(f'R-cuadrado LinearRegression: {r2}')

#Voy con otro modelo ahora
#Random Forest
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators = 100, random_state=42))
])

rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio Random Forest: {mse}')
print(f'R-cuadrado Random Forest: {r2}')

#KNN

knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5))
])

knn_pipeline.fit(X_train, y_train)
y_pred = knn_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio KNN: {mse}')
print(f'R-cuadrado KNN: {r2}')

# Probar diferentes valores de K
k_values = list(range(1, 21))
mse_scores = []

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    mse_scores.append(scores.mean())

# Seleccionar el K con el menor MSE
best_k = k_values[mse_scores.index(max(mse_scores))]
print(f'Mejor valor de K: {best_k}')

In [None]:
#Ahora voy a probar el modelo con la mejor k
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=16))
])

knn_pipeline.fit(X_train, y_train)
y_pred = knn_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio KNN: {mse}')
print(f'R-cuadrado KNN: {r2}')

#Haré el árbol de decisión 
from sklearn.tree import DecisionTreeRegressor

tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

tree_pipeline.fit(X_train, y_train)
y_pred = tree_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Error Cuadrático Medio Decision Tree: {mse}')
print(f'R-cuadrado Decision Tree: {r2}')

In [None]:
# Eliminar los outliers empeoró la predicción, así que me quedo con los outliers y elijo la mejor
#que fue el randomForest del primer Dataset
#Convertiré quality a una variable categórica
df['quality'] = df['quality'].apply(lambda x: 1 if x >=6 else 0)

#Con esto decimos que más o igual a 6 es alta calidad y menos es de baja calidad
#Redefino los conjuntos de entrenamiento y prueba
X = df.drop(columns=['quality'])
y = df['quality']

print(y.value_counts())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#Ya se que el random Forest es el mejor, usaré RandomForestClassifier

num_columns = X.select_dtypes(include=['float64', 'int64']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_columns),
    ]
)

#Ahora hago el pipeline completo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

rf_pipeline_best_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators = 100, random_state=42))
])

rf_pipeline_best_model.fit(X_train, y_train)
y_pred = rf_pipeline_best_model.predict(X_test)

#Ahora los evalúo en los modelos de clasificación
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

y_pred_rf = rf_pipeline_best_model.predict(X_test)

#Calculo las metricas de clasificación
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

#Generar el informe de clasificacion
report = classification_report(y_test, y_pred_rf)
print(report)

#Hago la matriz de confusión
print(f'La matriz de confusión para Random Forest es:')
print(confusion_matrix(y_test, y_pred_rf))