In [50]:
import sys
import os

import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score

import numpy as np
import pandas as pd
import seaborn as sns

# sys.path.append(os.path.abspath("../.."))
# from wine_quality import dataset

The objective of this EDA is to visualize the patterns in the wine dataset and their interaction with quality. The question to be answered is: what are the components of win that affect its quality.

This part combines the red and white datasets and adds a new feature (is_red) to distinguish the color of the wine. Each numerical feature was standardized to better visualize the data.

In [None]:
# Load datasets using .env variable
# red_raw_df = dataset.load_raw_data("winequality-red.csv", ";")
# white_raw_df = dataset.load_raw_data("winequality-white.csv", ";")

#Load datasets locally
red_raw_df = pd.read_csv("../../data/raw/winequality-red.csv", delimiter=";")
white_raw_df = pd.read_csv("../../data/raw/winequality-white.csv", delimiter=";")


# Add 'is_red' column (1 for red, 0 for white)
red_raw_df["is_red"] = 1
white_raw_df["is_red"] = 0

# Combine datasets
combined_raw_df = pd.concat([red_raw_df, white_raw_df], ignore_index=True)

display(combined_raw_df)
# # Select features for scaling and encoding. Removed encoding, since there are not categorical values
# bool_features = combined_raw_df.select_dtypes(include=[np.number, 'bool']).columns[combined_raw_df.nunique() == 2].tolist()
# # cat_features = combined_raw_df.select_dtypes(include=["object"]).columns.tolist() 

# print(num_features)
# print(bool_features)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


Model Training
1) Categorization model for predicting whether wine is red or white
2) Regression model for predicting quality for all wines
3) Regression model for predicting quality for red wines
4) Regression model for predicting quality for white wines

Reasoning for using regression for quality prediction.

At first glance, there are three ways of predicting quality, First, is categorization with each quality as a class (3,4,5,6,7,and 8). This however wil be more complex than it should be because of its multi-class nature. Second is condensing quality into three levels: low, medium, and high. This is simpler because there a smaller number of categories but I am not an expert on classifying wines in such ways and I am not sure on which quality levels should be in low, medium, or high. The third option is regression; this option outputs a real number instead of a class. So it might predict a wine to be 4.5, 6.9, or 3.2. These quality levels are not present in the dataset but is still very useful since it treats quality as a scale.

In [None]:
# Choose target column
X = combined_raw_df.drop(columns=["is_red"])  # Features
y = combined_raw_df["is_red"]

num_features = X.select_dtypes(include=[np.number]).columns[X.nunique() > 2].tolist()

cat_preprocessor = ColumnTransformer(
    [
        ("num_scaler", StandardScaler(), num_features),  # Scale numerical
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define Logistic Regression with automatic cross-validation
model = LogisticRegressionCV(
    Cs=np.logspace(-3, 3, 7),
    max_iter=500,
    random_state=42
)

# Define full pipeline
pipeline = Pipeline([
    ("preprocess", cat_preprocessor),
    ("classifier", model)
])

# Train the pipeline on training data
pipeline.fit(X_train, y_train)

# Predict on test data
y_pred = pipeline.predict(X_test)

# Print Best C Values for Each Class
print(f"Best C values per class: {pipeline.named_steps['classifier'].C_}")

# Model Evaluation
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Best C values per class: [10.]
Test Accuracy: 1.00

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       980
           1       0.99      0.99      0.99       320

    accuracy                           1.00      1300
   macro avg       0.99      0.99      0.99      1300
weighted avg       1.00      1.00      1.00      1300



In [51]:
# Choose target column
X = combined_raw_df.drop(columns=["quality"])  # Features
y = combined_raw_df["quality"]

num_features = X.select_dtypes(include=[np.number]).columns[X.nunique() > 2].tolist()
preprocessor = ColumnTransformer(
    [
        ("num_scaler", StandardScaler(), num_features),  # Scale numerical
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Identify numerical columns (all are numerical in this dataset)
num_features = list(range(X.shape[1]))

# Define Linear Regression Model
model = LinearRegression()

# Define full pipeline
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("regressor", model)
])

# Stratified K-Fold Cross-Validation on Training Data (70%)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_mse = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='neg_mean_squared_error')
cross_val_r2 = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='r2')

print(f"Cross-Validation MSE: {-np.mean(cross_val_mse):.4f}")
print(f"Cross-Validation R²: {np.mean(cross_val_r2):.4f}")

# Train the final model on the full training data (70%)
pipeline.fit(X_train, y_train)

# Final Testing on 30% Test Set
y_test_pred = pipeline.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R²: {test_r2:.4f}")

Cross-Validation MSE: 0.5435
Cross-Validation R²: 0.2869
Test MSE: 0.5364
Test R²: 0.2975




The difference in chemical composition between red and white might have an effect to the quality level. The chemical composition of a high quality red wine might be different form a high quality white wine. Thus, two separated models were created for red and white.

In [52]:
# Choose target column
X = red_raw_df.drop(columns=["quality"])  # Features
y = red_raw_df["quality"]

num_features = X.select_dtypes(include=[np.number]).columns[X.nunique() > 2].tolist()
preprocessor = ColumnTransformer(
    [
        ("num_scaler", StandardScaler(), num_features),  # Scale numerical
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Identify numerical columns (all are numerical in this dataset)
num_features = list(range(X.shape[1]))

# Define Linear Regression Model
model = LinearRegression()

# Define full pipeline
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("regressor", model)
])

# Stratified K-Fold Cross-Validation on Training Data (70%)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_mse = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='neg_mean_squared_error')
cross_val_r2 = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='r2')

print(f"Cross-Validation MSE: {-np.mean(cross_val_mse):.4f}")
print(f"Cross-Validation R²: {np.mean(cross_val_r2):.4f}")

# Train the final model on the full training data (70%)
pipeline.fit(X_train, y_train)

# Final Testing on 30% Test Set
y_test_pred = pipeline.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R²: {test_r2:.4f}")

Cross-Validation MSE: 0.4276
Cross-Validation R²: 0.3458
Test MSE: 0.4064
Test R²: 0.3703


Model for red did better than the combined model with an r2 score of 0.3703 and 0.2975 respectively.

In [53]:

# Choose target column
X = white_raw_df.drop(columns=["quality"])  # Features
y = white_raw_df["quality"]

num_features = X.select_dtypes(include=[np.number]).columns[X.nunique() > 2].tolist()
preprocessor = ColumnTransformer(
    [
        ("num_scaler", StandardScaler(), num_features),  # Scale numerical
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Identify numerical columns (all are numerical in this dataset)
num_features = list(range(X.shape[1]))

# Define Linear Regression Model
model = LinearRegression()

# Define full pipeline
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("regressor", model)
])

# Stratified K-Fold Cross-Validation on Training Data (70%)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_mse = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='neg_mean_squared_error')
cross_val_r2 = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='r2')

print(f"Cross-Validation MSE: {-np.mean(cross_val_mse):.4f}")
print(f"Cross-Validation R²: {np.mean(cross_val_r2):.4f}")

# Train the final model on the full training data (70%)
pipeline.fit(X_train, y_train)

# Final Testing on 30% Test Set
y_test_pred = pipeline.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R²: {test_r2:.4f}")

Cross-Validation MSE: 0.5672
Cross-Validation R²: 0.2766
Test MSE: 0.5794
Test R²: 0.2619




Model for white however did poorer than the combined model and the red model suggesting that the quality for white wines is harder to predict with linear regression.