In [15]:
import os
import joblib
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

CSV_PATH = "../data/winequality-red.csv"
def load_csv(path=CSV_PATH):
    """Load the dataset CSV, accepting either ';' or ',' as the delimiter.

    The file is first parsed with ';'. If that parse yields a single column,
    or fails with a tokenizing error, the file is re-read with ','.

    Parameters
    ----------
    path : str
        Location of the CSV file. Defaults to ``CSV_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Dataset not found at {path}. Please place dataset as {path} (relative to the project root).")
    try:
        df = pd.read_csv(path, sep=';')
    except pd.errors.ParserError:
        # Only a tokenizing failure can be cured by switching delimiter; any
        # other error (permissions, encoding, empty file) is left to propagate
        # instead of being swallowed and retried.
        return pd.read_csv(path, sep=',')
    if df.shape[1] == 1:
        # Everything landed in one column, so ';' was not the real delimiter.
        df = pd.read_csv(path, sep=',')
    return df

# Read the dataset from the default CSV_PATH; later cells use `df`.
df = load_csv()
print("Loaded:", df.shape)
# Last expression of the cell, so the first rows are rendered as its output.
df.head()

Loaded: (1599, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [16]:
# Basic EDA
# Column dtypes and non-null counts (df.info() prints directly).
df.info()
print("Missing values:\n", df.isnull().sum())
# Summary statistics, one row per column. This must be the cell's final
# expression: a bare expression in the middle of a cell is evaluated and
# discarded, so the table would never be displayed.
df.describe().T


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
Missing values:
 fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlor

In [17]:
import matplotlib.pyplot as plt
%matplotlib inline

plots_dir = "./plots"
os.makedirs(plots_dir, exist_ok=True)
# Write one 30-bin histogram PNG per column; nothing is shown inline because
# each figure is closed right after it is saved.
for col in df.columns:
    fig, ax = plt.subplots(figsize=(6, 3))
    df[col].hist(bins=30, ax=ax)
    ax.set_title(col)
    fig.tight_layout()
    fig.savefig(os.path.join(plots_dir, f"hist_{col}.png"))
    plt.close(fig)
print("Saved histograms to", plots_dir)


Saved histograms to ./plots


In [18]:
# 'quality' is the regression target; stop early if the column is absent.
if "quality" not in df:
    raise ValueError("Dataset must contain 'quality' column as target.")

# Target vector and feature matrix (every column except the target).
y = df["quality"]
X = df.drop(columns=["quality"])

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Train/test:", X_train.shape, X_test.shape)

# Models
# Baseline estimators keyed by display name. The linear model is wrapped in a
# pipeline so features are standardized before fitting.
models = {
    "LinearRegression": Pipeline(
        steps=[
            ("scaler", StandardScaler()),
            ("lr", LinearRegression()),
        ]
    ),
    "RandomForest": RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=100,
        random_state=42,
    ),
}

# Hyper-parameter grids explored for the two tree ensembles.
rf_grid = {"n_estimators": [100,150], "max_depth":[None,10]}
gb_grid = {"n_estimators": [100,150], "learning_rate":[0.05,0.1], "max_depth":[3,5]}


def _grid_search(estimator, param_grid):
    """Run a 3-fold grid search scored by negative MSE on the training split.

    Returns the fitted GridSearchCV object.
    """
    search = GridSearchCV(
        estimator,
        param_grid,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
    )
    return search.fit(X_train, y_train)


rf_gs = _grid_search(RandomForestRegressor(random_state=42, n_jobs=-1), rf_grid)
print("RF best params:", rf_gs.best_params_)

gb_gs = _grid_search(GradientBoostingRegressor(random_state=42), gb_grid)
print("GB best params:", gb_gs.best_params_)

# Estimators compared on the held-out split: the scaled linear baseline and
# the best estimator found by each grid search.
candidates = {
    "LinearRegression": models["LinearRegression"],
    "RandomForest_Tuned": rf_gs.best_estimator_,
    "GradientBoosting_Tuned": gb_gs.best_estimator_
}

# NOTE(review): the winner below is chosen using metrics computed on the test
# split, so the reported test scores of the selected model are optimistically
# biased. Consider selecting on a validation split or cross-validation score.
metrics = {}
for name, model in candidates.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # RMSE is computed as sqrt(MSE). The `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so relying on it raises TypeError on current releases. float() keeps a
    # plain Python float in `metrics` regardless of the numpy version.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    metrics[name] = {"RMSE": rmse, "MAE": mae, "R2": r2}
    print(f"{name}: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}")

# Lowest RMSE wins.
best_name = min(metrics, key=lambda k: metrics[k]["RMSE"])
best_model = candidates[best_name]
print("Selected:", best_name, metrics[best_name])


Train/test: (1279, 11) (320, 11)
RF best params: {'max_depth': None, 'n_estimators': 100}
RF best params: {'max_depth': None, 'n_estimators': 100}
GB best params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 150}
LinearRegression: RMSE=0.6245, MAE=0.5035, R2=0.4032
GB best params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 150}
LinearRegression: RMSE=0.6245, MAE=0.5035, R2=0.4032
RandomForest_Tuned: RMSE=0.5489, MAE=0.4224, R2=0.5390
RandomForest_Tuned: RMSE=0.5489, MAE=0.4224, R2=0.5390
GradientBoosting_Tuned: RMSE=0.5980, MAE=0.4689, R2=0.4527
Selected: RandomForest_Tuned {'RMSE': 0.5488516420673258, 'MAE': 0.4224375, 'R2': 0.5390429623873638}
GradientBoosting_Tuned: RMSE=0.5980, MAE=0.4689, R2=0.4527
Selected: RandomForest_Tuned {'RMSE': 0.5488516420673258, 'MAE': 0.4224375, 'R2': 0.5390429623873638}


In [19]:
# Persist the selected estimator; the relative path resolves to the project
# root when the notebook runs from notebooks/.
joblib.dump(best_model, "../model.pkl")
print("Saved ../model.pkl")

# One line per candidate, then a blank line and the name of the winner.
report_lines = [
    f"{model_name}: RMSE={scores['RMSE']:.4f}, MAE={scores['MAE']:.4f}, R2={scores['R2']:.4f}\n"
    for model_name, scores in metrics.items()
]
report_lines.append(f"\nSelected: {best_name}\n")
with open("../test_metrics.txt", "w") as report_file:
    report_file.writelines(report_lines)
print("Saved ../test_metrics.txt")


Saved ../model.pkl
Saved ../test_metrics.txt
