In [1]:
# === wine_quality_training.py ===
# Run in a notebook or as a script. Adjust file paths as needed.

import pandas as pd
import numpy as np
from pathlib import Path

# ML imports
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, classification_report)
import joblib

# 1) Load raw CSV (change path to your CSV)
# NOTE: a relative path is resolved against the current working directory,
# which is usually not the same for a notebook kernel and a script launch.
data_path = Path("../data/winequality-red.csv")  # adjust
if not data_path.is_file():
    # Fail early and report the absolute location that was tried, so a wrong
    # working directory is obvious from the message alone.
    raise FileNotFoundError(
        f"Dataset not found at '{data_path.resolve()}' "
        f"(current working directory: '{Path.cwd()}'). "
        "Point data_path at your copy of winequality-red.csv."
    )
df = pd.read_csv(data_path)
print("Raw shape:", df.shape)
# `display` is injected by IPython/Jupyter only; fall back to print so the
# same code also runs as a plain script.
try:
    display(df.head())
except NameError:
    print(df.head())

# 2) Basic cleanup / EDA
# - Standardize column names (trim, lower-case, spaces -> underscores)
df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]

# - Report missing values per column (some rows may be incomplete)
print("Missing per column:\n", df.isna().sum())

# - Remove rows that are entirely empty (e.g., blank lines in the CSV)
df = df.dropna(how='all')

# - The label column must exist; fail with a readable message rather than a
#   bare KeyError on the lookup below.
if 'quality' not in df.columns:
    raise KeyError(
        "Expected a 'quality' column, found: "
        f"{list(df.columns)}. If everything was read as a single column, "
        "the CSV probably uses a different delimiter (try sep=';')."
    )

# - Rows without a label cannot be used for supervised training. Without
#   this step NaN >= 7 evaluates to False, so such rows would silently be
#   labelled "not good".
n_unlabelled = int(df['quality'].isna().sum())
if n_unlabelled:
    print(f"Dropping {n_unlabelled} row(s) with missing 'quality'")
    df = df.dropna(subset=['quality'])

# 3) Define binary target: good (quality >= 7) vs not good (<7)
df['target'] = (df['quality'] >= 7).astype(int)
df = df.drop(columns=['quality'])  # keep only features + target

# 4) Hold out a test set. Stratifying on the label keeps the class ratio the
#    same in both partitions (the target can be imbalanced).
y = df['target']
X = df.drop(columns='target')
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 5) Imputation strategy
# KNN imputation suits moderate missingness in all-numeric features. It picks
# neighbours by a NaN-aware Euclidean distance, so it should see standardized
# features; otherwise wide-range columns dominate the distance.
imputer = KNNImputer(n_neighbors=5)

# 6) Preprocessing and model pipeline
# The scaler runs first: StandardScaler disregards NaNs when fitting and keeps
# them on transform, so the imputer still receives the gaps it has to fill.
# Step names are unchanged, so 'clf__*' grid keys keep working.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', imputer),
    ('clf', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

# 7) Quick baseline: 5-fold cross-validated ROC-AUC of the untuned pipeline
baseline_cv = dict(cv=5, scoring='roc_auc', n_jobs=-1)
cv_scores = cross_val_score(pipeline, X_train, y_train, **baseline_cv)
mean_auc = cv_scores.mean()
print("CV ROC-AUC (baseline RF):", mean_auc)

# 8) Hyperparameter search over a deliberately small grid (keeps it fast).
#    Keys are prefixed with the pipeline step name of the classifier.
param_grid = dict(
    clf__n_estimators=[100, 250],
    clf__max_depth=[None, 10, 20],
    clf__min_samples_split=[2, 5],
)

gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=4,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

# Keep the pipeline refitted with the winning parameter combination.
best_model = gs.best_estimator_

print("Best params:", gs.best_params_)
print("Best CV ROC-AUC:", gs.best_score_)

# 9) Optionally calibrate probabilities (helps confidence scores)
# The calibrator must be fitted on data the classifier was NOT trained on.
# best_model has already seen all of X_train, so calibrating it "prefit" on
# X_train again would learn from over-confident training-set scores. With
# cv=5, CalibratedClassifierCV instead refits clones of best_model on the
# training folds and fits each isotonic calibrator on the held-out fold;
# best_model itself is left untouched.
cal = CalibratedClassifierCV(best_model, cv=5, method='isotonic')
cal.fit(X_train, y_train)
final_model = cal

# 10) Score the final model on the held-out test set
X_test_prep = X_test.copy()
y_pred = final_model.predict(X_test_prep)
# Column 1 of predict_proba is the probability of the positive ('good') class.
y_proba = final_model.predict_proba(X_test_prep)[:, 1]

# Each metric is computed lazily, right before its line is printed, so the
# output order and timing match a plain sequence of print calls.
report_steps = (
    ("Accuracy:", lambda: accuracy_score(y_test, y_pred)),
    ("Classification report:\n", lambda: classification_report(y_test, y_pred)),
    ("ROC-AUC:", lambda: roc_auc_score(y_test, y_proba)),
    ("Confusion matrix:\n", lambda: confusion_matrix(y_test, y_pred)),
)
for heading, compute in report_steps:
    print(heading, compute())

# 11) Persist the final model together with the training feature order
out_dir = Path("../models")
out_dir.mkdir(parents=True, exist_ok=True)
model_file = out_dir / "wine_quality_model.joblib"
names_file = out_dir / "feature_names.joblib"
joblib.dump(final_model, model_file)
joblib.dump(X.columns.tolist(), names_file)
print("Model and feature names saved to", out_dir)

# 12) Also write the feature order as a one-column CSV (optional)
feature_table = pd.DataFrame({'feature': X.columns})
feature_table.to_csv(out_dir / "feature_list.csv", index=False)


FileNotFoundError: [Errno 2] No such file or directory: '../data/winequality-red.csv'