# Exercise 2.2.
# Wine Quality Data Set: "data/wines.csv"
# source: https://archive.ics.uci.edu/ml/datasets/wine+quality
# The file contains data on samples of white and red Portuguese wine
# Vinho Verde.
# Various physico-chemical characteristics of individual samples
# are available as well as wine quality scores on a point scale (0-10)
# made by specialists.

# Re-run your best models for all algorithms for 5-fold CV.
# Check the stability of results for repeated K-fold
# Check in repeated k-fold CV if adding stratification changes your results (stability).
# Compare the effect of stratification with titanic problem.
# Check if you didnt overfit in your models. Check if you can imrpove you validation score.


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, RepeatedKFold, RepeatedStratifiedKFold, train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [8]:
df = pd.read_csv('/mnt/wines.csv')
df = df.dropna()
df = pd.get_dummies(df, columns=["type"], drop_first=True, dtype='uint8')
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,type_white
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,0
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,1
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,1
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,1
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,1


In [9]:
X = df.drop(columns=['quality'])
y = df['quality']

In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X) # scaling
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [11]:
log_reg = LogisticRegression(max_iter=2000, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

models = {
    "Logistic Regression": log_reg,
    "Random Forest": rf_clf
}

In [17]:
for name, model in models.items():
    print(f"{name}")
    scores = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy')
    print(f"Normal 5-Fold Accuracy: {scores.mean():.4f} (Std: {scores.std() * 2:.4f})")
    kf_shuffled = KFold(n_splits=5, shuffle=True, random_state=42)
    scores_shuffled = cross_val_score(model, X_scaled, y, cv=kf_shuffled, scoring='accuracy')
    print(f"Shuffled 5-Fold Accuracy: {scores_shuffled.mean():.4f} (Std: {scores_shuffled.std() * 2:.4f})")

Logistic Regression
Normal 5-Fold Accuracy: 0.4970 (Std: 0.0894)
Shuffled 5-Fold Accuracy: 0.5438 (Std: 0.0168)
Random Forest
Normal 5-Fold Accuracy: 0.4815 (Std: 0.1096)
Shuffled 5-Fold Accuracy: 0.6826 (Std: 0.0159)


In [18]:
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)

for name, model in models.items():
    scores = cross_val_score(model, X_scaled, y, cv=rkf, scoring='accuracy')
    print(f"{name} - Repeated K-Fold Accuracy: {scores.mean():.4f} (Std: {scores.std():.4f})")

Logistic Regression - Repeated K-Fold Accuracy: 0.5431 (Std: 0.0094)
Random Forest - Repeated K-Fold Accuracy: 0.6881 (Std: 0.0128)


In [19]:
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)

for name, model in models.items():
    scores = cross_val_score(model, X_scaled, y, cv=rskf, scoring='accuracy')
    print(f"{name} - Repeated Stratified Accuracy: {scores.mean():.4f} (Std: {scores.std():.4f})")

Logistic Regression - Repeated Stratified Accuracy: 0.5436 (Std: 0.0096)
Random Forest - Repeated Stratified Accuracy: 0.6907 (Std: 0.0087)


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

for name, model in models.items():
    print(f"\nChecking {name}...")
    model.fit(X_train, y_train)

    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)

    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy:  {test_acc:.4f}")
    if train_acc == 1:
      print("OVERFITTING!!")
      break
    if train_acc > test_acc + 0.05:
        print("Could be overfitting")
    else:
        print("No significant overfitting")


Checking Logistic Regression...
Train Accuracy: 0.5494
Test Accuracy:  0.5415
No significant overfitting

Checking Random Forest...
Train Accuracy: 1.0000
Test Accuracy:  0.6669
OVERFITTING!!
