In [1]:
#0. Thêm thư viện
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import random

In [2]:
# 1. Load và Tiền xử lý
wines = pd.read_csv('wines.csv')
# Tạo biến mục tiêu nhị phân: 1 nếu quality > 5 (Rượu tốt), ngược lại 0
wines['good_wine'] = (wines['quality'] > 5).astype(int)
# Mã hóa loại rượu: white=1, red=0
wines['is_white'] = (wines['type'] == 'white').astype(int)

# Các biến độc lập chính
cols = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
        'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
        'pH', 'sulphates', 'alcohol', 'is_white']
formula = 'good_wine ~ ' + ' + '.join(cols)

In [3]:
# 2. Phân phối AUC 100 lần (Stability & Randomness)
scores_100 = []
for _ in range(100):
    train, test = train_test_split(wines, test_size=0.3, stratify=wines.good_wine,
                                   random_state=random.randint(0, 10000))
    model = sm.GLM.from_formula(formula, data=train, family=sm.families.Binomial()).fit()
    scores_100.append(roc_auc_score(test.good_wine, model.predict(test)))

print(f"Độ ổn định (100 lần chạy): Mean AUC = {np.mean(scores_100):.4f}, Std = {np.std(scores_100):.4f}")

Độ ổn định (100 lần chạy): Mean AUC = 0.8025, Std = 0.0085


In [4]:
# 3. So sánh Stability: Normal K-Fold vs Stratified K-Fold
def compare_cv(method, name):
    stabs = []
    for r in range(5): # lặp lại 5 lần để kiểm tra độ ổn định
        fold_scores = []
        for tr_idx, te_idx in method.split(wines, wines.good_wine if 'Stratified' in name else None):
            train, test = wines.iloc[tr_idx], wines.iloc[te_idx]
            model = sm.GLM.from_formula(formula, data=train, family=sm.families.Binomial()).fit()
            fold_scores.append(roc_auc_score(test.good_wine, model.predict(test)))
        stabs.append(np.mean(fold_scores))
    return np.mean(stabs), np.std(stabs)

mean_k, std_k = compare_cv(KFold(n_splits=5, shuffle=True, random_state=42), "K-Fold")
mean_s, std_s = compare_cv(StratifiedKFold(n_splits=5, shuffle=True, random_state=42), "Stratified K-Fold")

print(f"K-Fold: Mean AUC = {mean_k:.4f}, Std = {std_k:.6f}")
print(f"Stratified K-Fold: Mean AUC = {mean_s:.4f}, Std = {std_s:.6f}")

K-Fold: Mean AUC = 0.8027, Std = 0.000000
Stratified K-Fold: Mean AUC = 0.8029, Std = 0.000000


In [5]:
# 4. Kiểm tra Overfitting (Train AUC vs Valid AUC)
improved_formula = 'good_wine ~ volatile_acidity + residual_sugar + total_sulfur_dioxide + density + pH + sulphates + alcohol + is_white + alcohol:volatile_acidity + alcohol:residual_sugar'

train_final, valid_final = [], []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for tr_idx, te_idx in skf.split(wines, wines.good_wine):
    train, test = wines.iloc[tr_idx], wines.iloc[te_idx]
    res = sm.GLM.from_formula(improved_formula, data=train, family=sm.families.Binomial()).fit()
    train_final.append(roc_auc_score(train.good_wine, res.predict(train)))
    valid_final.append(roc_auc_score(test.good_wine, res.predict(test)))

print(f"\nKiểm tra Overfit (Mô hình cải tiến): Train AUC ({np.mean(train_final):.4f}) vs Valid AUC ({np.mean(valid_final):.4f})")


Kiểm tra Overfit (Mô hình cải tiến): Train AUC (0.8026) vs Valid AUC (0.8016)
