In [None]:
#0. Thêm thư viện
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from scipy import stats
import random

In [None]:
# 1. Load and preprocess
# Keep only the modelling columns, then impute missing age/fare with the column median.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test splits performed further down, so a little information from the
# held-out rows reaches the training folds.
df = pd.read_csv('titanic.csv')[['survived', 'pclass', 'sex', 'age', 'fare']]
df = df.fillna({'age': df['age'].median(), 'fare': df['fare'].median()})

# Logistic model: pclass/sex main effects plus their interaction, and age and fare.
formula = 'survived ~ pclass * sex + age + fare'

In [None]:
# 2. AUC distribution over 100 random train/test splits (stability & randomness)
# The split seeds are drawn from a dedicated, seeded generator instead of the
# global `random` state, so a Restart & Run All reproduces the same 100 splits
# and therefore the same summary statistics.
seed_source = random.Random(42)

scores = []
for k in range(100):
    # Stratify on the target so each split keeps the survived/not-survived ratio.
    train, test = train_test_split(df, test_size=0.3, stratify=df.survived,
                                   random_state=seed_source.randint(0, 10000))
    res = sm.GLM.from_formula(formula, data=train, family=sm.families.Binomial()).fit()
    # AUC is computed on the held-out 30% only.
    scores.append(roc_auc_score(test.survived, res.predict(test)))

df_res = pd.DataFrame(scores, columns=['scores'])
print(f"Phân phối AUC (100 lần): Mean={df_res.scores.mean():.4f}, Std={df_res.scores.std():.4f}")
# pandas reports excess (Fisher) kurtosis, so a normal distribution gives ~0.
print(f"Skewness: {df_res.scores.skew():.4f}, Kurtosis: {df_res.scores.kurtosis():.4f}")

Phân phối AUC (100 lần): Mean=0.8329, Std=0.0193
Skewness: -0.0085, Kurtosis: -0.0345


In [None]:
# 3. Stability comparison: plain K-Fold vs. Stratified K-Fold
def check_cv(method, name):
    """Print the mean cross-validated AUC and its spread across repeated CV runs.

    The cross-validation is repeated 5 times. In each repetition the binomial
    GLM described by the module-level ``formula`` is fitted on every training
    fold of the module-level ``df`` and scored (ROC AUC) on the matching
    held-out fold; the fold scores are averaged into one value per repetition.

    Parameters
    ----------
    method : splitter object
        Cross-validation splitter exposing ``split(X, y)``, e.g. ``KFold`` or
        ``StratifiedKFold``.
    name : str
        Label printed in front of the results.

    Returns
    -------
    None
        The summary line is printed, nothing is returned.
    """
    import copy  # local import: only needed here to clone the splitter

    # A splitter built with a fixed integer random_state yields exactly the
    # same folds on every split() call. Repeating the CV with it unchanged
    # gives 5 identical means and a reported Std of 0, which says nothing
    # about stability. Each repetition therefore gets its own derived seed;
    # repetition 0 keeps the caller's seed.
    base_seed = getattr(method, "random_state", None)
    reseed = isinstance(base_seed, (int, np.integer))

    stabs = []
    for z in range(5):  # repeat 5 times to measure run-to-run stability
        splitter = method
        if reseed:
            # Work on a shallow copy so the caller's splitter is not mutated.
            splitter = copy.copy(method)
            splitter.random_state = int(base_seed) + z

        val_res = []
        # y is always passed: StratifiedKFold needs it and KFold ignores it,
        # so there is no need to infer the splitter type from its label.
        for train_idx, test_idx in splitter.split(df, df.survived):
            train, test = df.iloc[train_idx], df.iloc[test_idx]
            model = sm.GLM.from_formula(formula, data=train, family=sm.families.Binomial()).fit()
            val_res.append(roc_auc_score(test.survived, model.predict(test)))
        stabs.append(np.mean(val_res))
    print(f"{name} AUC: {np.mean(stabs):.4f} | Độ biến thiên giữa các lần chạy (Std): {np.std(stabs):.6f}")

# Both splitters get the same fold count, shuffling and seed, so only the
# splitting strategy differs between the two reports.
for cv_class, cv_label in ((KFold, "Normal K-Fold"), (StratifiedKFold, "Stratified K-Fold")):
    check_cv(cv_class(n_splits=5, shuffle=True, random_state=42), cv_label)

Normal K-Fold AUC: 0.8356 | Độ biến thiên giữa các lần chạy (Std): 0.000000
Stratified K-Fold AUC: 0.8335 | Độ biến thiên giữa các lần chạy (Std): 0.000000


In [None]:
# 4. Overfitting check: in-sample (train) AUC vs. held-out (validation) AUC
kf_final = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_final = []
valid_final = []
for train_idx, test_idx in kf_final.split(df, df.survived):
    train = df.iloc[train_idx]
    test = df.iloc[test_idx]
    res = sm.GLM.from_formula(formula, data=train, family=sm.families.Binomial()).fit()

    # Score the same fitted model on the rows it was trained on and on the
    # held-out fold; a large gap between the two would indicate overfitting.
    train_auc = roc_auc_score(train.survived, res.predict(train))
    valid_auc = roc_auc_score(test.survived, res.predict(test))
    train_final.append(train_auc)
    valid_final.append(valid_auc)

print(f"\nKiểm tra Overfit: Train AUC ({np.mean(train_final):.4f}) vs Valid AUC ({np.mean(valid_final):.4f})")


Kiểm tra Overfit: Train AUC (0.8371) vs Valid AUC (0.8335)
