# Đây là nơi để test các model và đưa ra kết quả cho từng model, tổng hợp toàn bộ chúng vào file submission để submit lên Kaggle nhằm kiểm chứng chi tiết từng model

## Khai báo thư viện:


In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import os

## Đọc file

In [48]:

# Folder holding the preprocessed CSVs. This is a raw string, so each backslash
# is written once; doubling them inside r"..." would put two literal
# backslashes between every path component.
DATA_DIR = r"C:\Users\hienm\OneDrive\Desktop\Machine_Learning_2025-2026\challange2\data_afterfix"

# Feature-engineered training table and the matching test table.
train = pd.read_csv(os.path.join(DATA_DIR, "train_featured.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test_ready_for_model.csv"))
# Store the Id column as integers.
test['Id'] = test['Id'].astype(int)


## Chia dữ liệu:

In [49]:

# Feature matrix: every training column except the raw and log targets
# (either one is skipped silently if it is not present).
target_columns = ["SalePrice", "SalePrice_log"]
X = train.drop(columns=target_columns, errors='ignore')

# Regression target is log1p(SalePrice); Kaggle scores on log(SalePrice).
y = np.log1p(train["SalePrice"])

# Give the test matrix exactly the columns of X, in the same order.
feature_columns = X.columns
X_test = test[feature_columns]

print(" Dữ liệu sẵn sàng cho modeling!")

 Dữ liệu sẵn sàng cho modeling!


## K-Fold

In [50]:
# Default folder for submission files; created up front if it does not exist.
sub_dir = r"C:\Users\hienm\OneDrive\Desktop\Machine_Learning_2025-2026\challange2\submission"
os.makedirs(sub_dir, exist_ok=True)


def run_model(model, model_name, X, y, X_test, n_splits=5, *, test_ids=None, out_dir=None):
    """Cross-validate ``model``, average its test predictions and write a submission CSV.

    The same ``model`` object is refit on every fold, so after the call it
    holds the fit from the last fold. ``y`` is expected on the ``log1p`` scale:
    the averaged test predictions are mapped back with ``expm1`` before saving.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Label used in the printed report and, lower-cased, in the
            output file name ``submission_<name>.csv``.
        X: Training features; must support ``.iloc`` row selection.
        y: Training target; must support ``.iloc`` row selection.
        X_test: Test features passed to ``model.predict``.
        n_splits: Number of shuffled K-Fold splits (seed fixed at 42).
        test_ids: Ids written to the ``Id`` column, one per row of ``X_test``.
            Defaults to the module-level ``test['Id']``.
        out_dir: Folder the CSV is written to. Defaults to the module-level
            ``sub_dir``.

    Returns:
        RMSE between ``y`` and the out-of-fold predictions.

    Raises:
        ValueError: If ``test_ids`` and ``X_test`` differ in length.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if test_ids is None:
        test_ids = test['Id']
    if out_dir is None:
        out_dir = sub_dir

    # Fail before the expensive training loop rather than at DataFrame build time.
    if len(test_ids) != len(X_test):
        raise ValueError(
            f"test_ids has {len(test_ids)} rows but X_test has {len(X_test)}"
        )

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    print(f"\n=== {model_name.upper()} ===")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_tr, y_tr)
        val_pred = model.predict(X_val)
        oof_preds[val_idx] = val_pred
        # Each fold contributes 1/n_splits, so the sum is the mean over folds.
        test_preds += model.predict(X_test) / n_splits

        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        print(f"Fold {fold}: RMSE = {rmse:.5f}")

    cv_rmse = np.sqrt(mean_squared_error(y, oof_preds))
    print(f"→ {model_name} CV RMSE = {cv_rmse:.5f}")

    # --- Map log-scale predictions back to real values ---
    test_preds = np.expm1(test_preds)

    # --- Handle inf / NaN ---
    # Non-finite values become NaN, NaNs are replaced by the median of the
    # remaining predictions, then values are clipped to [0, 99.5th percentile].
    test_preds = np.where(np.isfinite(test_preds), test_preds, np.nan)
    test_preds = np.nan_to_num(test_preds, nan=np.nanmedian(test_preds))
    test_preds = np.clip(test_preds, 0, np.nanpercentile(test_preds, 99.5))

    # --- Write the submission file ---
    submission = pd.DataFrame({
        'Id': np.asarray(test_ids).astype(int),  # cast Ids to int
        'SalePrice': test_preds,
    })
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"submission_{model_name.lower()}.csv")
    submission.to_csv(out_path, index=False)
    print(f"✅ Saved submission: {out_path}\n")
    return cv_rmse

## LinearRegression

In [51]:
# Drop any remaining object (string) columns
# Drop any remaining object (string) columns.
X_numeric = X.select_dtypes(exclude=['object'])
X_test_numeric = X_test.select_dtypes(exclude=['object'])

# Keep only the columns present in both frames, in X's original order.
# Iterating a set of column names would give an order that depends on string
# hash randomisation and can therefore differ between kernel sessions, which
# makes runs of the seeded, column-subsampling models non-reproducible.
test_numeric_cols = set(X_test_numeric.columns)
common_cols = [col for col in X_numeric.columns if col in test_numeric_cols]
X_numeric = X_numeric[common_cols]
X_test_numeric = X_test_numeric[common_cols]

# Ordinary least squares with default settings.
linear_model = LinearRegression()
run_model(linear_model, "Linear", X_numeric, y, X_test_numeric)



=== LINEAR ===
Fold 1: RMSE = 0.10302
Fold 2: RMSE = 0.12927
Fold 3: RMSE = 0.13729
Fold 4: RMSE = 0.10237
Fold 5: RMSE = 0.11703
→ Linear CV RMSE = 0.11860
✅ Saved submission: C:\Users\hienm\OneDrive\Desktop\Machine_Learning_2025-2026\challange2\submission\submission_linear.csv



np.float64(0.11860028613208361)

## RandomForest

In [52]:
# Random forest settings: 500 trees, depth capped at 15, fixed seed.
rf_params = {
    "n_estimators": 500,
    "max_depth": 15,
    "random_state": 42,
    "n_jobs": -1,
}
rf_model = RandomForestRegressor(**rf_params)
# Cross-validate and write the submission; the returned CV RMSE is the cell value.
run_model(
    model=rf_model,
    model_name="RandomForest",
    X=X_numeric,
    y=y,
    X_test=X_test_numeric,
)



=== RANDOMFOREST ===
Fold 1: RMSE = 0.12898
Fold 2: RMSE = 0.13058
Fold 3: RMSE = 0.16206
Fold 4: RMSE = 0.09660
Fold 5: RMSE = 0.13464
→ RandomForest CV RMSE = 0.13222
✅ Saved submission: C:\Users\hienm\OneDrive\Desktop\Machine_Learning_2025-2026\challange2\submission\submission_randomforest.csv



np.float64(0.13221600361872424)

## XGBoost

In [53]:
# XGBoost settings: 500 rounds at learning rate 0.05, depth-4 trees,
# 80% row and column subsampling, fixed seed, histogram tree method.
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
    "tree_method": "hist",
}
xgb_model = XGBRegressor(**xgb_params)
# Feed the numeric-only frames built earlier so no object-dtype column reaches XGBoost.
run_model(
    model=xgb_model,
    model_name="XGBoost",
    X=X_numeric,
    y=y,
    X_test=X_test_numeric,
)



=== XGBOOST ===
Fold 1: RMSE = 0.11325
Fold 2: RMSE = 0.13012
Fold 3: RMSE = 0.14234
Fold 4: RMSE = 0.09844
Fold 5: RMSE = 0.11562
→ XGBoost CV RMSE = 0.12089
✅ Saved submission: C:\Users\hienm\OneDrive\Desktop\Machine_Learning_2025-2026\challange2\submission\submission_xgboost.csv



np.float64(0.12088606585097937)

## Gradient Boosting

In [54]:
# Gradient boosting settings: 300 stages at learning rate 0.05, depth-4 trees, fixed seed.
gb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 4,
    "random_state": 42,
}
gb_model = GradientBoostingRegressor(**gb_params)
# Cross-validate and write the submission; the returned CV RMSE is the cell value.
run_model(
    model=gb_model,
    model_name="GradientBoost",
    X=X_numeric,
    y=y,
    X_test=X_test_numeric,
)



=== GRADIENTBOOST ===
Fold 1: RMSE = 0.12170
Fold 2: RMSE = 0.12541
Fold 3: RMSE = 0.14715
Fold 4: RMSE = 0.09964
Fold 5: RMSE = 0.11896
→ GradientBoost CV RMSE = 0.12351
✅ Saved submission: C:\Users\hienm\OneDrive\Desktop\Machine_Learning_2025-2026\challange2\submission\submission_gradientboost.csv



np.float64(0.12350681305305346)

## Ridge Regression

In [55]:
# Ridge regression with penalty strength alpha = 10.
ridge_alpha = 10
ridge_model = Ridge(alpha=ridge_alpha)
# Cross-validate and write the submission; the returned CV RMSE is the cell value.
run_model(
    model=ridge_model,
    model_name="Ridge",
    X=X_numeric,
    y=y,
    X_test=X_test_numeric,
)



=== RIDGE ===
Fold 1: RMSE = 0.10814
Fold 2: RMSE = 0.11594
Fold 3: RMSE = 0.13720
Fold 4: RMSE = 0.09571
Fold 5: RMSE = 0.11090
→ Ridge CV RMSE = 0.11438
✅ Saved submission: C:\Users\hienm\OneDrive\Desktop\Machine_Learning_2025-2026\challange2\submission\submission_ridge.csv



np.float64(0.11437949351840948)

## Lasso Regression

In [56]:
# Lasso with a small penalty (alpha = 0.0005) and up to 10000 solver iterations.
# NOTE(review): the recorded output of this cell shows warnings raised from
# cd_fast.enet_coordinate_descent -- presumably ConvergenceWarning; confirm,
# and if so consider scaling the features or raising max_iter.
lasso_params = {"alpha": 0.0005, "max_iter": 10000}
lasso_model = Lasso(**lasso_params)
run_model(
    model=lasso_model,
    model_name="Lasso",
    X=X_numeric,
    y=y,
    X_test=X_test_numeric,
)



=== LASSO ===


  model = cd_fast.enet_coordinate_descent(


Fold 1: RMSE = 0.10859


  model = cd_fast.enet_coordinate_descent(


Fold 2: RMSE = 0.11732


  model = cd_fast.enet_coordinate_descent(


Fold 3: RMSE = 0.13739
Fold 4: RMSE = 0.09662
Fold 5: RMSE = 0.11148
→ Lasso CV RMSE = 0.11505
✅ Saved submission: C:\Users\hienm\OneDrive\Desktop\Machine_Learning_2025-2026\challange2\submission\submission_lasso.csv



  model = cd_fast.enet_coordinate_descent(


np.float64(0.11505338644867297)

## ElasticNet Regression

In [57]:
# Elastic net: alpha = 0.0005, l1_ratio = 0.7, up to 10000 solver iterations.
# NOTE(review): the recorded output of this cell shows warnings raised from
# cd_fast.enet_coordinate_descent -- presumably ConvergenceWarning; confirm,
# and if so consider scaling the features or raising max_iter.
elastic_params = {"alpha": 0.0005, "l1_ratio": 0.7, "max_iter": 10000}
elastic_model = ElasticNet(**elastic_params)
run_model(
    model=elastic_model,
    model_name="ElasticNet",
    X=X_numeric,
    y=y,
    X_test=X_test_numeric,
)



=== ELASTICNET ===


  model = cd_fast.enet_coordinate_descent(


Fold 1: RMSE = 0.10829


  model = cd_fast.enet_coordinate_descent(


Fold 2: RMSE = 0.11608


  model = cd_fast.enet_coordinate_descent(


Fold 3: RMSE = 0.13690
Fold 4: RMSE = 0.09667
Fold 5: RMSE = 0.11095
→ ElasticNet CV RMSE = 0.11453
✅ Saved submission: C:\Users\hienm\OneDrive\Desktop\Machine_Learning_2025-2026\challange2\submission\submission_elasticnet.csv



  model = cd_fast.enet_coordinate_descent(


np.float64(0.11453356786134798)