In [101]:
import pandas as pd
import pickle
import numpy as np
# Load the raw test set from CSV.
test_data = pd.read_csv("data/test.csv")

In [102]:
# Keep the original "id" values so they can be written into the submission later.
# When the file has no "id" column, fall back to positional row numbers.
if "id" in test_data.columns:
    test_ids = test_data["id"]
else:
    test_ids = range(len(test_data))

In [103]:
# Remove the "id" column so it is not passed to the model as a feature.
# The drop is non-mutating and tolerates a missing column: the cell that saves
# test_ids already allows for test.csv having no "id", and an in-place drop
# raises KeyError both in that case and whenever this cell is executed twice.
test_data = test_data.drop(columns=["id"], errors="ignore")

In [104]:
# Restore the preprocessing artefacts stored in "preprocessing.pkl".
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("data/preprocessing.pkl", "rb") as preprocessing_file:
    preprocessor = pickle.load(preprocessing_file)

In [105]:
# Pull the individual artefacts out of the loaded preprocessing dict:
#   "scaler"         -> scaler applied to the numeric columns (a StandardScaler per the original note)
#   "one_hot_fields" -> one-hot column names recorded from train.csv
#   "CAEC_mapping"   -> value mapping for the "CAEC" column
#   "CALC_mapping"   -> value mapping for the "CALC" column
std_scaler, one_hot_fields, CAEC_mapping, CALC_mapping = (
    preprocessor[key]
    for key in ("scaler", "one_hot_fields", "CAEC_mapping", "CALC_mapping")
)

In [106]:
# Encode "Gender" as the single indicator column "Gender_Male".
# pd.get_dummies(..., drop_first=True) derives its columns from the values that
# happen to be present in test.csv: if only one gender occurs, the only dummy
# column is dropped and "Gender_Male" never gets created. Building the column
# explicitly keeps the feature set fixed regardless of the test data.
# The new column is appended at the end, the same position get_dummies used.
test_data = test_data.assign(Gender_Male=test_data["Gender"].eq("Male")).drop(columns=["Gender"])

In [107]:
# Standardise the numeric columns with the scaler loaded from preprocessing.pkl
# (transform only; the scaler is not re-fitted on the test data).
numeric_columns = ["Age", "Height", "Weight"]
scaled_values = std_scaler.transform(test_data[numeric_columns])
test_data[numeric_columns] = scaled_values

In [108]:
# Convert the yes/no columns to integer 1/0 flags.
yes_no_mapping = {"yes": 1, "no": 0}
binary_features = ["family_history_with_overweight", "FAVC", "SMOKE", "SCC"]
for feature in binary_features:
    # Any value other than "yes"/"no" maps to NaN, which makes astype(int) raise.
    encoded = test_data[feature].map(yes_no_mapping)
    test_data[feature] = encoded.astype(int)

In [109]:
# Label-encode "CAEC" and "CALC" with the mappings loaded from preprocessing.pkl.
# Values absent from a mapping become NaN (see the null check in the next cells).
for field, field_mapping in (("CAEC", CAEC_mapping), ("CALC", CALC_mapping)):
    test_data[field] = test_data[field].map(field_mapping)

In [110]:
# One-hot encoding for "MTRANS"
def one_hot_encode(data, field_name, known_values):
    """Add one 0/1 indicator column per known category of ``field_name``.

    The set of output columns is driven by ``known_values`` rather than by the
    values present in ``data``, so the result always has the same one-hot
    columns no matter which categories occur in this particular frame.

    Args:
        data: DataFrame containing the column ``field_name``; modified in place.
        field_name: Name of the categorical column to encode.
        known_values: Iterable of category values; each one yields a column
            named ``f"{field_name}_{value}"``.

    Returns:
        The same ``data`` object, with the indicator columns added. The source
        column is left in place.
    """
    for category in known_values:
        is_category = data[field_name].eq(category)
        data[f"{field_name}_{category}"] = is_category.astype(int)
    return data

# Derive the MTRANS category names from the one-hot column names recorded for
# train.csv by stripping the "MTRANS_" text from each name.
MTRANS_categories = [name.replace("MTRANS_", "") for name in one_hot_fields]

# Add the indicator columns, then discard the original "MTRANS" column.
test_data = one_hot_encode(test_data, "MTRANS", MTRANS_categories).drop(columns=["MTRANS"])

In [111]:
# Count missing values per column; any non-zero count after the encoding steps
# above points at a value that was not covered by one of the mappings.
test_data.isnull().sum()

Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
Gender_Male                       0
MTRANS_Public_Transportation      0
MTRANS_Automobile                 0
MTRANS_Walking                    0
MTRANS_Motorbike                  0
MTRANS_Bike                       0
dtype: int64

In [32]:
# Cast every feature column to float32.
# Original author's note: XGBoost only accepts float32 or int (bool also works).
test_data = test_data.astype("float32")

In [33]:
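# NOTE: the model path and file name below must match the path and name used when the model was saved in the previous (training) stage.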

In [112]:
# Load the pickled XGBoost model.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
model_path = "./model/250302_model.pkl"
with open(model_path, "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [113]:
# Run the loaded model on the preprocessed test features.
# NOTE(review): the column names and order of test_data presumably have to match
# what the model was trained on — confirm against the training notebook.
predictions = loaded_model.predict(test_data)

In [114]:
# Inspect the raw predictions: a numpy array of integer class ids.
predictions

array([5, 2, 6, ..., 0, 1, 5], dtype=int64)

In [115]:
# Convert the integer class ids back to the original NObeyesdad label names.
# NOTE(review): this id -> label order must match the target encoding used when
# the model was trained — confirm against the training notebook.
NObeyesdad_mapping = {
    0: "Insufficient_Weight",
    1: "Normal_Weight",
    2: "Overweight_Level_I",
    3: "Overweight_Level_II",
    4: "Obesity_Type_I",
    5: "Obesity_Type_II",
    6: "Obesity_Type_III",
}
# Series.map returns NaN for every value missing from the mapping. Without a
# check, an unexpected class id (or re-running this cell, when `predictions`
# already holds label strings) would silently write empty labels into
# submission.csv, so fail loudly instead.
raw_predictions = pd.Series(predictions)
predicted_labels = raw_predictions.map(NObeyesdad_mapping)
unmapped_mask = predicted_labels.isna()
if unmapped_mask.any():
    unknown_values = raw_predictions[unmapped_mask].unique().tolist()[:10]
    raise ValueError(
        "Cannot map predictions to NObeyesdad labels; unexpected values "
        f"(first 10): {unknown_values}. If this cell was already run, "
        "re-run the prediction cell first."
    )
predictions = predicted_labels.tolist()

In [116]:
# Assemble the final submission (one row per test id with its predicted label)
# and write it to submission.csv without the DataFrame index.
submission_columns = {"id": test_ids, "NObeyesdad": predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
print("結果儲存至 submission.csv")

結果儲存至 submission.csv
