In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the training set and discard the "id" column in one chained step.
data = pd.read_csv("data/train.csv").drop(columns=["id"])

In [3]:
# One-hot encode "Gender". drop_first=True drops the first category level so
# only the remaining indicator column(s) are kept.
# dtype is pinned to int64 so the result does not depend on the installed
# pandas version (get_dummies defaults to bool since pandas 2.0, uint8 before)
# and matches the int64 indicator columns created for "MTRANS" in this notebook.
data = pd.get_dummies(data, columns=["Gender"], drop_first=True, dtype="int64")

In [4]:
# Standardize the numeric columns. The fitted scaler stays available as
# `std_scaler` because it is pickled at the end of the notebook.
numeric_columns = ["Age", "Height", "Weight"]
std_scaler = StandardScaler()
data[numeric_columns] = std_scaler.fit_transform(data[numeric_columns])

In [5]:
# Convert the yes/no columns to 1/0.
# Series.map leaves NaN for any value that is not a key of the mapping.
yes_no_mapping = {"yes": 1, "no": 0}
binary_features = ["family_history_with_overweight", "FAVC", "SMOKE", "SCC"]
data[binary_features] = data[binary_features].apply(
    lambda column: column.map(yes_no_mapping)
)

In [6]:
# Label (ordinal) encoding for the ordered categorical columns and the target.
CAEC_mapping = {"no": 0, "Sometimes": 1, "Frequently": 2, "Always": 3}
# "Always" is included for CALC too, mirroring CAEC. This mapping is pickled
# for reuse on other data, and Series.map silently yields NaN for any value
# that is missing from the mapping.
# NOTE(review): the source obesity dataset is understood to define an "Always"
# level for CALC even if it is rare/absent in train.csv - confirm.
CALC_mapping = {"no": 0, "Sometimes": 1, "Frequently": 2, "Always": 3}
# Target classes numbered 0-6 in the order listed here.
NObeyesdad_mapping = {
    "Insufficient_Weight": 0, "Normal_Weight": 1, "Overweight_Level_I": 2,
    "Overweight_Level_II": 3, "Obesity_Type_I": 4, "Obesity_Type_II": 5, "Obesity_Type_III": 6
}
data["CAEC"] = data["CAEC"].map(CAEC_mapping)
data["CALC"] = data["CALC"].map(CALC_mapping)
data["NObeyesdad"] = data["NObeyesdad"].map(NObeyesdad_mapping)

In [7]:
# One-Hot Encoding for "MTRANS"
def one_hot_encode(data, field_name):
    """Add one 0/1 indicator column per distinct value of ``data[field_name]``.

    ``data`` is modified in place: for every distinct non-null value of the
    column, a new int64 column named ``"<field_name>_<value>"`` is added.
    The source column itself is left in the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``field_name``; receives the new columns.
    field_name : str
        Name of the categorical column to expand.

    Returns
    -------
    list of str
        Names of the columns that were added, in ``value_counts()`` order
        (most frequent value first).
    """
    new_field_name = []
    # value_counts() skips NaN and orders the categories by descending frequency.
    for value in data[field_name].value_counts().index:
        # str() lets non-string categories (e.g. integer codes) be used in the
        # column name instead of raising TypeError on concatenation.
        fn = field_name + "_" + str(value)
        data[fn] = (data[field_name] == value).astype("int64")
        new_field_name.append(fn)
    return new_field_name

# Expand "MTRANS" into indicator columns (their names are kept in
# `one_hot_fields`), then remove the original column.
one_hot_fields = one_hot_encode(data, "MTRANS")
data = data.drop(columns=["MTRANS"])

In [8]:
# Save the preprocessed training frame as `train.pkl`.
data.to_pickle("data/train.pkl")

In [9]:
# Persist the fitted StandardScaler, the one-hot column names and the encoding
# rules so the same preprocessing can be reproduced on other data.
preprocessing_artifacts = {
    "scaler": std_scaler,
    "one_hot_fields": one_hot_fields,
    "CAEC_mapping": CAEC_mapping,
    "CALC_mapping": CALC_mapping,
    # Additional keys (the four above are unchanged): the yes/no rule used for
    # the binary columns and the target label rule, so consumers do not have
    # to hard-code them again (e.g. to turn predicted codes back into labels).
    "yes_no_mapping": yes_no_mapping,
    "NObeyesdad_mapping": NObeyesdad_mapping,
}
with open("data/preprocessing.pkl", "wb") as f:
    pickle.dump(preprocessing_artifacts, f)