## 1 Preliminaries

## 1.1 Imports and Configurations

### Imports

In [1]:
import random
import numpy as np
import pandas as pd

from pathlib import Path
from typing import List, Dict, Optional, Union
from pandas.api.types import CategoricalDtype

from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

### Constants

In [2]:
# Shared constants: random seed, dataset locations and feature encodings.

RANDOM_STATE = 42

# Input / output CSV locations (relative paths).
FINAL_TRAIN_PATH = Path("../data/final/train.csv")
FINAL_TEST_PATH = Path("../data/final/test.csv")
FINAL_SUBMISSION_PATH = Path("../data/final/submission.csv")

# Columns cast to an unordered pandas "category" dtype.
FEATURES_CAT = ["Sex", "Is_Fever", "Is_Short_Session"]

# Columns cast to an ordered categorical; levels are listed from lowest to highest.
FEATURE_ORD_LEVELS = dict(
    Age_Group=["until25", "25-40", "40-60", "60-100"],
    Heart_Rate_Group=["low", "normal", "high"],
    Duration_Group=["short", "medium", "long", "very_long"],
)

### Configs

In [3]:
# Seed Python's and NumPy's global random generators with the shared seed
# so that code drawing from them repeats across runs.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

## 1.2 Data Preprocessing

In [4]:
def read_data(path: Union[str, Path]) -> pd.DataFrame:
    """Load a CSV file into a DataFrame, using its first column as the index.

    Parameters
    ----------
    path : str or Path
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed file contents.
    """
    return pd.read_csv(path, index_col=0)

In [5]:
def encode(
    df: pd.DataFrame,
    features_cat: Optional[List[str]] = None,
    features_date: Optional[List[str]] = None,
    feature_ord_levels: Optional[Dict[str, List[str]]] = None,
    format: str = "%d-%m-%Y",
) -> pd.DataFrame:
    """Cast selected columns to categorical, datetime and ordered-categorical dtypes.

    The conversions are applied to a copy, so the frame passed in by the
    caller is never modified. Requested columns that are not present in
    ``df`` are skipped silently.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns should be re-typed.
    features_cat : list of str, optional
        Columns to cast to the unordered ``"category"`` dtype.
    features_date : list of str, optional
        Columns to parse with ``pd.to_datetime``.
    feature_ord_levels : dict of str -> list of str, optional
        Maps a column name to its category levels, listed from lowest to
        highest; the column becomes an ordered categorical.
    format : str, default "%d-%m-%Y"
        Date pattern handed to ``pd.to_datetime``. The name shadows the
        builtin ``format`` but is kept so existing keyword calls still work.

    Returns
    -------
    pd.DataFrame
        A new frame with the requested columns re-typed.
    """
    # Work on a copy: assigning columns on `df` directly would silently
    # change the caller's frame as a side effect.
    encoded = df.copy()

    for feature in features_cat or []:
        if feature in encoded.columns:
            encoded[feature] = encoded[feature].astype("category")

    for feature in features_date or []:
        if feature in encoded.columns:
            # errors="coerce": values that do not match `format` become NaT
            # instead of raising.
            encoded[feature] = pd.to_datetime(
                encoded[feature], format=format, errors="coerce"
            )

    for feature, levels in (feature_ord_levels or {}).items():
        if feature in encoded.columns:
            ordered_dtype = CategoricalDtype(categories=levels, ordered=True)
            # Values are matched against `levels` as strings; anything not
            # listed (including missing values) ends up as NaN.
            encoded[feature] = encoded[feature].astype(str).astype(ordered_dtype)

    return encoded

In [6]:
def preprocess_data(
    path: Union[str, Path],
    features_cat: Optional[List[str]] = None,
    features_date: Optional[List[str]] = None,
    feature_ord_levels: Optional[Dict[str, List[str]]] = None,
    date_format: str = "%d/%m/%Y",
) -> pd.DataFrame:
    """Read a CSV file and apply the dtype encodings in one step.

    Parameters
    ----------
    path : str or Path
        Location of the CSV file, forwarded to ``read_data``.
    features_cat : list of str, optional
        Columns to cast to an unordered categorical.
    features_date : list of str, optional
        Columns to parse as dates.
    feature_ord_levels : dict of str -> list of str, optional
        Column name -> ordered category levels.
    date_format : str, default "%d/%m/%Y"
        Date pattern forwarded to ``encode`` as its ``format`` argument.
        The default is the value this function previously hard-coded.

    Returns
    -------
    pd.DataFrame
        The loaded frame with the requested columns re-typed.
    """
    loaded = read_data(path)

    return encode(
        loaded,
        features_cat=features_cat or [],
        features_date=features_date or [],
        feature_ord_levels=feature_ord_levels or {},
        format=date_format,
    )

In [7]:
# Load the training CSV and apply the categorical and ordinal encodings.
df_train = preprocess_data(
    path=FINAL_TRAIN_PATH,
    features_cat=FEATURES_CAT,
    feature_ord_levels=FEATURE_ORD_LEVELS,
)

In [8]:
# Check column dtypes and non-null counts after encoding.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 747102 entries, 0 to 749999
Data columns (total 26 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Age                   747102 non-null  int64   
 1   Height                747102 non-null  float64 
 2   Weight                747102 non-null  float64 
 3   Duration              747102 non-null  float64 
 4   Heart_Rate            747102 non-null  float64 
 5   Body_Temp             747102 non-null  float64 
 6   Calories              747102 non-null  float64 
 7   Age_Group             747102 non-null  category
 8   Heart_Rate_Group      747102 non-null  category
 9   Duration_Group        747102 non-null  category
 10  Body_Temp_Deviation   747102 non-null  float64 
 11  Is_Fever              747102 non-null  category
 12  Heart_Rate_Deviation  747102 non-null  float64 
 13  Log_Duration          747102 non-null  float64 
 14  Is_Short_Session      747102 non-null  ca

In [9]:
# Load the test CSV with the same encodings that were applied to the training data.
df_test = preprocess_data(
    path=FINAL_TEST_PATH,
    features_cat=FEATURES_CAT,
    feature_ord_levels=FEATURE_ORD_LEVELS,
)

In [10]:
# Check column dtypes and non-null counts of the encoded test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250000 entries, 750000 to 999999
Data columns (total 25 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Age                   250000 non-null  int64   
 1   Height                250000 non-null  float64 
 2   Weight                250000 non-null  float64 
 3   Duration              250000 non-null  float64 
 4   Heart_Rate            250000 non-null  float64 
 5   Body_Temp             250000 non-null  float64 
 6   Age_Group             250000 non-null  category
 7   Heart_Rate_Group      250000 non-null  category
 8   Duration_Group        250000 non-null  category
 9   Body_Temp_Deviation   250000 non-null  float64 
 10  Is_Fever              250000 non-null  category
 11  Heart_Rate_Deviation  250000 non-null  float64 
 12  Log_Duration          250000 non-null  float64 
 13  Is_Short_Session      250000 non-null  category
 14  Log_Weight            250000 non-nul

## 2 Model Training

## 2.1 Trees

### Split Data

In [11]:
# Separate the target from the predictors; the model is trained on log1p(Calories).
y_train = df_train["Calories"]
y_train_log = np.log1p(y_train)
X_train = df_train.drop(columns="Calories")

### Preprocess Pipeline

In [12]:
def encode_categories_as_int(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with every categorical column replaced by its integer codes.

    Columns of other dtypes are carried over unchanged and the input frame
    is not modified.
    """
    coded = X.copy()
    categorical_names = coded.select_dtypes(include="category").columns
    for name in categorical_names:
        # `.cat.codes` gives each value's position in the category list.
        coded[name] = coded[name].cat.codes
    return coded

In [13]:
def get_tree_pipeline(df: pd.DataFrame) -> Pipeline:
    """Build the preprocessing pipeline used for the tree model.

    Column groups are derived from the dtypes of ``df``:

    * ``category`` columns are converted to their integer codes via
      ``encode_categories_as_int``;
    * ``int64`` / ``float64`` columns are forwarded unchanged.

    The categorical group is listed before the numeric one in the
    ``ColumnTransformer``. Columns of any other dtype belong to neither group.
    NOTE(review): with ColumnTransformer's default ``remainder`` those columns
    are presumably dropped - confirm this is intended.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose dtypes determine the column groups.

    Returns
    -------
    Pipeline
        An unfitted pipeline with a single ``"preprocessor"`` step.
    """
    cat_cols: List[str] = df.select_dtypes(include="category").columns.tolist()
    num_cols: List[str] = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

    cat_pipeline = Pipeline([
        ("cat_codes", FunctionTransformer(encode_categories_as_int))
    ])

    preprocessor = ColumnTransformer([
        ("cat", cat_pipeline, cat_cols),
        # "passthrough" forwards the columns untouched. Unlike an identity
        # lambda wrapped in FunctionTransformer, it keeps the pipeline picklable.
        ("num", "passthrough", num_cols),
    ])

    return Pipeline([
        ("preprocessor", preprocessor)
    ])

In [14]:
# Column groups are derived from the dtypes of the training predictors.
tree_pipeline = get_tree_pipeline(X_train)

In [15]:
# Fit the preprocessing on the training predictors, then apply the fitted
# pipeline to the test frame.
X_train_encoded = tree_pipeline.fit_transform(X_train)
X_test_encoded = tree_pipeline.transform(df_test)

### Model Initialization

In [16]:
# XGBoost regressor with fixed hyperparameters, seeded with the shared RANDOM_STATE.
xgb_regressor = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

### Cross Validation

In [17]:
# 5-fold shuffled cross-validation with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# The target is log1p(Calories), so RMSE on it is the RMSLE on the original scale.
# The scorer returns negated RMSE values.
scores = cross_val_score(
    xgb_regressor,
    X_train_encoded,
    y_train_log,
    scoring="neg_root_mean_squared_error",
    cv=cv,
)

In [18]:
# Scores are negated RMSE values, so the mean's sign is flipped for reporting.
print(f"RMSLE: {-scores.mean():.5f} ± {scores.std():.5f}")

RMSLE: 0.06886 ± 0.00034


In [19]:
# Refit on the full training set for the final predictions.
# NOTE(review): scikit-learn-style `fit` conventionally returns the estimator itself,
# so `final_xgb_regressor` is presumably the same object as `xgb_regressor` - confirm.
final_xgb_regressor = xgb_regressor.fit(X_train_encoded, y_train_log)

### Submission

In [20]:
def generate_submission(
    model,
    df_encoded: pd.DataFrame,
    original_df: pd.DataFrame,
    filename: Union[str, Path] = "submission.csv",
) -> pd.DataFrame:
    """Predict on encoded features and write an ``id`` / ``Calories`` CSV.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``; its output is treated as
        ``log1p`` of the target.
    df_encoded
        Encoded feature matrix passed straight to ``model.predict``.
    original_df : pd.DataFrame
        Frame whose index supplies the ``id`` column. Its rows must be in
        the same order as ``df_encoded``.
    filename : str or Path, default "submission.csv"
        Destination CSV. Missing parent directories are created.

    Returns
    -------
    pd.DataFrame
        The submission table that was written to disk.
    """
    y_pred_log = model.predict(df_encoded)
    # Invert the log1p transform. A negative log-prediction maps to a negative
    # value, which is invalid for a non-negative target, so clip at zero.
    y_pred = np.maximum(np.expm1(y_pred_log), 0)

    submission_df = pd.DataFrame(
        {"id": original_df.index.values, "Calories": y_pred}
    )

    # Create the target directory first so a fresh checkout does not fail on write.
    output_path = Path(filename)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    submission_df.to_csv(output_path, index=False)
    print(f"✔ Submission saved to: {filename}")

    return submission_df

In [21]:
# Predict on the encoded test features and write the submission CSV;
# ids come from the index of df_test.
submission = generate_submission(
    model=final_xgb_regressor,
    df_encoded=X_test_encoded,
    original_df=df_test,
    filename=FINAL_SUBMISSION_PATH
)

✔ Submission saved to: ../data/final/submission.csv
