In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import root_mean_squared_log_error
from sklearn.base import BaseEstimator, TransformerMixin

from xgboost import XGBRegressor
import xgboost as xgb

In [23]:
data = pd.read_csv("data/playground-series-s5e5/train.csv", index_col = "id")
data.columns = data.columns.str.lower()

In [24]:
data["bmi"] = data.weight / ((data.height / 100) ** 2)

In [25]:
data["age_group"] = pd.cut(
    data["age"],
    bins=[20, 30, 40, 50, 60, 80],
    labels=["20-29", "30-39", "40-49", "50-59", "60-79"],
    right=False  # Optional: whether intervals are closed on the right
)


In [26]:
X = data.drop(columns = ["calories"])
y = data.calories

In [27]:
import mlflow

# Set experiment name (this will create it if it doesn't exist)
mlflow.set_experiment("Burned Calories Prediction")

<Experiment: artifact_location='file:///home/mshopov/Jupyter%20Lab%20Notebooks/Kaggle%20Competitions/Predict%20Calorie%20Expenditure/mlruns/287292542223627452', creation_time=1747153771249, experiment_id='287292542223627452', last_update_time=1747153771249, lifecycle_stage='active', name='Burned Calories Prediction', tags={}>

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 10_000, random_state = 17)

In [33]:
#categorical_features = ["sex"]
#numeric_features = ["age", "height", "weight", "duration", "heart_rate", "body_temp"]

categorical_features = ["sex", "age_group"]
numeric_features = ["age", "height", "weight", "duration", "heart_rate", "body_temp", "bmi"]

# Preprocessing
preprocessor = ColumnTransformer([
    ("log", FunctionTransformer(np.log1p), numeric_features),
    ("cat", OneHotEncoder(drop="first"), categorical_features)
])

poly_featurer = PolynomialFeatures(degree = 2)

scaler = MinMaxScaler()

xgb_model = XGBRegressor(
    objective = "reg:squaredlogerror",
    #n_estimators = 150,
    #min_child_weight = 0.01,
    #learning_rate = 0.1,
    #reg_alpha = 0,
    #max_depth = 11,
    #reg_lambda = 0
)    

# Define pipeline
model_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("poly_features", poly_featurer),
    ("scaler", scaler),
    ("xgb", xgb_model)
])

In [32]:
# --- FOR SINGLE MODELS ---

with mlflow.start_run():
    # Fit the model
    model_pipeline.fit(X_train, y_train)

    # Predictions
    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)

    # Custom RMSLE metric
    #def root_mean_squared_log_error(y_true, y_pred):
    #    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

    train_rmsle = root_mean_squared_log_error(y_train, y_train_pred)
    test_rmsle = root_mean_squared_log_error(y_test, y_test_pred)

    # Log parameters
    xgb_params = xgb_model.get_params()
    for param, value in xgb_params.items():
        mlflow.log_param(param, value)
    
    mlflow.log_param("model_type", "XGBRegressor")
    mlflow.log_param("scaling", "MinMaxScaler")
    mlflow.log_param("log1p_transform", "numeric")
    mlflow.log_param("poly_feat", "degree_2")

    mlflow.log_param("new_columns", "age_group_and_bmi")
    
    # Log metrics
    mlflow.log_metric("train_rmsle", train_rmsle)
    mlflow.log_metric("test_rmsle", test_rmsle)

    # Log model
    mlflow.sklearn.log_model(model_pipeline, "model_pipeline")

    print(f"Train RMSLE: {train_rmsle:.8f}")
    print(f"Test RMSLE: {test_rmsle:.8f}")



Train RMSLE: 0.05101862
Test RMSLE: 0.06014101


In [36]:
# --- FOR RANDOMIZED SEARCH ---

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

# Create the custom scorer
rmsle_scorer = make_scorer(root_mean_squared_log_error, greater_is_better=False)

"""
# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'xgb__max_depth': [9, 10, 11, 12, 13],  # Try various tree depths
    'xgb__learning_rate': [0.09, 0.095, 0.1, 0.105, 0.11],  # Various learning rates
    'xgb__n_estimators': [140, 145, 150, 155, 160],  # Number of boosting rounds
        # NEW: controls overfitting and minimum data to split
    'xgb__min_child_weight': [0.009, 0.0095, 0.010, 0.0105, 0.011],    
    # NEW: reduce regularization to allow overfitting
    'xgb__reg_alpha': [0, 0.01, 0.1],
    'xgb__reg_lambda': [0, 0.01, 0.1],    
}

"""

# RSCV after feature engineering
param_dist = {
    "xgb__max_depth": [4, 6, 8, 10, 12, 14, 20],  # Add lower depths to avoid overfitting if new features help
    "xgb__learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],  # Allow exploration of slower/faster learning
    "xgb__n_estimators": [100, 150, 200, 250, 300, 350],  # Broader range for boosting rounds
    "xgb__min_child_weight": [0.01, 0.1, 1, 3, 5, 7],  # Encourage robustness with higher values
    "xgb__reg_alpha": [0, 0.01, 0.05, 0.1, 1, 10],  # Add stronger regularization options
    "xgb__reg_lambda": [0, 0.01, 0.1, 1, 10, 15],  # Same here
}


# Create RandomizedSearchCV with cross-validation
random_search = RandomizedSearchCV(
    model_pipeline,
    param_distributions=param_dist,
    n_iter=250,  # Number of parameter combinations to try
    scoring=rmsle_scorer,  # Use RMSE to evaluate performance
    cv=3,  # 3-fold cross-validation
    verbose=2,  # Display progress
    n_jobs=10,  # Use all available CPUs for parallel processing
    random_state=17  # Set random state for reproducibility
)

# Fit the model with RandomizedSearchCV
with mlflow.start_run():
    random_search.fit(X_train, y_train)
    
    # Get the best model
    best_model = random_search.best_estimator_

    # Predictions
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    # Custom RMSLE metric
    train_rmsle = root_mean_squared_log_error(y_train, y_train_pred)
    test_rmsle = root_mean_squared_log_error(y_test, y_test_pred)

    # Log parameters
    best_params = random_search.best_params_
    for param, value in best_params.items():
        mlflow.log_param(param, value)
    
    mlflow.log_param("model_type", "XGBRegressor")
    mlflow.log_param("scaling", "MinMaxScaler")
    mlflow.log_param("log_transform", "numeric")
    mlflow.log_param("poly_feat", "degree_2")

    mlflow.log_param("cv", "3")

    mlflow.log_param("new_columns", "age_group_and_bmi")
    # Log metrics
    mlflow.log_metric("train_rmsle", train_rmsle)
    mlflow.log_metric("test_rmsle", test_rmsle)

    # Log model
    mlflow.sklearn.log_model(best_model, "model_pipeline")

    print(f"Best parameters: {best_params}")
    print(f"Train RMSLE: {train_rmsle:.6f}")
    print(f"Test RMSLE: {test_rmsle:.6f}")

Fitting 3 folds for each of 250 candidates, totalling 750 fits


Traceback (most recent call last):
  File "/home/mshopov/anaconda3/envs/deep-learning-env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mshopov/anaconda3/envs/deep-learning-env/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mshopov/anaconda3/envs/deep-learning-env/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mshopov/anaconda3/envs/deep-learning-env/lib/python3.12/site-packages/sklearn/utils/_param_v

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

---

## Submissions

In [62]:
# --- FOR SINGLE MODELS ---

root_mean_squared_log_error(y_train, model_pipeline.predict(X_train))

0.05138745007474449

In [63]:
root_mean_squared_log_error(y_test, model_pipeline.predict(X_test))

0.05981686736686766

In [64]:
test_error = root_mean_squared_log_error(y_test, model_pipeline.predict(X_test))

In [23]:
# --- FOR RANDOMIZED SEARCH ---

root_mean_squared_log_error(y_train, best_model.predict(X_train))

0.0531474164475391

In [21]:
root_mean_squared_log_error(y_test, best_model.predict(X_test))

0.06005181835134728

In [None]:
test_error = root_mean_squared_log_error(y_test, model_pipeline.predict(X_test))

---

In [24]:
eval_data = pd.read_csv("data/playground-series-s5e5/test.csv", index_col = "id")

In [27]:
eval_data.columns = eval_data.columns.str.lower()

In [65]:
# --- FOR SINGLE MODELS ---

submission = pd.DataFrame({
    "id": eval_data.index,
    "Calories": model_pipeline.predict(eval_data)
})
submission

Unnamed: 0,id,Calories
0,750000,28.113632
1,750001,107.521530
2,750002,87.097160
3,750003,126.426750
4,750004,75.703194
...,...,...
249995,999995,25.969122
249996,999996,9.564373
249997,999997,72.678871
249998,999998,169.230057


In [34]:
# --- FOR RANDOMIZED SEARCH ---

submission = pd.DataFrame({
    "id": eval_data.index,
    "Calories": best_model.predict(eval_data)
})
submission

Unnamed: 0,id,Calories
0,750000,27.338148
1,750001,108.317604
2,750002,86.876556
3,750003,126.816238
4,750004,76.310501
...,...,...
249995,999995,26.059559
249996,999996,9.439581
249997,999997,72.856796
249998,999998,168.550751


In [69]:
import os

# Find the lowest available index X
existing_files = os.listdir("submissions")
used_indices = set()

for filename in existing_files:
    if filename.startswith("submission_") and filename.endswith(".csv"):
        try:
            index = int(filename[len("submission_"):-len(".csv")])
            used_indices.add(index)
        except ValueError:
            pass  # skip files with non-integer suffixes

# Find the lowest unused index
X = 0
while X in used_indices:
    X += 1

# Save the DataFrame
file_path = f"submissions/submission_{X}___{str(test_error).replace(".", "_")}.csv"
submission.to_csv(file_path, index=False)

print(f"Saved to {file_path}")

Saved to submissions/submission_1___0_05981686736686766.csv
