In [12]:
# 📦 1. Import Libraries
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
import os
import tempfile
import shutil

In [13]:
# 📥 2. Load Datasets
# The project root is taken to be the parent of the kernel's working
# directory; both CSV files are read from <root>/data/.
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, ".."))
TRAIN_PATH, TEST_PATH = (
    os.path.join(PROJECT_ROOT, "data", csv_name)
    for csv_name in ("Train.csv", "Test.csv")
)

train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Keep the test identifiers aside; they are needed when writing the submission.
test_ids = test_df["ID"]

In [14]:
# 🎯 3. Clip + Log Target
# Cap the target at 25,000,000 (overwriting the raw column) and store a
# log1p-transformed copy of the capped values alongside it.
capped_cost = train_df["total_cost"].clip(upper=25_000_000)
train_df["total_cost"] = capped_cost
train_df["log_total_cost"] = np.log1p(capped_cost)

In [15]:
# 🌍 4. Region Mapping
# Countries grouped by the coarse region they are bucketed into.
_countries_by_region = {
    'Africa': ('KENYA', 'TANZANIA', 'UGANDA', 'ETHIOPIA'),
    'North America': ('UNITED STATES OF AMERICA', 'CANADA'),
    'Europe': ('GERMANY', 'FRANCE', 'UNITED KINGDOM', 'ITALY'),
    'Asia': ('CHINA', 'JAPAN', 'INDIA'),
}

# Flattened country -> region lookup.
region_map = {
    country: region
    for region, countries in _countries_by_region.items()
    for country in countries
}

# Region label for any country that has no entry in region_map.
default_region = 'Other'

In [16]:
# 🧠 5. Feature Engineering
# Ordinal encoding of the age-group labels; labels not listed here map to 0.
age_map = {'0-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55-64': 5, '65+': 6}


def engineer_features(df):
    """Add derived trip features to ``df`` and impute missing values.

    The frame is modified in place and the same object is returned.

    Reads the module-level ``region_map``, ``default_region`` and ``age_map``.

    Expected input columns: ``country``, ``total_female``, ``total_male``,
    ``night_mainland``, ``night_zanzibar``, ``travel_with``, ``age_group``.

    Columns added: ``region``, ``total_people``, ``total_nights``,
    ``has_spouse``, ``has_children``, ``people_x_nights``, ``solo_traveler``,
    ``age_group_num``, ``people_x_mainland``, ``people_x_zanzibar``,
    ``nights_per_person``.

    Imputation:
      * ``total_female`` / ``total_male``: missing values become 0.
      * every object-dtype column: missing values become that column's most
        frequent value, computed on the frame passed in. A column with no
        non-missing values is left untouched instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the input columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns and imputed values.
    """
    # Countries absent from region_map fall into the default bucket.
    df["region"] = df["country"].map(region_map).fillna(default_region)

    # A missing head-count is treated as zero people of that sex.
    for col in ("total_female", "total_male"):
        df[col] = df[col].fillna(0)
    df["total_people"] = df["total_female"] + df["total_male"]
    df["total_nights"] = df["night_mainland"] + df["night_zanzibar"]

    # Exact-match flags, computed before imputation so that a missing
    # travel companion yields 0.
    # NOTE(review): only the exact labels "Spouse" / "Children" are flagged; a
    # combined label (e.g. "Spouse and Children"), if the data contains one,
    # would set neither flag - confirm against the actual category values.
    df["has_spouse"] = (df["travel_with"] == "Spouse").astype("int64")
    df["has_children"] = (df["travel_with"] == "Children").astype("int64")

    df["people_x_nights"] = df["total_people"] * df["total_nights"]
    df["solo_traveler"] = (df["total_people"] == 1).astype("int64")
    df["age_group_num"] = df["age_group"].map(age_map).fillna(0).astype(int)
    df["people_x_mainland"] = df["total_people"] * df["night_mainland"]
    df["people_x_zanzibar"] = df["total_people"] * df["night_zanzibar"]
    # The small epsilon avoids a division by zero when total_people is 0.
    df["nights_per_person"] = df["total_nights"] / (df["total_people"] + 1e-6)

    # Mode-impute object columns. mode() is empty for an all-missing column,
    # so guard the lookup rather than indexing blindly.
    for col in df.select_dtypes(include="object").columns:
        modes = df[col].mode()
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])
    return df

# Run the same feature pipeline over the training frame, then the test frame.
train_df, test_df = (engineer_features(frame) for frame in (train_df, test_df))


In [17]:
# 🏷️ 6. Feature Selection
# Region bucket plus the traveller / trip descriptor columns.
descriptor_columns = [
    'region', 'age_group', 'travel_with', 'purpose', 'main_activity',
    'tour_arrangement', 'package_transport_int', 'package_accomodation',
    'package_food', 'package_transport_tz', 'package_sightseeing',
    'package_guided_tour', 'package_insurance', 'payment_mode',
]
# Head-count and night-count columns.
count_columns = ['total_female', 'total_male', 'night_mainland', 'night_zanzibar']
# Columns created by engineer_features.
derived_columns = [
    'total_people', 'total_nights', 'has_spouse', 'has_children',
    'people_x_nights', 'solo_traveler', 'age_group_num',
    'nights_per_person', 'people_x_mainland', 'people_x_zanzibar',
]
# Final model input, in this exact column order.
features = descriptor_columns + count_columns + derived_columns

# Design matrices for train / test and the log-scale target.
X, X_test = train_df[features], test_df[features]
y = train_df["log_total_cost"]

# Whatever is object- or category-typed in X is treated as categorical.
cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)

In [18]:
# 🧪 7. Stratified K-Fold Training
# Stratify the folds on quintile bins of total_cost so that every fold sees a
# similar spread of the target.
train_df["cost_bin"] = pd.qcut(train_df["total_cost"], 5, labels=False)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

val_maes = []    # per-fold validation MAE, on the original (non-log) scale
fold_preds = []  # per-fold test predictions, on the original (non-log) scale

# The test features are identical for every fold, so build their Pool once
# instead of once per iteration.
test_pool = Pool(X_test, cat_features=cat_cols)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, train_df["cost_bin"])):
    print(f"\n🔁 Fold {fold+1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_val, y_val, cat_features=cat_cols)

    # CatBoost writes its training artefacts into train_dir; use a throwaway
    # directory and make sure it is removed even when training fails.
    tmp_dir = tempfile.mkdtemp()
    try:
        model = CatBoostRegressor(
            iterations=1700,
            learning_rate=0.024,
            depth=9,
            l2_leaf_reg=7,
            subsample=0.83,
            random_seed=42 + fold,  # a different seed for each fold's model
            loss_function='Quantile:alpha=0.6',
            early_stopping_rounds=100,
            verbose=100,
            train_dir=tmp_dir
        )
        # NOTE(review): the validation fold drives early stopping and is also
        # the fold the MAE below is reported on, so that MAE may be optimistic.
        model.fit(train_pool, eval_set=val_pool)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # The model is fit on log1p(cost); expm1 maps predictions and targets
    # back to the original scale before scoring.
    val_preds = np.expm1(model.predict(val_pool))
    fold_preds.append(np.expm1(model.predict(test_pool)))
    y_val_exp = np.expm1(y_val)
    mae = mean_absolute_error(y_val_exp, val_preds)
    print(f"✅ Fold {fold+1} MAE: {mae:.2f}")
    val_maes.append(mae)


🔁 Fold 1
0:	learn: 0.6186626	test: 0.6191914	best: 0.6191914 (0)	total: 77.3ms	remaining: 2m 11s
100:	learn: 0.3389830	test: 0.3705731	best: 0.3705731 (100)	total: 7.58s	remaining: 2m
200:	learn: 0.3033332	test: 0.3607739	best: 0.3607739 (200)	total: 15.2s	remaining: 1m 53s
300:	learn: 0.2890309	test: 0.3599224	best: 0.3596126 (277)	total: 22.5s	remaining: 1m 44s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3596125674
bestIteration = 277

Shrink model to first 278 iterations.
✅ Fold 1 MAE: 3374604.04

🔁 Fold 2
0:	learn: 0.6191642	test: 0.6149150	best: 0.6149150 (0)	total: 53.2ms	remaining: 1m 30s
100:	learn: 0.3320227	test: 0.3795088	best: 0.3795088 (100)	total: 8.84s	remaining: 2m 20s
200:	learn: 0.2962688	test: 0.3740415	best: 0.3740415 (200)	total: 16.7s	remaining: 2m 4s
300:	learn: 0.2798927	test: 0.3734571	best: 0.3734571 (300)	total: 24.5s	remaining: 1m 53s
400:	learn: 0.2666621	test: 0.3740414	best: 0.3733348 (325)	total: 32.1s	remaining: 1m 44s
Stopped 

In [19]:
# 🧠 8. Ensemble
# Mean of the per-fold validation MAEs.
avg_mae = np.asarray(val_maes).mean()
print(f"\n✅ Average Validation MAE (5-Fold): {avg_mae:.2f}")

# Average the fold-wise test predictions element-wise, then bound the result
# to the [50000, 25000000] range.
mean_fold_preds = np.asarray(fold_preds).mean(axis=0)
final_preds = mean_fold_preds.clip(50000, 25000000)


✅ Average Validation MAE (5-Fold): 3397143.52


In [20]:
# 💾 9. Save Submission
# Assemble the two-column submission frame: test IDs next to their predictions.
submission_columns = {"ID": test_ids, "total_cost": final_preds}
submission = pd.DataFrame(submission_columns)

# Write it to ../submission/submission.csv, creating the folder if needed.
output_path = os.path.join("..", "submission")
os.makedirs(output_path, exist_ok=True)
submission_file = os.path.join(output_path, "submission.csv")
submission.to_csv(submission_file, index=False)
print("✅ submission.csv saved to ../submission/")

✅ submission.csv saved to ../submission/
