In [1]:
# 📦 1. Import Libraries
# ====================================================
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import os
import tempfile
import shutil

In [2]:
# 📥 2. Load Datasets
# ====================================================
# Paths to data
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, ".."))
TRAIN_PATH = os.path.join(PROJECT_ROOT, "data", "Train.csv")
TEST_PATH = os.path.join(PROJECT_ROOT, "data", "Test.csv")

# Load data
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

test_ids = test_df["ID"]

In [3]:
# 🎯 3. Target Transformation (clip + log1p)
# ====================================================
train_df["total_cost"] = train_df["total_cost"].clip(upper=25000000)
train_df["log_total_cost"] = np.log1p(train_df["total_cost"])

In [4]:
# 🌍 4. Region Mapping for Country Column
# ====================================================
region_map = {
    'KENYA': 'Africa', 'TANZANIA': 'Africa', 'UGANDA': 'Africa', 'ETHIOPIA': 'Africa',
    'UNITED STATES OF AMERICA': 'North America', 'CANADA': 'North America',
    'GERMANY': 'Europe', 'FRANCE': 'Europe', 'UNITED KINGDOM': 'Europe', 'ITALY': 'Europe',
    'CHINA': 'Asia', 'JAPAN': 'Asia', 'INDIA': 'Asia',
}
default_region = 'Other'

In [5]:
# 🧠 5. Feature Engineering Function
# ====================================================
def engineer_features(df):
    df["region"] = df["country"].map(region_map).fillna(default_region)
    df["total_people"] = df["total_female"].fillna(0) + df["total_male"].fillna(0)
    df["total_nights"] = df["night_mainland"] + df["night_zanzibar"]
    df["has_spouse"] = df["travel_with"].apply(lambda x: 1 if x == "Spouse" else 0)
    df["people_x_nights"] = df["total_people"] * df["total_nights"]
    df["mainland_ratio"] = df["night_mainland"] / (df["total_nights"] + 1e-6)
    df["is_luxury_trip"] = df["main_activity"].apply(lambda x: 1 if x in ["Wildlife tourism", "Beach tourism"] else 0)
    df["is_family_trip"] = df["travel_with"].apply(lambda x: 1 if x in ["Spouse", "Children"] else 0)
    df["purpose_x_activity"] = df["purpose"] + "_" + df["main_activity"]

    for col in ["total_female", "total_male"]:
        df[col] = df[col].fillna(0)
    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].fillna(df[col].mode()[0])
    
    return df

# Apply feature engineering
train_df = engineer_features(train_df)
test_df = engineer_features(test_df)

In [6]:
# 🏷️ 6. Feature Selection
# ====================================================
features = [
    'region', 'age_group', 'travel_with', 'purpose', 'main_activity', 'info_source',
    'tour_arrangement', 'package_transport_int', 'package_accomodation', 'package_food',
    'package_transport_tz', 'package_sightseeing', 'package_guided_tour',
    'package_insurance', 'payment_mode', 'first_trip_tz', 'most_impressing',
    'total_female', 'total_male', 'night_mainland', 'night_zanzibar',
    'total_people', 'total_nights', 'has_spouse', 'people_x_nights',
    'mainland_ratio', 'is_luxury_trip', 'is_family_trip', 'purpose_x_activity'
]

X = train_df[features]
y = train_df["log_total_cost"]
X_test = test_df[features]
cat_cols = X.select_dtypes(include="object").columns.tolist()



In [7]:
# 🧪 7. Train/Validation Split
# ====================================================
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_pool = Pool(X_train, y_train, cat_features=cat_cols)
val_pool = Pool(X_val, y_val, cat_features=cat_cols)
test_pool = Pool(X_test, cat_features=cat_cols)

In [8]:
# 🧠 8. Train CatBoost Model 
# ====================================================
tmp_dir = tempfile.mkdtemp() # To Disable Logs

model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.03,
    depth=10,
    l2_leaf_reg=5,
    subsample=0.9,
    random_seed=42,
    loss_function='Quantile:alpha=0.5',
    early_stopping_rounds=75,
    verbose=100,
    train_dir=tmp_dir 
)
model.fit(train_pool, eval_set=val_pool)
shutil.rmtree(tmp_dir, ignore_errors=True)


0:	learn: 0.6663278	test: 0.6528042	best: 0.6528042 (0)	total: 228ms	remaining: 5m 42s
100:	learn: 0.3207369	test: 0.3946342	best: 0.3946342 (100)	total: 8.32s	remaining: 1m 55s
200:	learn: 0.2737341	test: 0.3897525	best: 0.3897164 (195)	total: 16.3s	remaining: 1m 45s
300:	learn: 0.2444943	test: 0.3895439	best: 0.3893289 (294)	total: 24.4s	remaining: 1m 37s
Stopped by overfitting detector  (75 iterations wait)

bestTest = 0.3893288575
bestIteration = 294

Shrink model to first 295 iterations.


In [9]:
# 📊 9. Validation Score (MAE)
# ====================================================
val_preds_log = model.predict(val_pool)
val_preds = np.expm1(val_preds_log)
y_val_exp = np.expm1(y_val)
mae = mean_absolute_error(y_val_exp, val_preds)
print(f"✅ Final Validation MAE: {mae:.2f}")

✅ Final Validation MAE: 3312868.38


In [10]:
# 📤 10. Final Test Prediction + Clipping
# ====================================================
test_preds_log = model.predict(test_pool)
test_preds = np.expm1(test_preds_log)
final_preds = np.clip(test_preds, 50000, 25000000)

In [12]:
# 💾 11. Generate Submission File
# ====================================================

# Go inside submission/
output_path = os.path.join("..", "submission")
os.makedirs(output_path, exist_ok=True)

# Save the file in ../submission/submission.csv
submission = pd.DataFrame({
    "ID": test_ids,
    "total_cost": final_preds
})
submission.to_csv(os.path.join(output_path, "submission.csv"), index=False)
print("✅ Submission.csv saved.")


✅ Submission.csv saved.
