In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:
# ===== Load =====
df = pd.read_excel("Shanghai_Containerized_Freight_Prepared.xlsx", parse_dates=["date"])
df = df.sort_values("date").reset_index(drop=True)


In [None]:
# ===== Target / features =====
target = "europe_base_port"
feature_cols = [c for c in df.columns if ('lag_' in c or 'rollmean_' in c)]
X = df[feature_cols]
y = df[target]

# ===== Diagnostics =====
print("Autocorrelation of target (lag 1):", df[target].autocorr(lag=1))
print("Number of samples:", len(df))
print("Feature count:", len(feature_cols))
print("First few features:", feature_cols[:8], "\n")


In [None]:
# ===== Baseline: naive 'yesterday = today' predictor =====
y_true_base = y.iloc[1:]
y_pred_base = y.shift(1).iloc[1:]

# Errors
errors = y_true_base - y_pred_base
abs_errors = errors.abs()

# Metrics
r2_naive = r2_score(y_true_base, y_pred_base)
rmse_naive = np.sqrt(mean_squared_error(y_true_base, y_pred_base))
mae_naive = mean_absolute_error(y_true_base, y_pred_base)
mape_naive = np.mean(np.abs((y_true_base - y_pred_base) / y_true_base)) * 100
max_error = abs_errors.max()
max_error_date = df.loc[abs_errors.idxmax(), "date"]

print("=== Naive Baseline (previous week) ===")
print(f"RMSE : {rmse_naive:.2f}")
print(f"MAE  : {mae_naive:.2f}")
print(f"MAPE : {mape_naive:.2f}%")
print(f"R²   : {r2_naive:.3f}")
print(f"Max absolute error : {max_error:.2f} (on {max_error_date.date()})\n")


In [6]:
# ===== Train / Test Split =====
split = int(len(df) * 0.7)  # 70% train, 30% test
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

model = CatBoostRegressor(
    iterations=1000,
    depth=6,
    learning_rate=0.05,
    random_seed=42,
    od_type="Iter",
    od_wait=50,
    verbose=100
)

model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)


0:	learn: 2464.9859977	test: 1283.0286615	best: 1283.0286615 (0)	total: 2.6ms	remaining: 2.6s
100:	learn: 160.3751304	test: 543.0276284	best: 489.1221485 (52)	total: 344ms	remaining: 3.06s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 489.1221485
bestIteration = 52

Shrink model to first 53 iterations.


<catboost.core.CatBoostRegressor at 0x1d7f66662d0>

In [7]:
# ===== Evaluate =====
preds = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)
mape = np.mean(np.abs((y_test - preds) / y_test)) * 100
r2 = r2_score(y_test, preds)

print("\n=== Final Test Performance ===")
print(f"RMSE : {rmse:.2f}")
print(f"MAE  : {mae:.2f}")
print(f"MAPE : {mape:.2f}%")
print(f"R²   : {r2:.3f}")
print(f"(Naive baseline R²: {r2_naive:.3f})")



=== Final Test Performance ===
RMSE : 489.12
MAE  : 390.16
MAPE : 24.55%
R²   : 0.833
(Naive baseline R²: 0.994)
