In [17]:
pip install xgboost


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [18]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [19]:
# Read the three input tables from the current working directory.
train, test, interactions = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "test.csv", "interactions.csv")
)

# Remove leading/trailing whitespace from every column name so that later
# column lookups by exact name work on all three frames.
for frame in (train, test, interactions):
    frame.columns = frame.columns.str.strip()

In [20]:
# Keep only the interaction rows recorded 15 or more days before the service.
interactions_15 = interactions.loc[interactions["days_before_service"].ge(15)]

# Collapse to one row per (service date, origin hub, destination hub), taking
# the maximum of each cumulative counter among the retained rows.
service_features = interactions_15.groupby(
    ["service_date", "origin_hub_id", "destination_hub_id"],
    as_index=False,
).agg(
    commitments_15=("cumulative_commitments", "max"),
    interest_15=("cumulative_interest_signals", "max"),
)

In [21]:
# Columns that identify a service in both the train/test tables and the
# aggregated interaction features.
service_keys = ["service_date", "origin_hub_id", "destination_hub_id"]

# Left-join so every train/test row is kept even when it has no matching
# interaction features. The fillna(0) is applied to the WHOLE frame: any
# missing value in any column becomes 0, not only the two merged features.
train = train.merge(service_features, on=service_keys, how="left").fillna(0)
test = test.merge(service_features, on=service_keys, how="left").fillna(0)

In [22]:
# Add the same derived columns to both frames (assigned in place on each frame).
for frame in (train, test):
    commits = frame["commitments_15"]
    interest = frame["interest_15"]

    # Simple arithmetic combinations of the two 15-day counters; the +1 in the
    # ratio's denominator avoids division by zero when interest_15 is 0.
    frame["commit_interest_ratio"] = commits / (interest + 1)
    frame["commit_minus_interest"] = commits - interest
    frame["commit_plus_interest"] = commits + interest
    frame["commit_times_interest"] = commits * interest

    # Parse the service date once (day-first format) and reuse it for all
    # three calendar features.
    service_dt = pd.to_datetime(frame["service_date"], dayfirst=True).dt
    frame["service_day"] = service_dt.day
    frame["service_month"] = service_dt.month
    frame["service_weekday"] = service_dt.weekday

In [23]:
# Mean-target encoding of the hubs: for every origin hub and every destination
# hub, attach the average `final_service_units` observed in the training data.
#
# NOTE(review): the averages are computed from ALL training rows, including
# each row's own target and the rows that the later train/validation split
# assigns to validation. The validation metrics are therefore likely
# optimistic; computing the averages on the training fold only (or
# out-of-fold) would remove that leakage.
hub_avg_cols = ["origin_hub_avg", "destination_hub_avg"]

# Fallback for hubs that never occur in the training data. Using the overall
# training mean keeps the feature on the target's scale; the previous blanket
# fillna(0) gave such hubs an average of 0.
global_target_mean = train["final_service_units"].mean()

# Drop averages left over from a previous run of this cell so that re-running
# it does not produce duplicated `_x` / `_y` columns from the merges below.
train = train.drop(columns=hub_avg_cols, errors="ignore")
test = test.drop(columns=hub_avg_cols, errors="ignore")

origin_avg = (
    train.groupby("origin_hub_id")["final_service_units"]
    .mean()
    .rename("origin_hub_avg")
    .reset_index()
)
dest_avg = (
    train.groupby("destination_hub_id")["final_service_units"]
    .mean()
    .rename("destination_hub_avg")
    .reset_index()
)

train = train.merge(origin_avg, on="origin_hub_id", how="left")
train = train.merge(dest_avg, on="destination_hub_id", how="left")

test = test.merge(origin_avg, on="origin_hub_id", how="left")
test = test.merge(dest_avg, on="destination_hub_id", how="left")

# Hubs without a training average get the global training mean.
train[hub_avg_cols] = train[hub_avg_cols].fillna(global_target_mean)
test[hub_avg_cols] = test[hub_avg_cols].fillna(global_target_mean)

# Any other remaining missing value is still replaced by 0, as before.
train = train.fillna(0)
test = test.fillna(0)

In [24]:
# Replace the hub identifiers with integer codes.
# The codes must come from ONE category list shared by train and test.
# Encoding each frame on its own (as before) numbers the hubs by that frame's
# own set of values, so the same hub can receive different codes in train and
# test whenever the two sets differ, and the model then sees the wrong hub.
for col in ["origin_hub_id", "destination_hub_id"]:
    # Category list built from the values of both frames, using the same
    # pandas mechanism (and therefore the same ordering rules) as
    # astype("category") on a single frame.
    hub_categories = (
        pd.concat([train[col], test[col]], ignore_index=True)
        .astype("category")
        .cat.categories
    )
    train[col] = pd.Categorical(train[col], categories=hub_categories).codes
    test[col] = pd.Categorical(test[col], categories=hub_categories).codes

In [25]:
# Model inputs, in the column order the model is trained with.
features = [
    # encoded hub identifiers
    "origin_hub_id",
    "destination_hub_id",
    # 15-day interaction counters
    "commitments_15",
    "interest_15",
    # combinations of the two counters
    "commit_interest_ratio",
    "commit_minus_interest",
    "commit_plus_interest",
    "commit_times_interest",
    # calendar parts of the service date
    "service_day",
    "service_month",
    "service_weekday",
    # per-hub mean of the target
    "origin_hub_avg",
    "destination_hub_avg",
]

# Design matrix and regression target taken from the training frame.
X = train.loc[:, features]
y = train.loc[:, "final_service_units"]

In [26]:
# Randomly hold out 20% of the rows for validation; the fixed seed makes the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)


In [32]:
# Hyper-parameters of the gradient-boosted regressor, collected in one place.
xgb_params = {
    "objective": "reg:squarederror",
    "n_estimators": 1200,
    "learning_rate": 0.03,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
model = XGBRegressor(**xgb_params)

# Fit on the training fold only; the validation fold is used for the metrics.
model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [28]:
# Validation predictions, post-processed the same way as the final
# submission: negative values are clipped to 0, then rounded to whole units.
val_raw = model.predict(X_val)
y_pred = np.clip(val_raw, 0, None).round(0)

# Regression metrics on the held-out fold.
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
# The "Accuracy" printed below is simply R2 expressed as a percentage; it is
# not a classification accuracy.
accuracy = 100 * r2

print(f"MAE : {mae:.2f}")
print(f"MSE : {mse:.2f}")
print(f"R2  : {r2:.2f}")
print(f"Accuracy : {accuracy:.2f}%")

MAE : 260.63
MSE : 150602.92
R2  : 0.90
Accuracy : 89.54%


In [29]:
# Refit the same model on the full training data (train + validation folds)
# before predicting the test set.
model.fit(X, y)

# Same post-processing as for validation, plus a cast to integers:
# clip negatives to 0, round to whole units.
raw_test_preds = model.predict(test[features])
test_preds = np.clip(raw_test_preds, 0, None).round(0).astype(int)

# Store the predictions on the test frame under the target's column name.
test["final_service_units"] = test_preds


In [31]:
# Keep only the two columns written to the submission file and save them
# without the DataFrame index.
submission = test.loc[:, ["service_key", "final_service_units"]]
submission.to_csv("file.csv", index=False)

print(" file.csv generated successfully")

 file.csv generated successfully
