In [134]:
# imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

In [93]:
# Last inn data
purchase_orders = pd.read_csv("data/kernel/purchase_orders.csv", parse_dates=["delivery_date", "created_date_time", "modified_date_time"])
receivals = pd.read_csv("data/kernel/receivals.csv", parse_dates=["date_arrival"])
raw_material = pd.read_csv("data/extended/materials.csv")
transportation = pd.read_csv("data/extended/transportation.csv")


purchase_orders_clean = purchase_orders[purchase_orders['quantity'] > 0]
purchase_orders_clean = purchase_orders_clean[purchase_orders_clean['status'] != 'Deleted']
receivals_clean = receivals[receivals['net_weight'] > 0]

purchase_base = purchase_orders_clean[["purchase_order_id", 
                                       "quantity",
                                       "created_date_time",
                                       "purchase_order_item_no",
                                       ]]

receivals_base = receivals_clean[["rm_id",
                                 "purchase_order_id",
                                 "purchase_order_item_no",
                                 "product_id",
                                 "date_arrival",
                                 "net_weight"
                                 ]]

rec_ord_merged = receivals_base.merge(
    purchase_base,
    on = ["purchase_order_id", "purchase_order_item_no"],
    how = "inner"
    )



In [173]:
df = rec_ord_merged.copy()

for c in ["date_arrival", "created_date_time"]:
    df[c] = pd.to_datetime(df[c], errors="coerce", utc=True).dt.tz_localize(None)
    
df["delivery_year"]    = df["date_arrival"].dt.year
df["delivery_month"]   = df["date_arrival"].dt.month
df["delivery_weekday"] = df["date_arrival"].dt.weekday
df["lead_time_days"] = (df["date_arrival"] - df["created_date_time"]).dt.days

df["delivery_month_sin"] = np.sin(2 * np.pi * df["delivery_month"]/12)
df["delivery_month_cos"] = np.cos(2 * np.pi * df["delivery_month"]/12)




In [None]:


features = [
    'rm_id',
    'product_id',
    'quantity',
    'delivery_year',
    'delivery_month_sin',
    'delivery_weekday',
    'lead_time_days'
]

x = df[features].copy()
y = df["net_weight"].values

In [291]:
train_mask = df["date_arrival"] < pd.Timestamp("2024-01-01")

X_train, Y_train = x[train_mask], y[train_mask]
X_test,  Y_test  = x[~train_mask], y[~train_mask]

# Behold ID-informasjonen også
test = df.loc[~train_mask].copy()

In [292]:
model = HistGradientBoostingRegressor(
    learning_rate= 0.05,
    max_depth= 6,
    random_state= 42,
    loss= 'quantile',
    quantile= 0.2,
    max_iter= 500,
    verbose= 1 
)

model.fit(X_train, Y_train)
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
print(f"RMSE: {rmse}")

def quantile_loss(y_true, y_pred, q):
    diff = y_true - y_pred
    return np.mean(np.maximum(q * diff, (q - 1) * diff))

quantile_loss(Y_test, y_pred, 0.2)

Binning 0.007 GB of training data: 0.012 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/500] 1 tree, 31 leaves, max depth = 6, train loss: 2169.86611, val loss: 2179.62176, in 0.008s
[2/500] 1 tree, 31 leaves, max depth = 6, train loss: 2117.76501, val loss: 2127.48403, in 0.007s
[3/500] 1 tree, 31 leaves, max depth = 6, train loss: 2068.71316, val loss: 2078.78197, in 0.007s
[4/500] 1 tree, 31 leaves, max depth = 6, train loss: 2024.60241, val loss: 2034.53146, in 0.009s
[5/500] 1 tree, 31 leaves, max depth = 6, train loss: 1983.36573, val loss: 1993.10614, in 0.006s
[6/500] 1 tree, 31 leaves, max depth = 6, train loss: 1944.95571, val loss: 1954.51925, in 0.008s
[7/500] 1 tree, 31 leaves, max depth = 6, train loss: 1909.08219, val loss: 1918.68019, in 0.003s
[8/500] 1 tree, 31 leaves, max depth = 6, train loss: 1875.81557, val loss: 1885.63717, in 0.013s
[9/500] 1 tree, 31 leaves, max depth = 6, train loss: 1844.47155, val loss: 1854.45339, in 0.00

np.float64(1265.1062871654872)

In [119]:
# Evaluering av modellen
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
import pandas as pd

# Forutsetter at y_pred, Y_test, Y_train, model, X_train, X_test finnes fra forrige celle

# 1. Grunnmetrikker
mae = mean_absolute_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

# 2. Kvantil-tap (q=0.2) – bruker samme funksjon som definert i forrige celle
if 'quantile_loss' in globals():
    qloss = quantile_loss(Y_test, y_pred, 0.2)
else:
    def quantile_loss(y_true, y_hat, q):
        diff = y_true - y_hat
        return np.mean(np.maximum(q * diff, (q - 1) * diff))
    qloss = quantile_loss(Y_test, y_pred, 0.2)

# 3. Baselines
# a) Naiv gjennomsnitt av treningsmålet
baseline_mean = np.full_like(Y_test, Y_train.mean())
baseline_mean_rmse = np.sqrt(mean_squared_error(Y_test, baseline_mean))
# b) Bruke "quantity" som proxy (hvis den finnes i feature settet)
if 'quantity' in X_test.columns:
    baseline_quantity = X_test['quantity'].values
    baseline_quantity_rmse = np.sqrt(mean_squared_error(Y_test, baseline_quantity))
else:
    baseline_quantity_rmse = np.nan

# 4. Residualanalyse
residuals = Y_test - y_pred
res_summary = {
    'residual_mean': residuals.mean(),
    'residual_std': residuals.std(),
    'abs_resid_p50': np.percentile(np.abs(residuals), 50),
    'abs_resid_p90': np.percentile(np.abs(residuals), 90),
    'abs_resid_p95': np.percentile(np.abs(residuals), 95),
    'pct_over_predicted': (residuals < 0).mean(),  # hvor ofte vi overestimerer
}

# 5. Feature importance (kun for trebasert modell)
feature_importance = None
if hasattr(model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

# 6. Samle alt i en tabell
metrics = {
    'RMSE': float(rmse),
    'MAE': float(mae),
    'R2': float(r2),
    'QuantileLoss(q=0.2)': float(qloss),
    'BaselineMean_RMSE': float(baseline_mean_rmse),
    'BaselineQuantity_RMSE': float(baseline_quantity_rmse),
}

print("=== Modellmetrikker ===")
for k, v in metrics.items():
    print(f"{k:22s}: {v:.4f}")

print("\n=== Residualanalyse ===")
for k, v in res_summary.items():
    print(f"{k:22s}: {v:.4f}")

if feature_importance is not None:
    print("\n=== Viktigste features (topp 10) ===")
    print(feature_importance.head(10).to_string(index=False))

# 7. Enkel sanity check: Modell bør slå baseline_mean på RMSE
if rmse < baseline_mean_rmse:
    print("\nModellen slår baseline (gjennomsnitt) på RMSE.")
else:
    print("\nAdvarsel: Modellen slår ikke baseline (gjennomsnitt) – vurder flere features / tuning.")

# 8. Lag en liten DataFrame for inspeksjon
preview = pd.DataFrame({
    'y_true': Y_test[:20],
    'y_pred': y_pred[:20],
    'residual': residuals[:20]
})
print("\nEksempel (første 20 prediksjoner):")
print(preview.to_string(index=False))

=== Modellmetrikker ===
RMSE                  : 6970.3256
MAE                   : 4867.3751
R2                    : 0.3911
QuantileLoss(q=0.2)   : 1274.4167
BaselineMean_RMSE     : 9048.8461
BaselineQuantity_RMSE : 592749.8229

=== Residualanalyse ===
residual_mean         : 3864.2361
residual_std          : 5801.1308
abs_resid_p50         : 2748.2602
abs_resid_p90         : 12363.3172
abs_resid_p95         : 15179.8317
pct_over_predicted    : 0.2074

=== Viktigste features (topp 10) ===
         feature  importance
        quantity    0.616633
      product_id    0.148995
           rm_id    0.093872
  lead_time_days    0.070856
   delivery_year    0.060886
  delivery_month    0.005735
delivery_weekday    0.003022

Modellen slår baseline (gjennomsnitt) på RMSE.

Eksempel (første 20 prediksjoner):
 y_true       y_pred     residual
23460.0 23963.762821  -503.762821
23540.0 23922.736531  -382.736531
25260.0 24358.243679   901.756321
24480.0 23922.736531   557.263469
25020.0 24827.913376 

In [293]:
from kaggle_metric import score, ParticipantVisibleError


receivals["date_arrival"] = pd.to_datetime(receivals["date_arrival"], errors="coerce", utc=True).dt.tz_localize(None)


start = pd.Timestamp("2024-01-01")
end   = pd.Timestamp("2024-05-31 23:59:59")
solution = (receivals.loc[(receivals["date_arrival"] >= start) & (receivals["date_arrival"] <= end)]
            .groupby("rm_id", as_index=False)
            .agg(weight=("net_weight", "sum"))
           ).rename(columns={"rm_id": "ID"})


rm_col = "rm_id_raw" if "rm_id_raw" in test.columns else "rm_id"

preds = pd.DataFrame({
    "ID": test[rm_col].values,
    "predicted_weight": np.clip(y_pred, 0, None)  
})

submission = preds.groupby("ID", as_index=False).agg(predicted_weight=("predicted_weight", "sum"))


submission = solution[["ID"]].merge(submission, on="ID", how="left")
submission["predicted_weight"] = submission["predicted_weight"].fillna(0.0)


try:
    final_score = score(solution=solution, submission=submission, row_id_column_name="ID")
    print("Quantile loss (q=0.2) – backtest jan–mai 2024:", final_score)
except ParticipantVisibleError as e:
    print("Scoring feilet:", e)
    

print(submission.head(10))
print(solution.head(10))

Quantile loss (q=0.2) – backtest jan–mai 2024: 464964.2194653718
       ID  predicted_weight
0  2124.0      1.973111e+04
1  2125.0      2.903043e+04
2  2129.0      8.637025e+04
3  2130.0      8.240163e+06
4  2131.0      1.213245e+05
5  2132.0      5.989139e+04
6  2133.0      2.342615e+04
7  2134.0      3.030719e+05
8  2135.0      1.497985e+05
9  2140.0      1.851539e+06
       ID     weight
0  2124.0     7560.0
1  2125.0    25000.0
2  2129.0    69980.0
3  2130.0  3549704.0
4  2131.0   237344.0
5  2132.0   163266.0
6  2133.0    38740.0
7  2134.0   612846.0
8  2135.0   494030.0
9  2140.0  1046440.0
