In [24]:
# imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
# Last inn data
receivals = pd.read_csv("data/kernel/receivals.csv", parse_dates=["date_arrival"])
purchase_orders = pd.read_csv("data/kernel/purchase_orders.csv", parse_dates=["delivery_date"])
transportation = pd.read_csv("data/extended/transportation.csv")
materials = pd.read_csv("data/extended/materials.csv")

neg_orders = purchase_orders[purchase_orders['quantity'] <= 0]
purchase_orders_clean = purchase_orders[purchase_orders['quantity'] > 0]



In [4]:
# Summer receivals per ordrelinje
receivals_grouped = receivals.groupby(
    ["purchase_order_id", "purchase_order_item_no"]
).agg(
    delivered=("net_weight", "sum"),
    n_receivals=("receival_item_no", "nunique"),   # antall del-leveranser
    delivery_start=("date_arrival", "min"),
    delivery_end=("date_arrival", "max")
).reset_index()

# Merge med purchase_orders
merged = receivals_grouped.merge(
    purchase_orders_clean,
    on=["purchase_order_id", "purchase_order_item_no"],
    how="inner"
)

# Beregn delivery ratio
merged["delivery_ratio"] = merged["delivered"] / merged["quantity"]

# Threshold for "ekstrem"
threshold = 10
merged["extreme_flag"] = merged["delivery_ratio"] > threshold

# Fjerner ekstreme observasjoner
merged_clean = merged.loc[~merged["extreme_flag"]].copy()
merged_clean.reset_index(drop=True, inplace=True)

# Ta ut kun unike koblinger mellom product_id og rm_id
materials_map = materials[["product_id", "rm_id"]].drop_duplicates()

# Merge inn i merged_clean
merged_clean = merged_clean.merge(materials_map, on="product_id", how="left")

print(merged_clean.head(10))

   purchase_order_id  purchase_order_item_no  delivered  n_receivals  \
0              363.0                    10.0     8860.0            1   
1              363.0                    10.0     8860.0            1   
2              363.0                    10.0     8860.0            1   
3              363.0                    10.0     8860.0            1   
4              363.0                    10.0     8860.0            1   
5              363.0                    10.0     8860.0            1   
6              363.0                    10.0     8860.0            1   
7              363.0                    10.0     8860.0            1   
8              363.0                    10.0     8860.0            1   
9              363.0                    30.0    11540.0            1   

              delivery_start               delivery_end  quantity  \
0  2012-07-10 10:40:00+02:00  2012-07-10 10:40:00+02:00  150000.0   
1  2012-07-10 10:40:00+02:00  2012-07-10 10:40:00+02:00  150000.0   


In [5]:
df = merged_clean.copy()

for c in ["delivery_date", "created_date_time"]:
    df[c] = pd.to_datetime(df[c], errors="coerce", utc=True).dt.tz_localize(None)

df["delivery_date"] = pd.to_datetime(df["delivery_date"], errors="coerce")
df["created_date_time"] = pd.to_datetime(df["created_date_time"], errors="coerce")


df["delivery_year"]    = df["delivery_date"].dt.year
df["delivery_month"]   = df["delivery_date"].dt.month
df["delivery_weekday"] = df["delivery_date"].dt.weekday
df["lead_time_days"] = (df["delivery_date"] - df["created_date_time"]).dt.days
df["lead_time_days"] = df["lead_time_days"].clip(lower=0).fillna(df["lead_time_days"].median())

# Fjern rmid med Nan
df = df.dropna(subset=["rm_id"])

cat_cols = ["product_id", "product_version"]
encoders = {}
for col in cat_cols:
    df[col] = df[col].astype(str).fillna("MISSING")
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
    


In [45]:
features = [
    "quantity",
    "product_id", 
    "product_version", 
    "rm_id",
    "delivery_year",
    "delivery_month", 
    "delivery_weekday",
    "lead_time_days",
]

x = df[features].copy()
y = df["delivered"].values


In [47]:
train_mask = df["delivery_date"] < pd.Timestamp("2024-01-01")

X_train, y_train = x[train_mask], y[train_mask]
X_test,  y_test  = x[~train_mask], y[~train_mask]

# Behold ID-informasjonen også
test = df.loc[~train_mask].copy()

print(X_test.head(10))

        quantity  product_id  product_version   rm_id  delivery_year  \
236776   25000.0          35                4  3581.0           2025   
236777   25000.0          35                4  4343.0           2025   
236778   25000.0          35                4  4481.0           2025   
237653    3240.0           3                2  1876.0           2024   
237654    3240.0           3                2  2124.0           2024   
237655   25000.0          22                6  2741.0           2024   
237656   25000.0          22                6  2741.0           2024   
237657   25000.0          22                6  2741.0           2024   
237658   25000.0          22                6  2741.0           2024   
237659   25000.0          22                6  2741.0           2024   

        delivery_month  delivery_weekday  lead_time_days  
236776               4                 1             575  
236777               4                 1             575  
236778               4        

In [None]:
model = GradientBoostingRegressor(
    n_estimators=1000,
    random_state=42,
    learning_rate = 0.01,
    loss = 'absolute_error',
    subsample = 0.1,
    max_features = 'log2',
    

)  
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

def quantile_loss(y_true, y_pred, q = 0.2):
    diff = y_true - y_pred
    return np.mean(np.maximum(q*diff, (1-q)*(-diff)))

qloss = quantile_loss(y_test, y_pred, q=0.2)
print("RMSE:", rmse)
print("Quantile loss (q=0.2):", qloss)

baseline_pred = X_test["quantity"].values
print("Baseline Quantile loss (q=0.2):", quantile_loss(y_test, baseline_pred, q=0.2))


RMSE: 145791.56986372845
Quantile loss (q=0.2): 20261.43728539659
Baseline Quantile loss (q=0.2): 24601.578003978007


In [51]:
from kaggle_metric import score, ParticipantVisibleError


receivals["date_arrival"] = pd.to_datetime(receivals["date_arrival"], errors="coerce", utc=True).dt.tz_localize(None)


start = pd.Timestamp("2024-01-01")
end   = pd.Timestamp("2024-05-31 23:59:59")
solution = (receivals.loc[(receivals["date_arrival"] >= start) & (receivals["date_arrival"] <= end)]
            .groupby("rm_id", as_index=False)
            .agg(weight=("net_weight", "sum"))
           ).rename(columns={"rm_id": "ID"})


rm_col = "rm_id_raw" if "rm_id_raw" in test.columns else "rm_id"

preds = pd.DataFrame({
    "ID": test[rm_col].values,
    "predicted_weight": np.clip(y_pred, 0, None)  
})

submission = preds.groupby("ID", as_index=False).agg(predicted_weight=("predicted_weight", "sum"))


submission = solution[["ID"]].merge(submission, on="ID", how="left")
submission["predicted_weight"] = submission["predicted_weight"].fillna(0.0)


try:
    final_score = score(solution=solution, submission=submission, row_id_column_name="ID")
    print("Quantile loss (q=0.2) – backtest jan–mai 2024:", final_score)
except ParticipantVisibleError as e:
    print("Scoring feilet:", e)
    

print(submission.head())
print(solution.head())

Quantile loss (q=0.2) – backtest jan–mai 2024: 6523553.691872911
       ID  predicted_weight
0  2124.0      1.130207e+04
1  2125.0      1.603308e+04
2  2129.0      9.256538e+04
3  2130.0      3.069798e+07
4  2131.0      1.800674e+06
       ID     weight
0  2124.0     7560.0
1  2125.0    25000.0
2  2129.0    69980.0
3  2130.0  3549704.0
4  2131.0   237344.0
