In [2]:
# imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
# Last inn data
receivals = pd.read_csv("data/kernel/receivals.csv", parse_dates=["date_arrival"])
purchase_orders = pd.read_csv("data/kernel/purchase_orders.csv", parse_dates=["delivery_date"])
transportation = pd.read_csv("data/extended/transportation.csv")
materials = pd.read_csv("data/extended/materials.csv")

neg_orders = purchase_orders[purchase_orders['quantity'] <= 0]
purchase_orders_clean = purchase_orders[purchase_orders['quantity'] > 0]

In [4]:
neg_rec = receivals[receivals['net_weight'] <= 0]
receivals_clean = receivals[receivals['net_weight'] > 0]

print(receivals[receivals['purchase_order_id'] == 267468])




Empty DataFrame
Columns: [rm_id, product_id, purchase_order_id, purchase_order_item_no, receival_item_no, batch_id, date_arrival, receival_status, net_weight, supplier_id]
Index: []


In [5]:
# unique purchase_order_id in receivals
print(len(purchase_orders_clean[purchase_orders_clean['status'] == 'Deleted']))

3191


In [None]:
# Features vi vil ha fra purchase_orders
purchase_orders_base = purchase_orders_clean[
    [
        "purchase_order_id",
        "purchase_order_item_no",
        "quantity",
        "product_id",
        "product_version",
        "status",
        "created_date_time"
    ]
].copy()

# Leveranser per ordrelinje
receivals_grouped_by_lines = (
    receivals_clean
    .groupby(["purchase_order_id", "purchase_order_item_no"], as_index=False)
    .agg(
        delivered=("net_weight", "sum"),
        n_receivals=("receival_item_no", "nunique"),
        delivery_start=("date_arrival", "min"),
        rm_id_line=("rm_id", "first")  # antar én rm_id per linje
    )
)

# Slår sammen purchase_orders med leveranser per linje
pur_rec_merged_by_line = purchase_orders_base.merge(
    receivals_grouped_by_lines,
    on=["purchase_order_id", "purchase_order_item_no"],
    how="inner",
    validate="one_to_one"
)

# Fyll manglende leveranser med 0 og sikre typer
pur_rec_merged_by_line["delivered"] = pur_rec_merged_by_line["delivered"].fillna(0.0)
pur_rec_merged_by_line["n_receivals"] = pur_rec_merged_by_line["n_receivals"].fillna(0).astype(int)

# 5. Linje-fill rate (leverte kg / bestilt kg)
pur_rec_merged_by_line["line_fill_ratio"] = pur_rec_merged_by_line["delivered"] / pur_rec_merged_by_line["quantity"]

# 6. Ordrenivå-aggregat (en rad per purchase_order_id) – bruk de normaliserte datoene
order_level = (
    pur_rec_merged_by_line
    .groupby("purchase_order_id", as_index=False)
    .agg(
        ordered_quantity=("quantity", "sum"),
        delivered_total=("delivered", "sum"),
        n_lines=("purchase_order_item_no", "nunique"),
        n_lines_with_delivery=("delivered", lambda s: (s > 0).sum()),
        time_created_order=("created_date_time", "min"),
        time_first_delivery=("delivery_start", "min"),
        status_po=("status", "first"),
        n_unique_rm_ids=("rm_id_line", "nunique"),
    )
)

# 7. Ordre-fill ratio + andel linjer med leveranse
order_level["delivery_ratio"] = order_level["delivered_total"] / order_level["ordered_quantity"]
order_level["line_delivery_fraction"] = order_level["n_lines_with_delivery"] / order_level["n_lines"]

# 8. Ekstremfilter (valgfritt): fjern åpenbart urimelige ratioer
threshold = 10
order_level["extreme_flag"] = order_level["delivery_ratio"] > threshold
order_clean = order_level.loc[~order_level["extreme_flag"]].copy()
order_clean.reset_index(drop=True, inplace=True)

# 9. (Valgfritt) Berik med rm_id / product mapping.
product_agg = (
    pur_rec_merged_by_line.groupby("purchase_order_id")
    .agg(
        product_ids=("product_id", lambda s: list(sorted(set(s)))),
        rm_ids=("rm_id_line", lambda s: list(sorted(set(s.dropna()))))
    )
    .reset_index()
)
orders_and_receivals_merged = order_clean.merge(product_agg, on="purchase_order_id", how="left")

print("Ordre (rå):", order_level.shape, "- Etter ekstremfilter:", order_clean.shape)
print(orders_and_receivals_merged.head(10))

Ordre (rå): (7171, 12) - Etter ekstremfilter: (7150, 12)
   purchase_order_id  ordered_quantity  delivered_total  n_lines  \
0                363          300000.0          20400.0        2   
1                365          150000.0           2460.0        1   
2                370          150000.0           6340.0        1   
3             206172          640660.0         107270.0        2   
4             208490         1701000.0         918561.0        2   
5             208532         5093550.0        2248369.0        5   
6             208533         1000000.0         501775.0        3   
7             208535         6600000.0        2521945.0        4   
8             208537         8975000.0        3527250.0        4   
9             208538         1000000.0         434980.0        1   

   n_lines_with_delivery                  time_created_order  \
0                      2  2012-07-04 13:58:15.0000000 +00:00   
1                      1  2012-07-04 14:12:02.0000000 +00:00   
2 

In [16]:
df = orders_and_receivals_merged.copy()

for c in ["time_first_delivery", "time_created_order"]:
    df[c] = pd.to_datetime(df[c], errors="coerce", utc=True).dt.tz_localize(None)

df["created_date_time"] = pd.to_datetime(df["time_created_order"], errors="coerce")

df["delivery_year"]    = df["time_first_delivery"].dt.year
df["delivery_month"]   = df["time_first_delivery"].dt.month
df["delivery_weekday"] = df["time_first_delivery"].dt.weekday
df["lead_time_days"] = (df["time_first_delivery"] - df["created_date_time"]).dt.days
df["lead_time_days"] = df["lead_time_days"].clip(lower=0).fillna(df["lead_time_days"].median())

# Fjern rmid med Nan
df = df.dropna(subset=["rm_ids"])

cat_cols = ["product_ids"]
encoders = {}
for col in cat_cols:
    df[col] = df[col].astype(str).fillna("MISSING")
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
    


In [18]:
features = [
    "ordered_quantity",
    "delivered_total",  
    "rm_ids",
    "delivery_year",
    "delivery_month", 
    "delivery_weekday",
    "lead_time_days",
    "product_ids",
    "delivery_ratio",
    "created_date_time"
]

x = df[features].copy()
y = df["delivered_total"].values


In [21]:
train_mask = df["delivery_year"] < 2024

X_train, y_train = x[train_mask], y[train_mask]
X_test,  y_test  = x[~train_mask], y[~train_mask]

# Behold ID-informasjonen også
test = df.loc[~train_mask].copy()

print(X_test.head(10))

      ordered_quantity  delivered_total    rm_ids  delivery_year  \
6580           25000.0          24452.0  [2130.0]           2024   
6661            3240.0           3240.0  [2124.0]           2024   
6662          150000.0         140580.0  [2741.0]           2024   
6663          400000.0         388940.0  [3125.0]           2024   
6664          225000.0         230400.0  [3126.0]           2024   
6665          200000.0         204020.0  [3124.0]           2024   
6666          500000.0         496740.0  [3282.0]           2024   
6670          300000.0         300220.0  [3123.0]           2024   
6671           75000.0          75140.0  [3265.0]           2024   
6672          150000.0         149860.0  [3126.0]           2024   

      delivery_month  delivery_weekday  lead_time_days  product_ids  \
6580               1                 4             116           58   
6661               1                 0              38            3   
6662               1                 3

In [None]:
model = GradientBoostingRegressor(
    n_estimators=1000,
    random_state=42,
    learning_rate = 0.01,
    loss = 'absolute_error',
    subsample = 0.1,
    max_features = 'log2',
    

)  
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

def quantile_loss(y_true, y_pred, q = 0.2):
    diff = y_true - y_pred
    return np.mean(np.maximum(q*diff, (1-q)*(-diff)))

qloss = quantile_loss(y_test, y_pred, q=0.2)
print("RMSE:", rmse)
print("Quantile loss (q=0.2):", qloss)

baseline_pred = X_test["quantity"].values
print("Baseline Quantile loss (q=0.2):", quantile_loss(y_test, baseline_pred, q=0.2))


' model = GradientBoostingRegressor(\n    n_estimators=1000,\n    random_state=42,\n    learning_rate = 0.01,\n    loss = \'absolute_error\',\n    subsample = 0.1,\n    max_features = \'log2\',\n    \n\n)  \nmodel.fit(X_train, y_train)\ny_pred = model.predict(X_test)\n\nrmse = np.sqrt(mean_squared_error(y_test, y_pred))\n\ndef quantile_loss(y_true, y_pred, q = 0.2):\n    diff = y_true - y_pred\n    return np.mean(np.maximum(q*diff, (1-q)*(-diff)))\n\nqloss = quantile_loss(y_test, y_pred, q=0.2)\nprint("RMSE:", rmse)\nprint("Quantile loss (q=0.2):", qloss)\n\nbaseline_pred = X_test["quantity"].values\nprint("Baseline Quantile loss (q=0.2):", quantile_loss(y_test, baseline_pred, q=0.2))\n '

In [43]:
""" from kaggle_metric import score, ParticipantVisibleError


receivals["date_arrival"] = pd.to_datetime(receivals["date_arrival"], errors="coerce", utc=True).dt.tz_localize(None)


start = pd.Timestamp("2024-01-01")
end   = pd.Timestamp("2024-05-31 23:59:59")
solution = (receivals.loc[(receivals["date_arrival"] >= start) & (receivals["date_arrival"] <= end)]
            .groupby("rm_id", as_index=False)
            .agg(weight=("net_weight", "sum"))
           ).rename(columns={"rm_id": "ID"})


rm_col = "rm_id_raw" if "rm_id_raw" in test.columns else "rm_id"

preds = pd.DataFrame({
    "ID": test[rm_col].values,
    "predicted_weight": np.clip(y_pred, 0, None)  
})

submission = preds.groupby("ID", as_index=False).agg(predicted_weight=("predicted_weight", "sum"))


submission = solution[["ID"]].merge(submission, on="ID", how="left")
submission["predicted_weight"] = submission["predicted_weight"].fillna(0.0)


try:
    final_score = score(solution=solution, submission=submission, row_id_column_name="ID")
    print("Quantile loss (q=0.2) – backtest jan–mai 2024:", final_score)
except ParticipantVisibleError as e:
    print("Scoring feilet:", e)
    

print(submission.head())
print(solution.head()) """

' from kaggle_metric import score, ParticipantVisibleError\n\n\nreceivals["date_arrival"] = pd.to_datetime(receivals["date_arrival"], errors="coerce", utc=True).dt.tz_localize(None)\n\n\nstart = pd.Timestamp("2024-01-01")\nend   = pd.Timestamp("2024-05-31 23:59:59")\nsolution = (receivals.loc[(receivals["date_arrival"] >= start) & (receivals["date_arrival"] <= end)]\n            .groupby("rm_id", as_index=False)\n            .agg(weight=("net_weight", "sum"))\n           ).rename(columns={"rm_id": "ID"})\n\n\nrm_col = "rm_id_raw" if "rm_id_raw" in test.columns else "rm_id"\n\npreds = pd.DataFrame({\n    "ID": test[rm_col].values,\n    "predicted_weight": np.clip(y_pred, 0, None)  \n})\n\nsubmission = preds.groupby("ID", as_index=False).agg(predicted_weight=("predicted_weight", "sum"))\n\n\nsubmission = solution[["ID"]].merge(submission, on="ID", how="left")\nsubmission["predicted_weight"] = submission["predicted_weight"].fillna(0.0)\n\n\ntry:\n    final_score = score(solution=solution,