In [1]:
# imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Last inn data
receivals = pd.read_csv("data/kernel/receivals.csv", parse_dates=["date_arrival"])
purchase_orders = pd.read_csv("data/kernel/purchase_orders.csv", parse_dates=["delivery_date"])
transportation = pd.read_csv("data/extended/transportation.csv")
materials = pd.read_csv("data/extended/materials.csv")

neg_orders = purchase_orders[purchase_orders['quantity'] <= 0]
purchase_orders_clean = purchase_orders[purchase_orders['quantity'] > 0]

In [13]:
# Summer receivals per ordrelinje
receivals_grouped = receivals.groupby(
    ["purchase_order_id", "purchase_order_item_no"]
).agg(
    delivered=("net_weight", "sum"),
    n_receivals=("receival_item_no", "nunique"),   # antall del-leveranser
    delivery_start=("date_arrival", "min"),
    delivery_end=("date_arrival", "max")
).reset_index()

# Merge med purchase_orders
merged = receivals_grouped.merge(
    purchase_orders_clean,
    on=["purchase_order_id", "purchase_order_item_no"],
    how="inner"
)

# Beregn delivery ratio
merged["delivery_ratio"] = merged["delivered"] / merged["quantity"]

# Threshold for "ekstrem"
threshold = 10
merged["extreme_flag"] = merged["delivery_ratio"] > threshold

# Fjerner ekstreme observasjoner
merged_clean = merged.loc[~merged["extreme_flag"]].copy()
merged_clean.reset_index(drop=True, inplace=True)

# Ta ut kun unike koblinger mellom product_id og rm_id
materials_map = materials[["product_id", "rm_id"]].drop_duplicates()

# Merge inn i merged_clean
merged_clean = merged_clean.merge(materials_map, on="product_id", how="left")

print(merged_clean.head(10))

   purchase_order_id  purchase_order_item_no  delivered  n_receivals  \
0              363.0                    10.0     8860.0            1   
1              363.0                    10.0     8860.0            1   
2              363.0                    10.0     8860.0            1   
3              363.0                    10.0     8860.0            1   
4              363.0                    10.0     8860.0            1   
5              363.0                    10.0     8860.0            1   
6              363.0                    10.0     8860.0            1   
7              363.0                    10.0     8860.0            1   
8              363.0                    10.0     8860.0            1   
9              363.0                    30.0    11540.0            1   

              delivery_start               delivery_end  quantity  \
0  2012-07-10 10:40:00+02:00  2012-07-10 10:40:00+02:00  150000.0   
1  2012-07-10 10:40:00+02:00  2012-07-10 10:40:00+02:00  150000.0   


In [9]:
df = merged_clean.copy()

for c in ["delivery_date", "created_date_time"]:
    df[c] = pd.to_datetime(df[c], errors="coerce", utc=True).dt.tz_localize(None)

df["delivery_date"] = pd.to_datetime(df["delivery_date"], errors="coerce")
df["created_date_time"] = pd.to_datetime(df["created_date_time"], errors="coerce")


df["delivery_year"]    = df["delivery_date"].dt.year
df["delivery_month"]   = df["delivery_date"].dt.month
df["delivery_weekday"] = df["delivery_date"].dt.weekday
df["lead_time_days"] = (df["delivery_date"] - df["created_date_time"]).dt.days
df["lead_time_days"] = df["lead_time_days"].clip(lower=0).fillna(df["lead_time_days"].median())

cat_cols = ["product_id", "product_version", "rm_id"]
encoders = {}
for col in cat_cols:
    df[col] = df[col].astype(str).fillna("MISSING")
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
    


In [10]:
features = [
    "quantity",
    "product_id", 
    "product_version", 
    "rm_id",
    "delivery_month", 
    "delivery_weekday",
    "lead_time_days",
]

x = df[features].copy()
y = df["delivered"].values


In [11]:
train_mask = df["delivery_date"] < pd.Timestamp("2024-01-01")
X_train, y_train = x[train_mask], y[train_mask]
X_test,  y_test  = x[~train_mask], y[~train_mask]

print("Train:", X_train.shape, " Test:", X_test.shape)

Train: (237760, 7)  Test: (8548, 7)


In [14]:
model = RandomForestRegressor(
    n_estimators = 100, 
    random_state = 42,
    max_depth= None,
    n_jobs= -1
    )

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

def quantile_loss(y_true, y_pred, q = 0.2):
    diff = y_true - y_pred
    return np.mean(np.maximum(q*diff, (1-q)*(-diff)))

qloss = quantile_loss(y_test, y_pred, q=0.2)
print("RMSE:", rmse)
print("Quantile loss (q=0.2):", qloss)

baseline_pred = X_test["quantity"].values
print("Baseline Quantile loss (q=0.2):", quantile_loss(y_test, baseline_pred, q=0.2))


RMSE: 122349.68329345141
Quantile loss (q=0.2): 17517.07342890465
Baseline Quantile loss (q=0.2): 24599.471127749184


In [None]:
import pandas as pd
import numpy as np

# --- 0) Forutsetninger ---
# - 'receivals' er lastet, og inneholder kolonnene: rm_id, net_weight, date_arrival
# - 'merged_clean' er datasettet du brukte til modelltrening
# - 'test' er delsettet av merged_clean med delivery_date i 2024 (samme rader som y_pred er beregnet for)
# - 'y_pred' er modellens prediksjoner på 'test' (per ordrelinje)
# - score() og ParticipantVisibleError er importert fra koden din

# Tips: hvis du tidligere label-encodet rm_id, sørg for å ha en "rå" kopi før encoding:
# df["rm_id_raw"] = original rm_id før encoding. Hvis du ikke har det, bruk merged_clean["rm_id"] direkte om den er ikke-encodet.

# --- 1) Normaliser datoer til datetime uten tz (for robuste filtrer) ---
receivals["date_arrival"] = pd.to_datetime(receivals["date_arrival"], errors="coerce", utc=True).dt.tz_localize(None)

# --- 2) Lag solution: faktisk levert (A) pr rm_id i jan–mai 2024 ---
start = pd.Timestamp("2024-01-01")
end   = pd.Timestamp("2024-05-31 23:59:59")
solution = (receivals.loc[(receivals["date_arrival"] >= start) & (receivals["date_arrival"] <= end)]
            .groupby("rm_id", as_index=False)
            .agg(weight=("net_weight", "sum"))
           ).rename(columns={"rm_id": "ID"})

# --- 3) Lag submission: predikert vekt (F) pr rm_id i samme vindu ---
# Sørg for at 'test' har en kolonne med den ekte rm_id (ikke-encodet). Hvis du bare har encodet verdi,
# hent rm_id fra merged_clean før encoding, f.eks. via en ekstra kolonne 'rm_id_raw'.
rm_col = "rm_id_raw" if "rm_id_raw" in test.columns else "rm_id"

preds = pd.DataFrame({
    "ID": test[rm_col].values,
    "predicted_weight": np.clip(y_pred, 0, None)  # for sikkerhets skyld, ikke-negative
})

submission = preds.groupby("ID", as_index=False).agg(predicted_weight=("predicted_weight", "sum"))

# --- 4) Sørg for at ALLE ID-er i solution finnes i submission (ellers feiler score()) ---
# Konservativt valg: manglende ID-er får 0 som prediksjon
submission = solution[["ID"]].merge(submission, on="ID", how="left")
submission["predicted_weight"] = submission["predicted_weight"].fillna(0.0)

# --- 5) Kjør metrikken din ---
try:
    final_score = score(solution=solution, submission=submission, row_id_column_name="ID")
    print("Quantile loss (q=0.2) – backtest jan–mai 2024:", final_score)
except ParticipantVisibleError as e:
    print("Scoring feilet:", e)