In [1]:
# imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# Viktig: bruk modul-import for low-level XGBoost API
import xgboost as xgb  # nå peker 'xgb' på modulen, ikke en modell-klasse

# (Valgfritt) dersom du senere vil teste sklearn-wrapperen:
# from xgboost import XGBRegressor

In [2]:
# Last inn data
purchase_orders = pd.read_csv("data/kernel/purchase_orders.csv", parse_dates=["delivery_date", "created_date_time", "modified_date_time"])
receivals = pd.read_csv("data/kernel/receivals.csv", parse_dates=["date_arrival"])
raw_material = pd.read_csv("data/extended/materials.csv")
transportation = pd.read_csv("data/extended/transportation.csv")


purchase_orders_clean = purchase_orders[purchase_orders['quantity'] > 0]
purchase_orders_clean = purchase_orders_clean[purchase_orders_clean['status'] != 'Deleted']
receivals_clean = receivals[receivals['net_weight'] > 0]

purchase_base = purchase_orders_clean[["purchase_order_id", 
                                       "quantity",
                                       "created_date_time",
                                       "purchase_order_item_no",
                                       ]]

receivals_base = receivals_clean[["rm_id",
                                 "purchase_order_id",
                                 "purchase_order_item_no",
                                 "product_id",
                                 "date_arrival",
                                 "net_weight"
                                 ]]

rec_ord_merged = receivals_base.merge(
    purchase_base,
    on = ["purchase_order_id", "purchase_order_item_no"],
    how = "inner"
    )



In [3]:
df = rec_ord_merged.copy()

for c in ["date_arrival", "created_date_time"]:
    df[c] = pd.to_datetime(df[c], errors="coerce", utc=True).dt.tz_localize(None)
    
df["delivery_year"]    = df["date_arrival"].dt.year
df["delivery_month"]   = df["date_arrival"].dt.month
df["delivery_weekday"] = df["date_arrival"].dt.weekday
df["lead_time_days"] = (df["date_arrival"] - df["created_date_time"]).dt.days



In [4]:
# Feature engineering uten lekkasje (fjerner target leakage: 'net_weight' og avledet 'delivery_ratio')
# Tidligere features inkluderte net_weight (målvariabelen) og delivery_ratio (avhenger av net_weight) -> ga overfitting og for høye prediksjoner.

# Behold kun variabler som er kjent ved bestillingstidspunkt / før leveransens faktiske vekt er kjent.
# (Hvis 'lead_time_days' brukes i sanntid må den være basert på informasjon tilgjengelig ved prediksjon; beholder den her.)

weight_diff = df["net_weight"] - df["quantity"]  # kun for analyse (ikke feature)

# Ny, ren featureliste
features = [
    'rm_id',
    'product_id',
    'quantity',
    'delivery_year',
    'delivery_month',
    'delivery_weekday',
    'lead_time_days'
]

X = df[features].copy()
Y = df["net_weight"].values  # target

train_mask = df["date_arrival"] < pd.Timestamp("2024-01-01")
X_train, Y_train = X[train_mask], Y[train_mask]
X_test,  Y_test  = X[~train_mask], Y[~train_mask]

test = df.loc[~train_mask].copy()

print("Train/Test shapes:", X_train.shape, X_test.shape)
print("Head features:\n", X_train.head())

Train/Test shapes: (116169, 7) (5993, 7)
Head features:
    rm_id  product_id   quantity  delivery_year  delivery_month  \
0  365.0  91900143.0  1975000.0           2004               6   
1  365.0  91900143.0  1975000.0           2004               6   
2  365.0  91900143.0  1500000.0           2004               6   
3  365.0  91900143.0  1500000.0           2004               6   
4  379.0  91900296.0   125000.0           2004               6   

   delivery_weekday  lead_time_days  
0                 1             153  
1                 1             153  
2                 1             158  
3                 1             158  
4                 1             -10  


In [8]:
# Revidert XGBoost kvantilmodell (q=0.2) – manuell early stopping (feval ikke støttet i miljøet)
q = 0.2

import numpy as np
import pandas as pd
import xgboost as xgb
import traceback, math, copy

print("[Info] xgboost version:", xgb.__version__)

# --- Robust pre-flight diagnostikk og rensing ---

def _clean_features(df_in: pd.DataFrame, name: str):
    df = df_in.copy()
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    for c in df.columns:
        if df[c].dtype == 'object':
            conv = pd.to_numeric(df[c], errors='coerce')
            if conv.isna().all():
                codes, _ = pd.factorize(df[c].astype(str))
                df[c] = codes.astype('int32')
            else:
                df[c] = conv
        elif str(df[c].dtype).startswith('Int'):
            df[c] = df[c].astype('float32')
    nan_total = int(df.isna().sum().sum())
    if nan_total > 0:
        df.fillna(-1, inplace=True)
    df = df.astype('float32')
    print(f"[Diag] {name}: shape={df.shape}, nan_filled={nan_total}, dtypes_ok={all(np.issubdtype(t, np.number) for t in df.dtypes)}")
    return df

X_train_clean = _clean_features(X_train, 'train')
X_test_clean  = _clean_features(X_test, 'test')

GRAD_DAMP = 0.8

def quantile_objective(y_pred: np.ndarray, dtrain: xgb.DMatrix):
    y_true = dtrain.get_label()
    diff = y_true - y_pred
    base_grad = np.where(diff > 0, -q, 1 - q).astype(np.float32)
    grad = base_grad * GRAD_DAMP
    hess = np.full_like(grad, 5e-6, dtype=np.float32)
    return grad, hess

# Eval-funksjon brukt i manuell loop

def quantile_loss(y_true, y_hat, q=0.2):
    d = y_true - y_hat
    return float(np.mean(np.where(d > 0, q * d, (1 - q) * (-d))))

# Sample weights
p85 = np.percentile(Y_train, 85)
p95 = np.percentile(Y_train, 95)
sample_weight = np.ones_like(Y_train, dtype=float)
sample_weight[Y_train > p85] = 0.7
sample_weight[Y_train > p95] = 0.5

train_mat = xgb.DMatrix(X_train_clean.values, label=Y_train, weight=sample_weight, feature_names=list(X_train_clean.columns))
valid_mat = xgb.DMatrix(X_test_clean.values,  label=Y_test,  feature_names=list(X_test_clean.columns))

params = {
    "eta": 0.03,
    "max_depth": 5,
    "min_child_weight": 11,
    "subsample": 0.65,
    "colsample_bytree": 0.65,
    "gamma": 0.6,
    "lambda": 5.0,
    "alpha": 1.0,
    "tree_method": "hist",
    "objective": "reg:squarederror",  # placeholder
    "base_score": float(np.quantile(Y_train, q) * 0.95),
    "seed": 42,
}

num_round = 1600
patience = 120
best_iter = -1
best_loss = math.inf
best_raw_model = None
val_history = []
train_history = []
no_improve = 0

# Manuell boosting loop (1 iterasjon av gangen) for å kunne evaluere kvantiltap eksplisitt
bst = None
for i in range(num_round):
    try:
        bst = xgb.train(
            params=params,
            dtrain=train_mat,
            num_boost_round=1,
            xgb_model=bst,
            obj=quantile_objective,
            verbose_eval=False,
        )
    except Exception as e:
        print(f"[FEIL] Iterasjon {i} under trening:", e)
        traceback.print_exc()
        raise

    # Prediksjoner for eval
    pred_val = bst.predict(valid_mat)
    pred_trn = bst.predict(train_mat)
    loss_val = quantile_loss(Y_test, pred_val, q)
    loss_trn = quantile_loss(Y_train, pred_trn, q)
    val_history.append(loss_val)
    train_history.append(loss_trn)

    # Early stopping logikk
    if loss_val + 1e-10 < best_loss:  # liten margin for numerisk støy
        best_loss = loss_val
        best_iter = i
        best_raw_model = bst.save_raw()
        no_improve = 0
    else:
        no_improve += 1

    if (i + 1) % 100 == 0 or i == 0:
        print(f"Iter {i:4d} | val_q20_loss={loss_val:.6f} | train_q20_loss={loss_trn:.6f} | best={best_loss:.6f} (iter {best_iter})")
    if no_improve >= patience:
        print(f"[EarlyStopping] Ingen forbedring siste {patience} runder. Stopper ved iter {i}.")
        break

if best_raw_model is not None and best_iter != i:
    # Last tilbake beste modelltilstand
    bst = xgb.Booster(params=params)
    bst.load_model(bytearray(best_raw_model))

print(f"Best iteration: {best_iter}  (val_q20_loss={best_loss:.6f})")

# Prediksjon med beste modell
try:
    y_pred_raw = bst.predict(valid_mat)
except Exception as e:
    print("[FEIL] Under slutt-prediksjon:", e)
    traceback.print_exc()
    raise

BIAS_SHIFT = 0.985
y_pred = y_pred_raw * BIAS_SHIFT
upper_cap = np.percentile(Y_train, 99.7)
y_pred = np.clip(y_pred, 0, upper_cap)

final_val_loss = quantile_loss(Y_test, y_pred, q)
print(f"Revidert modell – kvantil-tap q={q}: {final_val_loss:.6f}")

baseline_const = np.full_like(Y_test, np.quantile(Y_train, q))
print("Baseline (konstant) q-loss:", quantile_loss(Y_test, baseline_const, q=q))
if 'quantity' in X_test.columns:
    baseline_quantity = 0.85 * X_test['quantity'].values
    print("Baseline (0.85*quantity) q-loss:", quantile_loss(Y_test, baseline_quantity, q=q))

from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
print("RMSE (sekundær):", rmse)

under_ratio = (y_pred <= Y_test).mean()
print(f"Andel prediksjoner <= faktisk vekt: {under_ratio:.3f}")

# Siste 5 val-loss
print("Siste 5 val_q20_loss:", val_history[-5:])


[Info] xgboost version: 3.0.1
[Diag] train: shape=(116169, 7), nan_filled=0, dtypes_ok=True
[Diag] test: shape=(5993, 7), nan_filled=0, dtypes_ok=True
Iter    0 | val_q20_loss=2592.805707 | train_q20_loss=2219.157409 | best=2592.805707 (iter 0)
Iter   99 | val_q20_loss=2592.805707 | train_q20_loss=2219.157409 | best=2592.805707 (iter 0)
Iter   99 | val_q20_loss=2592.805707 | train_q20_loss=2219.157409 | best=2592.805707 (iter 0)
[EarlyStopping] Ingen forbedring siste 120 runder. Stopper ved iter 120.
Best iteration: 0  (val_q20_loss=2592.805707)
Revidert modell – kvantil-tap q=0.2: 2592.072204
Baseline (konstant) q-loss: 2596.3913899549475
Baseline (0.85*quantity) q-loss: 267352.41462205903
RMSE (sekundær): 13728.278004728034
Andel prediksjoner <= faktisk vekt: 0.789
Siste 5 val_q20_loss: [2592.8057066577676, 2592.8057066577676, 2592.8057066577676, 2592.8057066577676, 2592.8057066577676]
[EarlyStopping] Ingen forbedring siste 120 runder. Stopper ved iter 120.
Best iteration: 0  (val_q2

In [10]:
# Alternativ kvantil-modell: GradientBoostingRegressor (sklearn) + enkel ratio-baseline
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import pandas as pd

print("[AltModel] Starter treningsløp for sklearn GradientBoostingRegressor (quantile 0.2)")

# Sikkerhetskopi / rensing dersom ikke kjørt tidligere
if 'X_train_clean' not in globals():
    def _quick_clean(df_in):
        df = df_in.copy()
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        num = df.select_dtypes(include=[np.number])
        if num.shape[1] != df.shape[1]:
            # Bruk faktoriser for ikke-numeriske
            for c in df.columns:
                if c not in num.columns:
                    codes, _ = pd.factorize(df[c].astype(str))
                    df[c] = codes
        df.fillna(-1, inplace=True)
        return df.astype('float32')
    X_train_clean = _quick_clean(X_train)
    X_test_clean  = _quick_clean(X_test)

# GBR med kvantil-loss
gbr = GradientBoostingRegressor(
    loss='quantile',
    alpha=0.2,
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    min_samples_leaf=60,  # glatter
    subsample=0.7,
    random_state=42
)

gbr.fit(X_train_clean, Y_train)

pred_gbr_raw = gbr.predict(X_test_clean)
BIAS_SHIFT_GBR = 0.985  # samme konservative justering
pred_gbr = np.clip(pred_gbr_raw * BIAS_SHIFT_GBR, 0, np.percentile(Y_train, 99.5))

# Kvantil-tap funksjon

def quantile_loss(y_true, y_hat, q=0.2):
    d = y_true - y_hat
    return float(np.mean(np.where(d > 0, q * d, (1 - q) * (-d))))

q_gbr = quantile_loss(Y_test, pred_gbr, q=0.2)
print(f"[AltModel] GBR kvantil-tap q=0.2: {q_gbr:.6f}")

baseline_const = np.full_like(Y_test, np.quantile(Y_train, 0.2))
print("[AltModel] Baseline (konstant) q-loss:", quantile_loss(Y_test, baseline_const, q=0.2))

if 'quantity' in X_test.columns:
    baseline_quantity = 0.85 * X_test['quantity'].values
    print("[AltModel] Baseline (0.85*quantity) q-loss:", quantile_loss(Y_test, baseline_quantity, q=0.2))

under_ratio_gbr = (pred_gbr <= Y_test).mean()
print(f"[AltModel] Andel prediksjoner <= faktisk vekt: {under_ratio_gbr:.3f}")

# Enkel produkt-basert ratio-baseline (q20 av (net_weight/quantity) * framtidig quantity)
print("[AltModel] Beregner produkt-basert ratio-baseline...")
train_df_tmp = X_train.copy()
train_df_tmp['net_weight'] = Y_train
# Unngå deling på null
safe_quantity = np.clip(train_df_tmp['quantity'].values, 1e-6, None)
ratio = train_df_tmp['net_weight'].values / safe_quantity
train_df_tmp['ratio'] = ratio
ratio_q20_per_prod = train_df_tmp.groupby('product_id')['ratio'].quantile(0.2)

def ratio_predict(test_frame):
    r = ratio_q20_per_prod.reindex(test_frame['product_id']).values
    # fallback global q20 dersom NaN
    global_q20 = np.quantile(ratio, 0.2)
    r = np.where(np.isnan(r), global_q20, r)
    return test_frame['quantity'].values * r

ratio_pred = ratio_predict(X_test)
ratio_pred = np.clip(ratio_pred, 0, np.percentile(Y_train, 99.5))
q_ratio = quantile_loss(Y_test, ratio_pred, q=0.2)
print(f"[AltModel] Produkt-ratio baseline q20-loss: {q_ratio:.6f}")
print(f"[AltModel] Ratio underestimeringsandel: {(ratio_pred <= Y_test).mean():.3f}")

# Feature importance (GBR)
fi = gbr.feature_importances_
fi_map = {col: round(float(val), 5) for col, val in zip(X_train_clean.columns, fi)}
print("[AltModel] GBR feature_importances_:", fi_map)

# Velg hvilken prediksjon som skal brukes nedstrøms (sett y_pred_gbr som standard)
y_pred_gbr = pred_gbr  # kan refereres i scoring-cell hvis ønskelig


[AltModel] Starter treningsløp for sklearn GradientBoostingRegressor (quantile 0.2)
[AltModel] GBR kvantil-tap q=0.2: 1269.060038
[AltModel] Baseline (konstant) q-loss: 2596.3913899549475
[AltModel] Baseline (0.85*quantity) q-loss: 267352.41462205903
[AltModel] Andel prediksjoner <= faktisk vekt: 0.800
[AltModel] Beregner produkt-basert ratio-baseline...
[AltModel] Produkt-ratio baseline q20-loss: 2267.711037
[AltModel] Ratio underestimeringsandel: 0.638
[AltModel] GBR feature_importances_: {'rm_id': 0.10426, 'product_id': 0.14675, 'quantity': 0.59104, 'delivery_year': 0.05711, 'delivery_month': 0.01104, 'delivery_weekday': 0.0052, 'lead_time_days': 0.0846}


In [11]:
from kaggle_metric import score, ParticipantVisibleError


receivals["date_arrival"] = pd.to_datetime(receivals["date_arrival"], errors="coerce", utc=True).dt.tz_localize(None)


start = pd.Timestamp("2024-01-01")
end   = pd.Timestamp("2024-05-31 23:59:59")
solution = (receivals.loc[(receivals["date_arrival"] >= start) & (receivals["date_arrival"] <= end)]
            .groupby("rm_id", as_index=False)
            .agg(weight=("net_weight", "sum"))
           ).rename(columns={"rm_id": "ID"})


rm_col = "rm_id_raw" if "rm_id_raw" in test.columns else "rm_id"

preds = pd.DataFrame({
    "ID": test[rm_col].values,
    "predicted_weight": np.clip(y_pred_gbr, 0, None)  
})

submission = preds.groupby("ID", as_index=False).agg(predicted_weight=("predicted_weight", "sum"))


submission = solution[["ID"]].merge(submission, on="ID", how="left")
submission["predicted_weight"] = submission["predicted_weight"].fillna(0.0)


try:
    final_score = score(solution=solution, submission=submission, row_id_column_name="ID")
    print("Quantile loss (q=0.2) – backtest jan–mai 2024:", final_score)
except ParticipantVisibleError as e:
    print("Scoring feilet:", e)
    

print(submission.head())
print(solution.head())

Quantile loss (q=0.2) – backtest jan–mai 2024: 450569.7955831535
       ID  predicted_weight
0  2124.0      1.943117e+04
1  2125.0      3.163805e+04
2  2129.0      1.015021e+05
3  2130.0      8.532402e+06
4  2131.0      1.336900e+05
       ID     weight
0  2124.0     7560.0
1  2125.0    25000.0
2  2129.0    69980.0
3  2130.0  3549704.0
4  2131.0   237344.0
