# **First Step: Data cleaning**

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Load the five raw tables from the working directory
transactions = pd.read_csv("transactions.csv")
clients = pd.read_csv("clients.csv")
products = pd.read_csv("products.csv")
stocks = pd.read_csv("stocks.csv")
stores = pd.read_csv("stores.csv")

# Cast every ID column to string (one astype call per table) so that
# join keys share the same dtype across all tables
transactions = transactions.astype({'ClientID': str, 'ProductID': str, 'StoreID': str})
clients = clients.astype({'ClientID': str})
products = products.astype({'ProductID': str})
stores = stores.astype({'StoreID': str})
stocks = stocks.astype({'ProductID': str})


fatal: destination path 'Hackthon-Eleven' already exists and is not an empty directory.
/content/Hackthon-Eleven
clients.csv  Hackthon-Eleven	products.csv  stores.csv       transactions.csv
code.ipynb   itemcf_recall.csv	README.md     test_clean.csv
demo	     master_train.csv	stocks.csv    train_clean.csv


In [None]:
def basic_check(df, name):
    """Print a short data-quality summary for one table.

    Args:
        df: DataFrame to inspect.
        name: Label printed in the section header.

    Prints the shape, the per-column count of missing values and the number
    of fully duplicated rows, followed by a blank line. Returns nothing.
    """
    missing_per_column = df.isna().sum()
    duplicate_rows = df.duplicated().sum()
    print(
        f"===== {name} =====",
        f"Shape: {df.shape}",
        "Missing values:",
        missing_per_column,
        f"Duplicates: {duplicate_rows}",
        "",
        sep="\n",
    )

# Run the same quality summary on every raw table
for table_name, table in [
    ("transactions", transactions),
    ("clients", clients),
    ("products", products),
    ("stocks", stocks),
    ("stores", stores),
]:
    basic_check(table, table_name)


===== transactions =====
Shape: (1177175, 6)
Missing values:
ClientID               0
ProductID              0
SaleTransactionDate    0
StoreID                0
Quantity               0
SalesNetAmountEuro     0
dtype: int64
Duplicates: 5121

===== clients =====
Shape: (424037, 7)
Missing values:
ClientID                 0
ClientSegment            0
ClientCountry            0
ClientOptINEmail         0
ClientOptINPhone         0
ClientGender         60795
Age                 304075
dtype: int64
Duplicates: 0

===== products =====
Shape: (47458, 5)
Missing values:
ProductID       0
Category        0
FamilyLevel1    0
FamilyLevel2    0
Universe        0
dtype: int64
Duplicates: 0

===== stocks =====
Shape: (16024, 3)
Missing values:
StoreCountry    0
ProductID       0
Quantity        0
dtype: int64
Duplicates: 0

===== stores =====
Shape: (606, 2)
Missing values:
StoreID         0
StoreCountry    0
dtype: int64
Duplicates: 0



In [None]:
# Drop fully duplicated rows from the three tables that are de-duplicated here
transactions, clients, products = (
    frame.drop_duplicates() for frame in (transactions, clients, products)
)

In [None]:
# Keep only regular sales: strictly positive net amount AND strictly positive quantity
has_positive_amount = transactions['SalesNetAmountEuro'].gt(0)
has_positive_quantity = transactions['Quantity'].gt(0)
transactions = transactions.loc[has_positive_amount & has_positive_quantity]

In [None]:
# Relative frequency of each gender code (value_counts ignores missing values by default)
clients["ClientGender"].value_counts(normalize=True).head(10)

Unnamed: 0_level_0,proportion
ClientGender,Unnamed: 1_level_1
F,0.659805
M,0.336442
C,0.002602
N,0.000595
U,0.000556


In [None]:
# Gender codes treated as non-standard; transactions of such clients are removed
special = {"C","N","U"}

# IDs of clients whose gender is not a special code (clients with missing gender are kept)
has_regular_gender = ~clients["ClientGender"].isin(special)
normal_ids = set(clients.loc[has_regular_gender, "ClientID"].astype(str))

# Restrict transactions to those clients; IDs are compared as strings
transactions = transactions.assign(ClientID=transactions["ClientID"].astype(str))
transactions = transactions.loc[transactions["ClientID"].isin(normal_ids)].copy()

In [None]:
# Make missing gender an explicit 'Unknown' category instead of NaN
clients['ClientGender'] = clients['ClientGender'].fillna('Unknown')

In [None]:
# 1 when Age is missing, 0 otherwise; recorded before any imputation so the signal is not lost
clients['age_missing'] = clients['Age'].isna().astype(int)

In [None]:
# Median age per (ClientSegment, ClientCountry) group; a group whose ages are all missing gets NaN
group_median = clients.groupby(
    ['ClientSegment','ClientCountry']
)['Age'].median()

# Overall median age, used as the fallback when the group median is unavailable
global_median = clients['Age'].median()

In [None]:
def fill_age(row):
    """Return the age to use for one client row.

    A known age is returned unchanged. A missing age is replaced by the
    median of the client's (ClientSegment, ClientCountry) group, read from the
    module-level ``group_median``; when that group is unknown or its median is
    itself missing, the module-level ``global_median`` is used.
    """
    age = row['Age']
    if not pd.isna(age):
        return age

    segment_country = (row['ClientSegment'], row['ClientCountry'])
    candidate = group_median.get(segment_country, global_median)
    return candidate if not pd.isna(candidate) else global_median

# Impute missing ages without a Python-level loop over every client row:
# first with the median of the client's (ClientSegment, ClientCountry) group,
# then with the global median where the group median is unavailable.
# This yields the same values as applying fill_age row by row.
clients['Age_filled'] = (
    clients['Age']
    .fillna(clients.groupby(['ClientSegment', 'ClientCountry'])['Age'].transform('median'))
    .fillna(global_median)
)

In [None]:
basic_check(clients, "clients")

===== clients =====
Shape: (424037, 9)
Missing values:
ClientID                 0
ClientSegment            0
ClientCountry            0
ClientOptINEmail         0
ClientOptINPhone         0
ClientGender             0
Age                 304075
age_missing              0
Age_filled               0
dtype: int64
Duplicates: 0



In [None]:
# Parse the transaction dates into pandas datetimes so they can be ordered and compared
transactions['SaleTransactionDate'] = pd.to_datetime(
    transactions['SaleTransactionDate']
)

In [None]:
# Use the 80th percentile of transaction dates as the cutoff for time-based data splitting
# (the quantile is taken over transaction rows, not over distinct calendar days)
split_date = transactions['SaleTransactionDate'].quantile(0.8)
print(split_date)

2024-11-26 00:00:00+00:00


In [None]:
# Chronological split: rows up to and including split_date train, strictly later rows test
sale_dates = transactions['SaleTransactionDate']
train = transactions.loc[sale_dates.le(split_date)]
test = transactions.loc[sale_dates.gt(split_date)]

In [None]:
# Evaluate only on clients that also appear in train; clients without training
# history are removed from the test split
train_users = set(train['ClientID'])
test = test[test['ClientID'].isin(train_users)]

In [None]:
# Merge store information into the transactions table
# NOTE(review): train/test were sliced before this merge, so they do not carry the
# store columns added here. The cell is also not idempotent: running it twice merges
# the store columns a second time and pandas will suffix the overlapping names.
transactions = transactions.merge(
    stores,
    on="StoreID",
    how="left"
)

In [None]:
# Build the master dataset by merging client, product, store,
# and stock information into the training data
# NOTE(review): both train and stocks have a "Quantity" column that is not a join key,
# so the last merge produces suffixed columns (pandas default: Quantity_x = sold quantity,
# Quantity_y = stock quantity). Consider explicit suffixes or renaming the stock column.
master = train.merge(
    clients,
    on="ClientID",
    how="left"
).merge(
    products,
    on="ProductID",
    how="left"
).merge(
    stores,
    on="StoreID",
    how="left"
).merge(
    stocks,
    on=["ProductID","StoreCountry"],
    how="left"
)


In [None]:
# Persist the cleaned splits and the enriched training table for the later stages
train.to_csv("train_clean.csv", index=False)
test.to_csv("test_clean.csv", index=False)
master.to_csv("master_train.csv", index=False)

# **Second Step: Recall**

## ItemCF

In [None]:
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.preprocessing import normalize

# =========================
# ItemCF: build (1) user->items (2) item->topK similar
# Input: train DataFrame with columns: ClientID, ProductID
# Optional columns: Quantity, SalesNetAmountEuro
# =========================

def build_user_items(train: pd.DataFrame,
                     user_col: str = "ClientID",
                     item_col: str = "ProductID",
                     weight_col: str | None = "Quantity",
                     date_col: str | None = None,
                     split_date: pd.Timestamp | None = None,
                     lambda_: float = 0.0,
                     implicit: bool = True):
    """
    Returns:
      user_items: dict[user] -> dict[item] -> weight
      user2idx, item2idx, idx2user, idx2item
      ui_sparse: normalized item vectors (items x users) if needed later
    """
    date_col = "SaleTransactionDate"

    df = train[
        [user_col, item_col]
        + ([weight_col] if (weight_col and weight_col in train.columns) else [])
        + ([date_col] if date_col in train.columns else [])    ].copy()

        # Ensure string IDs (avoids join/key issues)
    df[user_col] = df[user_col].astype(str)
    df[item_col] = df[item_col].astype(str)

    # Build interaction weights
    # Modify the weight computation part inside build_user_item

    if weight_col and weight_col in df.columns:
      w = df[weight_col].fillna(1.0).astype(float)
      if implicit:
          w = np.log1p(w)

      # Time decay weighting
      df["days_diff"] = (split_date - df["SaleTransactionDate"]).dt.days
      lambda_ = 0.01
      time_weight = np.exp(-lambda_ * df["days_diff"])

      w = w * time_weight

      df["_w"] = w

    # Aggregate duplicates (same user-item multiple transactions)
    df = df.groupby([user_col, item_col], as_index=False)["_w"].sum()

    # Index mapping
    users = df[user_col].unique()
    items = df[item_col].unique()
    user2idx = {u: i for i, u in enumerate(users)}
    item2idx = {it: i for i, it in enumerate(items)}
    idx2user = {i: u for u, i in user2idx.items()}
    idx2item = {i: it for it, i in item2idx.items()}

    # Sparse matrix (users x items) => we'll use (items x users) for item-item sim
    row_u = df[user_col].map(user2idx).to_numpy()
    col_i = df[item_col].map(item2idx).to_numpy()
    data = df["_w"].to_numpy()

    # User-Item matrix: U x I
    UI = coo_matrix((data, (row_u, col_i)), shape=(len(users), len(items))).tocsr()

    # Build user_items dict (hash table #1)
    user_items = defaultdict(dict)
    # CSR iteration
    UI_csr = UI.tocsr()
    for u_idx in range(UI_csr.shape[0]):
        start, end = UI_csr.indptr[u_idx], UI_csr.indptr[u_idx + 1]
        item_idxs = UI_csr.indices[start:end]
        weights = UI_csr.data[start:end]
        u = idx2user[u_idx]
        for it_idx, wgt in zip(item_idxs, weights):
            user_items[u][idx2item[it_idx]] = float(wgt)

    return user_items, user2idx, item2idx, idx2user, idx2item, UI


def build_item_sim_topk(UI,
                        idx2item: dict[int, str],
                        topk: int = 50,
                        shrink: float = 0.0):
    """Compute, for every item, its ``topk`` most similar items.

    Similarity is the cosine between the items' weighted user vectors (columns
    of ``UI``). When ``shrink > 0`` each similarity is multiplied by
    ``co_count / (co_count + shrink)``, where ``co_count`` is the number of
    users who interacted with both items.

    Args:
        UI: Sparse matrix of shape (users, items).
        idx2item: Maps item column index to item ID.
        topk: Maximum number of neighbours kept per item.
        shrink: Shrinkage constant; 0 disables the adjustment.

    Returns:
        dict[item] -> list[(similar_item, score)] sorted by descending score.
        Items without any neighbour map to an empty list.

    NOTE: the full item-item product is materialized, which can be heavy for
    large catalogues.
    """
    item_by_user = UI.T.tocsr()

    # Unit-length rows turn the dot product into cosine similarity
    unit_rows = normalize(item_by_user, norm="l2", axis=1, copy=True)
    sim_matrix = (unit_rows @ unit_rows.T).tocsr()

    # Binary co-occurrence counts are only needed for shrinkage
    co_counts = None
    if shrink > 0:
        binary = item_by_user.copy()
        binary.data = np.ones_like(binary.data)
        co_counts = (binary @ binary.T).tocsr()

    item_sim = {}
    for row in range(sim_matrix.shape[0]):
        lo, hi = sim_matrix.indptr[row], sim_matrix.indptr[row + 1]
        neighbours = sim_matrix.indices[lo:hi]
        scores = sim_matrix.data[lo:hi].astype(float)

        # An item is never its own neighbour
        not_self = neighbours != row
        neighbours, scores = neighbours[not_self], scores[not_self]

        if co_counts is not None:
            c_lo, c_hi = co_counts.indptr[row], co_counts.indptr[row + 1]
            count_of = dict(zip(co_counts.indices[c_lo:c_hi],
                                co_counts.data[c_lo:c_hi].astype(float)))
            shrunk = []
            for neighbour, score in zip(neighbours, scores):
                n_common = count_of.get(neighbour, 0.0)
                shrunk.append(score * (n_common / (n_common + shrink)))
            scores = np.array(shrunk, dtype=float)

        if scores.size == 0:
            item_sim[idx2item[row]] = []
            continue

        # Cheap pre-selection of the topk candidates before the exact sort
        if scores.size > topk:
            shortlist = np.argpartition(-scores, topk - 1)[:topk]
            neighbours, scores = neighbours[shortlist], scores[shortlist]

        ranking = np.argsort(-scores)
        item_sim[idx2item[row]] = [
            (idx2item[j], float(s))
            for j, s in zip(neighbours[ranking], scores[ranking])
        ]

    return item_sim


def recommend_itemcf(user_id: str,
                     user_items: dict,
                     item_sim: dict,
                     topn: int = 10,
                     exclude_seen: bool = True,
                     stock_ok_items: set[str] | None = None):
    """
    Recommend TopN items for a user using precomputed item_sim.
    stock_ok_items: optional set of ProductIDs that are in stock (filter candidates).
    """
    user_id = str(user_id)
    history = user_items.get(user_id, {})
    if not history:
        return []  # cold user: handle separately with popularity recall

    scores = defaultdict(float)

    for it, w in history.items():
        for sim_it, sim in item_sim.get(it, []):
            if exclude_seen and sim_it in history:
                continue
            if stock_ok_items is not None and sim_it not in stock_ok_items:
                continue
            scores[sim_it] += sim * w

    if not scores:
        return []

    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topn]
    # return list of dicts for easy DataFrame creation
    return [{"ClientID": user_id, "ProductID": pid, "Score": sc, "Rank": r+1}
            for r, (pid, sc) in enumerate(ranked)]


# =========================
# Example usage
# =========================
# 1) Build user_items + sparse UI
# user_items, user2idx, item2idx, idx2user, idx2item, UI = build_user_items(train, weight_col="Quantity")

# 2) Build item->topK similar
# item_sim = build_item_sim_topk(UI, idx2item, topk=50, shrink=10.0)

# 3) Optional stock filter (if you have stocks by country, you might build stock_ok_items per user-country)
# stock_ok_items = set(stocks[stocks["Quantity"] > 0]["ProductID"].astype(str))

# 4) Recommend for one user
# recs = recommend_itemcf("123", user_items, item_sim, topn=10, stock_ok_items=stock_ok_items)
# pd.DataFrame(recs)

# 5) Recommend for many users (e.g., all users in test)
def batch_recommend_itemcf(users,
                           user_items,
                           item_sim,
                           topn: int = 10,
                           stock_ok_items: set[str] | None = None):
    out = []
    for u in users:
        out.extend(recommend_itemcf(u, user_items, item_sim, topn=topn,
                                    exclude_seen=True, stock_ok_items=stock_ok_items))
    return pd.DataFrame(out)


In [None]:
# Build the user -> item weight lookup and the sparse user x item matrix from the
# training transactions (log-scaled quantities, decayed by age relative to split_date)
user_items, user2idx, item2idx, idx2user, idx2item, UI = build_user_items(
    train,
    user_col="ClientID",
    item_col="ProductID",
    weight_col="Quantity",
    date_col="SaleTransactionDate",
    split_date=split_date,
    lambda_=0.01,
    implicit=True
)

In [None]:
# A cell only renders its last expression, so return all three sizes as one tuple:
# (number of users with history, number of items, shape of the user x item matrix)
len(user_items), len(item2idx), UI.shape

(233201, 27733)

In [None]:
# Keep the 50 most similar items per item; shrink=10 damps similarities that rest on few shared users
item_sim = build_item_sim_topk(
    UI,
    idx2item=idx2item,
    topk=50,
    shrink=10.0
)

In [None]:
len(item_sim)

27733

In [None]:
# Recall is generated for every client that appears in the training transactions
recall_users = train["ClientID"].astype(str).unique()
len(recall_users)

233201

In [None]:
# Up to 100 ItemCF candidates per user; no stock filter is applied at this stage
itemcf_recall_df = batch_recommend_itemcf(
    users=recall_users,
    user_items=user_items,
    item_sim=item_sim,
    topn=100,
    stock_ok_items=None
)

In [None]:
itemcf_recall_df.head()

Unnamed: 0,ClientID,ProductID,Score,Rank
0,8119209481417068505,2574451597753930800,0.005512,1
1,8119209481417068505,2241128212929294900,0.004892,2
2,8119209481417068505,1547948263425675420,0.004724,3
3,8119209481417068505,4679812812492781009,0.004488,4
4,8119209481417068505,6626373566554299428,0.004214,5


In [None]:
itemcf_recall_df.shape

(17181578, 4)

In [None]:
# Tag every row with its recall strategy so the sources can be told apart after merging
itemcf_recall_df["RecallSource"] = "ItemCF"

In [None]:
# Persist the ItemCF candidates; the merge stage reads this file back in chunks
itemcf_recall_df.to_csv(
    "itemcf_recall.csv",
    index=False
)

## UserCF

In [None]:
def build_user_sim_topk(UI,
                        idx2user: dict[int, str],
                        topk: int = 50,
                        shrink: float = 0.0):
    """Compute, for every user, the ``topk`` most similar users.

    Similarity is the cosine between the users' weighted item vectors (rows of
    ``UI``). When ``shrink > 0`` each similarity is multiplied by
    ``co_count / (co_count + shrink)``, where ``co_count`` is the number of
    items both users interacted with.

    Args:
        UI: Sparse matrix of shape (users, items).
        idx2user: Maps user row index to user ID.
        topk: Maximum number of neighbours kept per user.
        shrink: Shrinkage constant; 0 disables the adjustment.

    Returns:
        dict[user] -> list[(similar_user, score)] sorted by descending score.
        Users without any neighbour map to an empty list.

    NOTE: the full user-user product is materialized, which can be heavy when
    there are many users.
    """
    # Unit-length rows turn the dot product into cosine similarity
    unit_rows = normalize(UI, norm="l2", axis=1, copy=True)
    sim_matrix = (unit_rows @ unit_rows.T).tocsr()

    # Binary co-occurrence counts are only needed for shrinkage
    shared_counts = None
    if shrink > 0:
        binary = UI.copy()
        binary.data = np.ones_like(binary.data)
        shared_counts = (binary @ binary.T).tocsr()

    user_sim = {}
    for row in range(sim_matrix.shape[0]):
        lo, hi = sim_matrix.indptr[row], sim_matrix.indptr[row + 1]
        peers = sim_matrix.indices[lo:hi]
        scores = sim_matrix.data[lo:hi].astype(float)

        # A user is never their own neighbour
        others = peers != row
        peers, scores = peers[others], scores[others]

        if shared_counts is not None:
            c_lo, c_hi = shared_counts.indptr[row], shared_counts.indptr[row + 1]
            count_of = dict(zip(shared_counts.indices[c_lo:c_hi],
                                shared_counts.data[c_lo:c_hi].astype(float)))
            damped = []
            for peer, score in zip(peers, scores):
                n_shared = count_of.get(peer, 0.0)
                damped.append(score * (n_shared / (n_shared + shrink)))
            scores = np.array(damped, dtype=float)

        if scores.size == 0:
            user_sim[idx2user[row]] = []
            continue

        # Cheap pre-selection of the topk candidates before the exact sort
        if scores.size > topk:
            shortlist = np.argpartition(-scores, topk - 1)[:topk]
            peers, scores = peers[shortlist], scores[shortlist]

        ranking = np.argsort(-scores)
        user_sim[idx2user[row]] = [
            (idx2user[v], float(s))
            for v, s in zip(peers[ranking], scores[ranking])
        ]

    return user_sim


In [None]:
def usercf_recall_for_user(user_id: str,
                           user_items: dict,
                           user_sim: dict,
                           topk: int = 100,
                           exclude_seen: bool = True):
    """Score candidate items for one user from the items of similar users.

    Every item of every similar user receives ``similarity * item_weight``;
    contributions for the same item are summed.

    Args:
        user_id: User to recommend for. Cast to str, because the keys of
            ``user_items`` / ``user_sim`` are string IDs.
        user_items: dict[user] -> dict[item] -> weight.
        user_sim: dict[user] -> list[(similar_user, similarity)].
        topk: Number of recommendations returned.
        exclude_seen: Skip items already in the user's own history.

    Returns:
        List of dicts with keys ClientID, ProductID, Score, Rank (1-based),
        best first. Empty for users without history or without candidates.
    """
    user_id = str(user_id)
    history = user_items.get(user_id, {})
    if not history:
        return []

    scores = defaultdict(float)

    # Iterate over the similar users and accumulate votes for their items
    for sim_user, sim_score in user_sim.get(user_id, []):
        for item, w in user_items.get(sim_user, {}).items():
            if exclude_seen and item in history:
                continue
            scores[item] += sim_score * w

    if not scores:
        return []

    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topk]

    return [
        {"ClientID": user_id, "ProductID": item, "Score": score, "Rank": rank}
        for rank, (item, score) in enumerate(ranked, start=1)
    ]


In [None]:
def batch_recommend_usercf(users,
                           user_items,
                           user_sim,
                           topk: int = 100):
    """Run ``usercf_recall_for_user`` for every user and stack the results.

    User IDs are cast to str before the lookup. A 1-based ``Rank`` column is
    (re)computed per ClientID from the scores, highest first, ties broken by
    row order. Returns an empty DataFrame when nothing was recalled.
    """
    records = [
        rec
        for user in users
        for rec in usercf_recall_for_user(
            user_id=str(user),
            user_items=user_items,
            user_sim=user_sim,
            topk=topk
        )
    ]
    result = pd.DataFrame(records)
    if result.empty:
        return result

    scores_by_user = result.groupby("ClientID")["Score"]
    result["Rank"] = scores_by_user.rank(method="first", ascending=False).astype(int)
    return result


In [None]:
# Keep the 50 most similar users per user; shrink=10 damps similarities that rest on few shared items
user_sim = build_user_sim_topk(
    UI,
    idx2user=idx2user,
    topk=50,
    shrink=10.0
)

In [None]:
len(user_sim)

233201

In [None]:
# Same user population as for ItemCF: every client present in train
recall_users = train["ClientID"].astype(str).unique()

# Up to 100 UserCF candidates per user
usercf_recall_df = batch_recommend_usercf(
    users=recall_users,
    user_items=user_items,
    user_sim=user_sim,
    topk=100
)

In [None]:
usercf_recall_df.head()

Unnamed: 0,ClientID,ProductID,Score,Rank
0,8119209481417068505,2937449090161962334,0.120108,1
1,8119209481417068505,7768912807044871351,0.109089,2
2,8119209481417068505,9080873247010611020,0.107589,3
3,8119209481417068505,7426345339049544076,0.082164,4
4,8119209481417068505,6680614054888581139,0.079093,5


In [None]:
usercf_recall_df.shape

(10352831, 4)

In [None]:
# Tag the rows with their recall strategy and persist them for the merge stage
usercf_recall_df["RecallSource"] = "UserCF"
usercf_recall_df.to_csv("usercf_recall.csv", index=False)

## Country recall

In [None]:
def build_country_popularity(train: pd.DataFrame,
                             stores: pd.DataFrame,
                             stocks: pd.DataFrame,
                             country_col_in_stocks: str = "StoreCountry",
                             user_col: str = "ClientID",
                             item_col: str = "ProductID",
                             store_col: str = "StoreID",
                             amount_col: str = "SalesNetAmountEuro",
                             qty_col: str = "Quantity"):
    """Compute per-country item popularity and the per-country in-stock item list.

    Popularity is the summed ``amount_col`` per (country, item) when that column
    exists, otherwise the summed ``qty_col``, otherwise the number of
    transactions. The country of a transaction is the country of its store.

    Returns:
        pop_df: DataFrame with columns [country, item, popularity].
        stock_ok: DataFrame with columns [country, item] for every pair whose
            total stock "Quantity" is strictly positive.
    """
    tx = train.copy()
    for id_col in (user_col, item_col, store_col):
        tx[id_col] = tx[id_col].astype(str)

    store_lookup = stores.copy()
    store_lookup[store_col] = store_lookup[store_col].astype(str)

    # Attach the store's country to every transaction
    tx = tx.merge(store_lookup[[store_col, country_col_in_stocks]], on=store_col, how="left")

    # Popularity measure, by priority: revenue > quantity > number of transactions
    per_country_item = tx.groupby([country_col_in_stocks, item_col], as_index=False)
    measure = next((col for col in (amount_col, qty_col) if col in tx.columns), None)
    if measure is None:
        pop_df = per_country_item.size().rename(columns={"size": "popularity"})
    else:
        pop_df = per_country_item[measure].sum().rename(columns={measure: "popularity"})

    # Country-level stock availability
    stock = stocks.copy()
    stock[country_col_in_stocks] = stock[country_col_in_stocks].astype(str)
    stock[item_col] = stock[item_col].astype(str)

    stock_totals = stock.groupby([country_col_in_stocks, item_col], as_index=False)["Quantity"].sum()
    available = stock_totals["Quantity"] > 0
    stock_ok = stock_totals.loc[available, [country_col_in_stocks, item_col]]

    return pop_df, stock_ok


In [None]:
def build_country_top_items(pop_df: pd.DataFrame,
                            stock_ok: pd.DataFrame,
                            country_col: str = "StoreCountry",
                            item_col: str = "ProductID",
                            topn: int = 200):
    """Select the ``topn`` most popular in-stock items of every country.

    Args:
        pop_df: Columns [country_col, item_col, "popularity"].
        stock_ok: Columns [country_col, item_col]; only these pairs are eligible.
        country_col, item_col: Column names shared by both frames.
        topn: Maximum number of items kept per country.

    Returns:
        country_top: dict[country] -> list[(ProductID, popularity)], most
        popular first. Empty dict when no popular item is in stock.
    """
    # Only items that are both sold and currently in stock in the country
    in_stock = pop_df.merge(stock_ok, on=[country_col, item_col], how="inner")

    # Most popular first within each country, then keep the first topn rows per country
    ranked = in_stock.sort_values([country_col, "popularity"], ascending=[True, False])
    top_rows = ranked.groupby(country_col).head(topn)

    # Plain iteration over the groups avoids groupby.apply, which warns about
    # operating on the grouping columns in recent pandas versions
    country_top = {}
    for country, rows in top_rows.groupby(country_col):
        country_top[country] = list(
            zip(rows[item_col].astype(str), rows["popularity"].astype(float))
        )
    return country_top

In [None]:
def build_user_country(train: pd.DataFrame,
                       clients: pd.DataFrame,
                       stores: pd.DataFrame,
                       user_col: str = "ClientID",
                       store_col: str = "StoreID",
                       client_country_col: str = "ClientCountry",
                       store_country_col: str = "StoreCountry",
                       date_col: str = "SaleTransactionDate"):
    """Resolve one country per user.

    Priority: the country declared in ``clients``; fallback: the country of the
    store of the user's most recent purchase in ``train``. Users that appear in
    ``train`` but not in ``clients`` are included through the fallback.

    Args:
        train: Transactions with ``user_col`` and ``store_col``; when ``date_col``
            is present it defines what "most recent" means, otherwise the row
            order of ``train`` is used.
        clients: Client table with ``user_col`` and ``client_country_col``.
        stores: Store table with ``store_col`` and ``store_country_col``.

    Returns:
        Series indexed by user ID (str) holding the resolved country; the value
        is missing when neither source knows the user's country.
    """
    # Declared country per client (first row wins if a client is listed twice)
    declared_rows = clients[[user_col, client_country_col]].copy()
    declared_rows[user_col] = declared_rows[user_col].astype(str)
    declared_rows = declared_rows.drop_duplicates(subset=user_col, keep="first")
    declared = declared_rows.set_index(user_col)[client_country_col]

    # Missing values and literal "nan" strings both count as "no declared country"
    is_valid = declared.notna() & (declared.astype(str) != "nan")
    declared = declared.astype(str).where(is_valid)

    # Only the needed columns are taken from train, so a store-country column that
    # train may already carry cannot collide with the one merged in below
    has_dates = date_col in train.columns
    purchase_cols = [user_col, store_col] + ([date_col] if has_dates else [])
    purchases = train[purchase_cols].copy()
    purchases[user_col] = purchases[user_col].astype(str)
    purchases[store_col] = purchases[store_col].astype(str)

    store_lookup = stores[[store_col, store_country_col]].copy()
    store_lookup[store_col] = store_lookup[store_col].astype(str)
    purchases = purchases.merge(store_lookup, on=store_col, how="left")

    # Sort chronologically so that .last() really is the latest purchase
    if has_dates:
        purchases = purchases.sort_values(date_col, kind="stable")
    last_country = purchases.groupby(user_col)[store_country_col].last()

    # Declared country first, purchase country where it is missing
    user_country = declared.fillna(last_country)

    # Users known only from their purchases
    extra_users = last_country.index.difference(declared.index)
    if len(extra_users) > 0:
        user_country = pd.concat([user_country, last_country.loc[extra_users]])

    user_country.name = client_country_col
    user_country.index.name = user_col
    return user_country

In [None]:
def batch_store_recall(users,
                       user_country: pd.Series,
                       country_top: dict,
                       topn: int = 100,
                       recall_source: str = "StoreCountryPopular"):
    """Recommend each user the most popular items of their country.

    Args:
        users: Iterable of user IDs (cast to str).
        user_country: Series mapping user ID -> country.
        country_top: dict[country] -> list[(ProductID, popularity)], best first.
        topn: Number of items taken from the country list.
        recall_source: Value written to the RecallSource column.

    Returns:
        DataFrame with columns ClientID, ProductID, Score, Rank, RecallSource.
        Users without a known country, or whose country has no list, are skipped.
    """
    records = []
    for raw_user in users:
        user = str(raw_user)
        country = user_country.get(user, None)
        # Unknown user or missing country value
        if country is None or str(country) == "nan":
            continue

        top_items = country_top.get(str(country), [])[:topn]
        records.extend(
            {
                "ClientID": user,
                "ProductID": str(pid),
                "Score": float(score),
                "Rank": rank,
                "RecallSource": recall_source
            }
            for rank, (pid, score) in enumerate(top_items, start=1)
        )

    return pd.DataFrame(records)


In [None]:
# Popularity is measured on the training period only; stock availability comes from the stocks table
pop_df, stock_ok = build_country_popularity(train=train, stores=stores, stocks=stocks)

In [None]:
# Keep a pool of 200 in-stock best sellers per country (more than the 100 served per user below)
country_top = build_country_top_items(pop_df, stock_ok, topn=200)

  .apply(lambda x: list(zip(x[item_col].astype(str), x["popularity"].astype(float))))


In [None]:
# One country per client: declared client country, else country of a purchase store
user_country = build_user_country(train=train, clients=clients, stores=stores)

In [None]:
# Same user population as for the collaborative-filtering recalls
recall_users = train["ClientID"].astype(str).unique()

In [None]:
# Every user receives the 100 most popular in-stock items of their country;
# Score is the raw country-level popularity, so it is not comparable with CF scores
store_recall_df = batch_store_recall(
    users=recall_users,
    user_country=user_country,
    country_top=country_top,
    topn=100,
    recall_source="StoreRecall_CountryPopular"
)

store_recall_df.head(), store_recall_df.shape

(              ClientID            ProductID     Score  Rank  \
 0  8119209481417068505  5626151450577313519  61519.67     1   
 1  8119209481417068505  8085696973862495176  17185.69     2   
 2  8119209481417068505  2061358648378984723  15732.20     3   
 3  8119209481417068505  6276160016396371863  11474.67     4   
 4  8119209481417068505  4228376913636946446  10226.08     5   
 
                  RecallSource  
 0  StoreRecall_CountryPopular  
 1  StoreRecall_CountryPopular  
 2  StoreRecall_CountryPopular  
 3  StoreRecall_CountryPopular  
 4  StoreRecall_CountryPopular  ,
 (21930433, 5))

In [None]:
# Persist the country-popularity candidates for the merge stage
store_recall_df.to_csv("store_recall.csv", index=False)

## Deduplicate and merge results from three recall strategies

In [None]:
# Users to evaluate on: the distinct clients of the saved test split.
# NOTE(review): this rebinds `test` to a one-column frame read from disk, replacing the
# in-memory test split built during cleaning; later cells that use `test` see only ClientID.
test = pd.read_csv("test_clean.csv", usecols=["ClientID"])
test_users = set(test["ClientID"].astype(str).unique())
len(test_users)

43912

In [None]:
import csv
from collections import defaultdict
import heapq

def cap_recall_csv_stream(
    in_path: str,
    out_path: str,
    test_users: set,
    default_source: str,
    topk_per_user: int = 200,
    chunksize: int = 2_000_000,
):
    """Reduce a large recall CSV to the best ``topk_per_user`` rows of each test user.

    The input is read in chunks of ``chunksize`` rows, so it never has to fit in
    memory. Expected columns: ClientID, ProductID, Score; RecallSource is
    optional and defaults to ``default_source``. Unparseable scores count as 0.

    The output CSV has the columns ClientID, ProductID, Score, Rank, RecallSource,
    with the rows of each user in descending score order and Rank starting at 1.
    A one-line summary is printed when the file has been written.
    """
    wanted_columns = {"ClientID","ProductID","Score","RecallSource"}

    # Per-user min-heap of (score, product_id, source): the root is the weakest kept row
    best_by_user = defaultdict(list)

    chunks = pd.read_csv(
        in_path,
        usecols=lambda c: c in wanted_columns,
        chunksize=chunksize
    )

    for chunk in chunks:
        if "RecallSource" not in chunk.columns:
            chunk["RecallSource"] = default_source

        chunk = chunk.astype({"ClientID": str, "ProductID": str, "RecallSource": str})
        chunk["Score"] = pd.to_numeric(chunk["Score"], errors="coerce").fillna(0.0).astype("float32")

        # Only test users are of interest
        chunk = chunk[chunk["ClientID"].isin(test_users)]
        if chunk.empty:
            continue

        for user, pid, score, src in zip(chunk["ClientID"], chunk["ProductID"],
                                         chunk["Score"], chunk["RecallSource"]):
            heap = best_by_user[user]
            entry = (float(score), pid, src)
            if len(heap) < topk_per_user:
                heapq.heappush(heap, entry)
            elif entry[0] > heap[0][0]:
                # Strictly better than the weakest kept row: swap it in
                heapq.heapreplace(heap, entry)

    with open(out_path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(["ClientID","ProductID","Score","Rank","RecallSource"])
        for user, heap in best_by_user.items():
            best_first = sorted(heap, key=lambda entry: entry[0], reverse=True)
            writer.writerows(
                [user, pid, score, rank, src]
                for rank, (score, pid, src) in enumerate(best_first, start=1)
            )

    print(f"wrote {out_path} | users={len(best_by_user)}")


In [None]:
# Cap each recall file to at most 200 rows per test user.
# Input is "<name>.csv", output "<name>_capped.csv"; the source label is only
# used when the file has no RecallSource column.
for recall_name, source_label in [
    ("itemcf_recall", "ItemCF"),
    ("usercf_recall", "UserCF"),
    ("store_recall", "StoreRecall_CountryPopular"),
]:
    cap_recall_csv_stream(
        in_path=f"{recall_name}.csv",
        out_path=f"{recall_name}_capped.csv",
        test_users=test_users,
        default_source=source_label,
        topk_per_user=200
    )

✅ wrote itemcf_recall_capped.csv | users=43884
✅ wrote usercf_recall_capped.csv | users=33269
✅ wrote store_recall_capped.csv | users=43429


In [None]:
# Blend weight per recall source, applied to the per-source normalized scores.
# The popularity recall is weighted below the two collaborative-filtering recalls.
# Sources missing from this dict fall back to 1.0 in merge_capped.
weights = {
    "ItemCF": 1.0,
    "UserCF": 1.0,
    "StoreRecall_CountryPopular": 0.6,
    "StoreRecall": 0.6,
}

In [None]:
def merge_capped(files, weights, topm=300, normalize_per_source=True):
    """Blend several capped recall files into one ranked candidate list per user.

    Args:
        files: Paths of CSVs with columns ClientID, ProductID, Score, RecallSource.
        weights: dict[RecallSource] -> blend weight; unknown sources weigh 1.0.
        topm: Maximum number of candidates kept per user.
        normalize_per_source: When True, scores are divided by the user's best
            score within the same source (non-positive maxima give 0), which
            makes sources with different score scales comparable.

    Returns:
        DataFrame with one row per (ClientID, ProductID): FinalScore (sum of the
        weighted scores over sources), NumSources, HitSources ("|"-joined sorted
        source names) and Rank (1-based, by descending FinalScore per user).
    """
    def _weighted_scores(path):
        # Load one recall file and reduce it to its weighted, normalized score
        part = pd.read_csv(path, usecols=["ClientID","ProductID","Score","RecallSource"])
        part = part.astype({"ClientID": str, "ProductID": str, "RecallSource": str})
        part["Score"] = pd.to_numeric(part["Score"], errors="coerce").fillna(0.0).astype("float32")

        if normalize_per_source:
            best = part.groupby(["ClientID","RecallSource"])["Score"].transform("max")
            part["nScore"] = np.where(best>0, part["Score"]/best, 0).astype("float32")
        else:
            part["nScore"] = part["Score"].astype("float32")

        part["w"] = part["RecallSource"].map(lambda s: weights.get(s, 1.0)).astype("float32")
        part["wScore"] = (part["nScore"] * part["w"]).astype("float32")
        return part[["ClientID","ProductID","RecallSource","wScore"]]

    stacked = pd.concat([_weighted_scores(fp) for fp in files], ignore_index=True)

    merged = stacked.groupby(["ClientID","ProductID"], as_index=False).agg(
        FinalScore=("wScore","sum"),
        NumSources=("RecallSource","nunique"),
        HitSources=("RecallSource", lambda x: "|".join(sorted(set(x)))),
    )

    merged = merged.sort_values(["ClientID","FinalScore"], ascending=[True, False])
    merged["Rank"] = merged.groupby("ClientID").cumcount() + 1
    return merged.loc[merged["Rank"] <= topm].reset_index(drop=True)


In [None]:
# Blend the three capped recall files and keep at most 300 candidates per user
candidates_df = merge_capped(
    files=["itemcf_recall_capped.csv","usercf_recall_capped.csv","store_recall_capped.csv"],
    weights=weights,
    topm=300,
    normalize_per_source=True
)

In [None]:
candidates_df.head(), candidates_df.shape

(              ClientID            ProductID  FinalScore  NumSources  \
 0  1000031093718265133  7885337332154947100    1.002546           2   
 1  1000031093718265133  5570841445184837742    1.000000           1   
 2  1000031093718265133  4264340979150525540    0.995175           1   
 3  1000031093718265133   290464850256493232    0.798072           1   
 4  1000031093718265133  3217339771092922763    0.698328           1   
 
       HitSources  Rank  
 0  ItemCF|UserCF     1  
 1         ItemCF     2  
 2         ItemCF     3  
 3         ItemCF     4  
 4         ItemCF     5  ,
 (9134067, 6))

In [None]:
# Persist the merged candidate set that feeds the ranking stage
candidates_df.to_csv("candidates_merged.csv", index=False)

# Fine-ranking stage



In [None]:
# Collect the ground-truth transactions used for labelling: purchases made
# strictly after split_date by users of the test split.
# The comparison must be strict: train contains every transaction with
# date <= split_date, so ">=" would count split-day purchases both as training
# signal and as labels (leakage).
test_users = test.copy()
test_users["ClientID"] = test_users["ClientID"].astype(str)
eligible_users = set(test_users["ClientID"].unique())
test_tx = transactions[
    (transactions["SaleTransactionDate"] > split_date) &
    (transactions["ClientID"].isin(eligible_users))
].copy()

In [None]:
# test: user-item pairs that were actually purchased
# Assign label = 1 for these samples
# (one row per distinct pair, however many times it was bought)
test_ui = (
    test_tx[["ClientID", "ProductID"]]
    .drop_duplicates()
)
test_ui["label"] = 1

# Working copy of the candidates that the feature columns are added to;
# test_ui is not joined onto it in this cell
candidates_labeled = candidates_df.copy()

In [None]:
# All possible recall sources
all_sources = [
    "ItemCF",
    "UserCF",
    "StoreRecall_CountryPopular"
]

# One-hot feature per recall source.
# HitSources is a "|"-joined list of source names, so it is wrapped in delimiters
# and matched literally: a source only counts when its full name is present
# (a plain substring / regex match would let "StoreRecall" hit
# "StoreRecall_CountryPopular").
delimited_sources = "|" + candidates_labeled["HitSources"] + "|"
for src in all_sources:
    candidates_labeled[f"src_{src}"] = (
        delimited_sources.str.contains(f"|{src}|", regex=False).astype(int)
    )

# Rank-based feature: the better (smaller) the merged rank, the larger the value
candidates_labeled["inv_rank"] = 1.0 / (candidates_labeled["Rank"] + 1)

In [None]:
# Derive a unit price for every transaction line (net amount divided by quantity)
transactions = transactions.drop_duplicates()
transactions['unit_price'] = transactions['SalesNetAmountEuro'].div(transactions['Quantity'])

# Representative price of a product: the median unit price over its transactions
product_ref_price = (
    transactions.groupby('ProductID', as_index=False)['unit_price'].median()
)

def build_user_price_profile(train: pd.DataFrame,
                             user_col: str = "ClientID",
                             amount_col: str = "SalesNetAmountEuro"):
    """
    Summarise each user's spending as the mean and standard deviation of ``amount_col``.

    Args:
        train: Transaction rows containing ``user_col`` and ``amount_col``.
        user_col: Column identifying the user; its values are cast to ``str``.
        amount_col: Numeric column to aggregate per user.

    Returns: dict[user] -> (mean_price, std_price)
        ``std_price`` is 0 for users with a single row. An empty ``train``
        yields an empty dict.
    """
    if len(train) == 0:
        return {}

    # Group on a string view of the user column instead of copying the whole frame
    user_keys = train[user_col].astype(str)
    stats = train[amount_col].groupby(user_keys).agg(['mean', 'std'])
    stats['std'] = stats['std'].fillna(0)  # If a user has only one purchase, std is NaN -> set it to 0

    # Build the lookup dict column-wise; a row-wise apply(axis=1) is far slower on many users
    return dict(zip(stats.index, zip(stats['mean'], stats['std'])))

# Per-user (mean, std) of SalesNetAmountEuro (the function's default column), computed on `train`
user_price_profile = build_user_price_profile(train)

In [None]:
candidates_labeled = candidates_df.copy()

# user_price_profile maps ClientID (as str) -> (mean, std); only the mean is needed here.
# Look it up through a plain dict with string keys instead of a per-row Python lambda,
# so the match does not depend on the candidate IDs already being strings.
user_mean_price = {uid: stats[0] for uid, stats in user_price_profile.items()}
candidates_labeled["u_avg_p"] = (
    candidates_labeled["ClientID"].astype(str).str.strip().map(user_mean_price).astype(float)
)

product_ref_price["ProductID"] = product_ref_price["ProductID"].astype(str).str.strip()
prod_price_map = product_ref_price.set_index('ProductID')['unit_price'].to_dict()
candidates_labeled["i_p"] = (
    candidates_labeled["ProductID"].astype(str).str.strip().map(prod_price_map)
)

# Build three core price features
# Feature A: Price ratio (strong signal: whether the item fits the user's spending level)
candidates_labeled["feat_price_ratio"] = candidates_labeled["i_p"] / (candidates_labeled["u_avg_p"] + 1e-9)

# Feature B: Inverse absolute price difference (distance-based similarity)
candidates_labeled["feat_price_score"] = 1.0 / (1.0 + np.abs(candidates_labeled["i_p"] - candidates_labeled["u_avg_p"]))

# Feature C: Raw price difference
candidates_labeled["feat_price_diff"] = candidates_labeled["i_p"] - candidates_labeled["u_avg_p"]



In [None]:
# Recall sources that may appear in the HitSources string
all_sources = [
    "ItemCF",
    "UserCF",
    "StoreRecall_CountryPopular"
]
# One-hot encode each recall source. The names are literal substrings, not regex
# patterns (regex=False), and a missing HitSources value counts as "not hit"
# (na=False) instead of breaking the integer cast.
for src in all_sources:
    candidates_labeled[f"src_{src}"] = (
        candidates_labeled["HitSources"]
        .str.contains(src, regex=False, na=False)
        .astype(int)
    )

# Reciprocal of (Rank + 1): candidates with a smaller Rank get a larger value
candidates_labeled["inv_rank"] = 1.0 / (candidates_labeled["Rank"] + 1)

In [None]:
# Force ID columns in candidates_labeled to string type
candidates_labeled["ClientID"] = candidates_labeled["ClientID"].astype(str).str.strip()
candidates_labeled["ProductID"] = candidates_labeled["ProductID"].astype(str).str.strip()

# Force ID columns in test_ui to string type
test_ui["ClientID"] = test_ui["ClientID"].astype(str).str.strip()
test_ui["ProductID"] = test_ui["ProductID"].astype(str).str.strip()

# Join test_ui with the candidate set to mark which candidates were really purchased.
# Any "label" column left over from a previous run of this cell is dropped first;
# otherwise the merge would produce label_x / label_y and the lookup below would fail.
candidates_labeled = candidates_labeled.drop(columns=["label"], errors="ignore").merge(
    test_ui,
    on=["ClientID", "ProductID"],
    how="left"
)
# Fill NaN values (candidates not matching real purchases) with 0 -> negative samples
candidates_labeled["label"] = candidates_labeled["label"].fillna(0).astype(int)

# Inspect the class imbalance (positive vs negative samples).
# Only the last expression of a cell is displayed, so the counts are printed explicitly.
print(candidates_labeled["label"].value_counts())
candidates_labeled["label"].mean()

np.float64(0.001989365744744373)

In [None]:
# Split the labelled candidates by class
cand_labels = candidates_labeled["label"]
pos = candidates_labeled.loc[cand_labels == 1]
neg = candidates_labeled.loc[cand_labels == 0]

# Keep at most 10 negatives per positive (1:10 ratio)
n_neg_keep = min(len(neg), 10 * len(pos))
neg_sampled = neg.sample(n=n_neg_keep, random_state=42)

# Stack both classes, then shuffle the rows
train_rank = pd.concat([pos, neg_sampled]).sample(frac=1.0, random_state=42)

# Check class distribution
print(train_rank["label"].value_counts())
print("positive ratio:", train_rank["label"].mean())

label
0    181710
1     18171
Name: count, dtype: int64
positive ratio: 0.09090909090909091


In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Columns fed to the ranking models: recall score/rank signals, per-source one-hot flags
# and the three price features built in the earlier cells
feature = [
    "FinalScore",
    "NumSources",
    "Rank",
    "inv_rank",
    "src_ItemCF",
    "src_UserCF",
    "src_StoreRecall_CountryPopular",
    "feat_price_ratio",
    "feat_price_score",
    "feat_price_diff"
]

# Model inputs and binary target taken from the down-sampled ranking set
X = train_rank[feature]
y = train_rank["label"]
# Multiple (user, item) samples from the same ClientID belong to the same group
groups = train_rank["ClientID"]

# Single 80/20 split in which all rows of a ClientID stay on the same side
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# Perform group-based train/validation split
train_idx, val_idx = next(gss.split(X, y, groups))

# split() yields positional indices, hence .iloc
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

In [None]:
def add_rerank_features(df: pd.DataFrame, feature_cols: list[str],
                        price_profile=None, price_map=None) -> pd.DataFrame:
    """
    Return a copy of ``df`` with any missing re-ranking feature columns added.

    Only features listed in ``feature_cols`` and absent from ``df`` are computed:
    ``inv_rank`` from ``Rank``, the three ``feat_price_*`` columns from the price
    lookups, and ``src_<name>`` flags from the ``HitSources`` string.

    Args:
        df: Candidate rows (``ClientID``, ``ProductID``, ``Rank``, ``HitSources`` as needed).
        feature_cols: Feature names the model expects.
        price_profile: Optional dict user -> (mean, std). Defaults to the notebook-level
            ``user_price_profile``.
        price_map: Optional dict product -> reference price. Defaults to the
            notebook-level ``prod_price_map``.

    Returns:
        A new DataFrame; the input is not modified.
    """
    out = df.copy()

    if "inv_rank" in feature_cols and "inv_rank" not in out.columns:
        out["inv_rank"] = 1.0 / (out["Rank"].astype(float) + 1.0)

    if "feat_price_ratio" in feature_cols and "feat_price_ratio" not in out.columns:
        if price_profile is None:
            price_profile = user_price_profile
        if price_map is None:
            price_map = prod_price_map

        # The profile stores (mean, std) tuples; mapping it directly would put tuples in
        # u_avg_p and break the arithmetic below, so extract the mean first.
        mean_by_user = {uid: stats[0] for uid, stats in price_profile.items()}
        out["u_avg_p"] = out["ClientID"].astype(str).map(mean_by_user).astype(float)
        out["i_p"] = out["ProductID"].astype(str).map(price_map).astype(float)
        out["feat_price_ratio"] = out["i_p"] / (out["u_avg_p"] + 1e-9)
        out["feat_price_score"] = 1.0 / (1.0 + np.abs(out["i_p"] - out["u_avg_p"]))
        out["feat_price_diff"] = out["i_p"] - out["u_avg_p"]

        # Unknown user or product price: neutral defaults (0 for the difference, 1 otherwise)
        for c in ["feat_price_ratio", "feat_price_score", "feat_price_diff"]:
            out[c] = out[c].fillna(0.0 if "diff" in c else 1.0)

    # HitSources -> src_*
    if "HitSources" in out.columns:
        hs = out["HitSources"].astype(str)
        prefix = "src_"
        for c in feature_cols:
            if c.startswith(prefix) and c not in out.columns:
                # Strip only the leading prefix and match the source name literally
                src = c[len(prefix):]
                out[c] = hs.str.contains(src, regex=False).astype(int)

    return out

In [None]:
def predict_score(model, X: pd.DataFrame) -> np.ndarray:
    """
    Score the rows of ``X`` with ``model``.

    Models exposing ``predict_proba`` return the second column of that output;
    any other model returns the output of ``predict``.
    """
    if not hasattr(model, "predict_proba"):
        return model.predict(X)
    class_probabilities = model.predict_proba(X)
    return class_probabilities[:, 1]

In [None]:
def rerank_candidates(df: pd.DataFrame,
                      model,
                      feature_cols: list[str],
                      user_col: str = "ClientID",
                      score_col: str = "pred") -> pd.DataFrame:
    """
    Score candidates with ``model`` and order them per user by descending score.

    Missing feature columns are first derived with ``add_rerank_features``.

    Raises:
        KeyError: if any column in ``feature_cols`` is still absent afterwards.
    """
    enriched = add_rerank_features(df, feature_cols)

    absent = [col for col in feature_cols if col not in enriched.columns]
    if absent:
        raise KeyError(f"Missing features: {absent}")

    enriched[score_col] = predict_score(model, enriched[feature_cols])
    ordered = enriched.sort_values(by=[user_col, score_col], ascending=[True, False])
    return ordered.copy()

In [None]:
def eval_metrics_long(df, method_name, k=10,
                      user_col="ClientID", item_col="ProductID", label_col="label",
                      sort_col=None, ascending=False):
    """
    Compute top-k ranking metrics for one method, in long format.

    Args:
        df: Labelled candidates with ``user_col``, ``item_col`` and ``label_col``
            (1 = relevant, 0 = not relevant).
        method_name: Value written to the "Method" column.
        k: Cut-off; the first ``k`` rows of each user are evaluated.
        sort_col: Optional column to order each user's rows by before the cut-off.
            When None the existing row order is used.
        ascending: Sort direction for ``sort_col``.

    Returns:
        DataFrame with columns Method, Metric, K, Value and one row per metric:
        HitRate, Recall, Precision, NDCG, AvgHits, Coverage, Users. Recall and the
        NDCG ideal ranking are relative to the relevant items present in ``df``.
    """
    d = df.copy()

    # Sort within each user:
    # - Recall stage: sort by Rank (ascending)
    # - Rerank stage: sort by prediction score (descending)
    if sort_col is not None:
        d = d.sort_values([user_col, sort_col], ascending=[True, ascending])

    topk = d.groupby(user_col, as_index=False).head(k)
    users = topk[user_col].unique()

    hit_cnt = (topk[topk[label_col] == 1]
               .groupby(user_col)[item_col].nunique()
               .reindex(users, fill_value=0))
    gt_cnt = (d[d[label_col] == 1]
              .groupby(user_col)[item_col].nunique()
              .reindex(users, fill_value=0))

    # Compute ranking metrics
    hitrate = (hit_cnt > 0).mean()
    precision = (hit_cnt / k).mean()

    # Recall (users without any relevant candidate contribute 0)
    recall_u = np.zeros(len(users))
    np.divide(hit_cnt.values, gt_cnt.values, out=recall_u, where=(gt_cnt.values > 0))
    recall = recall_u.mean()

    avg_hits = hit_cnt.mean()
    # Item coverage in top-K recommendations
    coverage = topk[item_col].nunique() / max(1, d[item_col].nunique())

    # NDCG
    def dcg(rel):
        return np.sum(rel / np.log2(np.arange(2, len(rel) + 2)))

    ndcgs = []
    # Iterate over the full per-user lists: the ideal ranking must place ALL of the
    # user's relevant items first, including those currently ranked below position k.
    # Normalising by the top-k labels alone would overstate NDCG.
    for _, g in d.groupby(user_col):
        rel_all = g[label_col].to_numpy(dtype=float)
        rel_topk = rel_all[:k]
        if rel_topk.sum() == 0:
            ndcgs.append(0.0)
        else:
            ideal = np.sort(rel_all)[::-1][:k]
            ndcgs.append(dcg(rel_topk) / dcg(ideal))
    ndcg = float(np.mean(ndcgs))

    rows = [
        {"Method": method_name, "Metric": "HitRate", "K": k, "Value": float(hitrate)},
        {"Method": method_name, "Metric": "Recall", "K": k, "Value": float(recall)},
        {"Method": method_name, "Metric": "Precision", "K": k, "Value": float(precision)},
        {"Method": method_name, "Metric": "NDCG", "K": k, "Value": float(ndcg)},
        {"Method": method_name, "Metric": "AvgHits", "K": k, "Value": float(avg_hits)},
        {"Method": method_name, "Metric": "Coverage", "K": k, "Value": float(coverage)},
        {"Method": method_name, "Metric": "Users", "K": k, "Value": int(len(users))},
    ]
    return pd.DataFrame(rows)

In [None]:
import lightgbm as lgb

# Gradient-boosted binary classifier used as the first re-ranking model
lgb_model = lgb.LGBMClassifier(
    objective="binary",
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    # LightGBM only performs row subsampling (bagging) when the bagging frequency is
    # non-zero; with the default of 0 the subsample=0.8 setting above has no effect.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42
)

# Fit on the training users; AUC is tracked on the held-out validation users
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc"
)

[LightGBM] [Info] Number of positive: 14578, number of negative: 145243
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 159821, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.091215 -> initscore=-2.298895
[LightGBM] [Info] Start training from score -2.298895


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted binary classifier used as the second re-ranking model
xgb_model = XGBClassifier(
    objective="binary:logistic",
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="auc",
    tree_method="hist",
    random_state=42,
    n_jobs=-1
)

# Report the validation AUC every 50 boosting rounds instead of once per round,
# which would otherwise flood the cell output with one log line per tree.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=50
)

[0]	validation_0-auc:0.72783
[1]	validation_0-auc:0.73069
[2]	validation_0-auc:0.73151
[3]	validation_0-auc:0.73297
[4]	validation_0-auc:0.73509
[5]	validation_0-auc:0.73530
[6]	validation_0-auc:0.73540
[7]	validation_0-auc:0.73604
[8]	validation_0-auc:0.73582
[9]	validation_0-auc:0.73598
[10]	validation_0-auc:0.73609
[11]	validation_0-auc:0.73660
[12]	validation_0-auc:0.73688
[13]	validation_0-auc:0.73694
[14]	validation_0-auc:0.73708
[15]	validation_0-auc:0.73740
[16]	validation_0-auc:0.73747
[17]	validation_0-auc:0.73780
[18]	validation_0-auc:0.73785
[19]	validation_0-auc:0.73781
[20]	validation_0-auc:0.73809
[21]	validation_0-auc:0.73815
[22]	validation_0-auc:0.73839
[23]	validation_0-auc:0.73845
[24]	validation_0-auc:0.73871
[25]	validation_0-auc:0.73892
[26]	validation_0-auc:0.73902
[27]	validation_0-auc:0.73908
[28]	validation_0-auc:0.73917
[29]	validation_0-auc:0.73913
[30]	validation_0-auc:0.73921
[31]	validation_0-auc:0.73943
[32]	validation_0-auc:0.73942
[33]	validation_0-au

In [None]:
# Users that landed in the validation fold of the group split
val_users = set(train_rank["ClientID"].iloc[val_idx].astype(str).unique())

# Evaluate on every labelled candidate of those users (not only the down-sampled rows);
# filter first so that only the validation subset is copied
candidate_ids = candidates_labeled["ClientID"].astype(str)
base_val = candidates_labeled[candidate_ids.isin(val_users)].copy()
base_val["ClientID"] = base_val["ClientID"].astype(str)

# Score and reorder the same candidates with each of the two models
rerank_val_lgb = rerank_candidates(base_val, lgb_model, feature, score_col="pred_lgb")
rerank_val_xgb = rerank_candidates(base_val, xgb_model, feature, score_col="pred_xgb")

In [None]:
K = 30

# (candidates, method label, column to sort by, ascending?) for each ranking to compare
eval_specs = [
    (base_val, "Recall", "Rank", True),
    (rerank_val_lgb, "Rerank(LGB)", "pred_lgb", False),
    (rerank_val_xgb, "Rerank(XGB)", "pred_xgb", False),
]

df_long = pd.concat(
    [
        eval_metrics_long(frame, method, k=K, sort_col=order_col, ascending=asc)
        for frame, method, order_col, asc in eval_specs
    ],
    ignore_index=True,
)

# One row per method, one column per metric
summary_df = df_long.pivot(index="Method", columns="Metric", values="Value").reset_index()

summary_df

Metric,Method,AvgHits,Coverage,HitRate,NDCG,Precision,Recall,Users
0,Recall,0.14918,0.712063,0.132193,0.053229,0.004973,0.116012,8654.0
1,Rerank(LGB),0.186734,0.491202,0.164664,0.066435,0.006224,0.148514,8654.0
2,Rerank(XGB),0.183383,0.477302,0.162815,0.065699,0.006113,0.146313,8654.0


In [None]:

# Default column names used by build_reco_output to join clients with stocks
CLIENT_COUNTRY_COL = "ClientCountry"
STOCK_COUNTRY_COL  = "StoreCountry"
STOCK_QTY_COL      = "Quantity"

# Product columns to attach as metadata; build_reco_output keeps only those
# that actually exist in the products table
PRODUCT_META_COLS = ["Universe", "Category", "Price"]

In [None]:
def map_strategy(hit_sources: str) -> str:
    """
    Translate a HitSources string into a human-readable strategy label.

    Returns "CF" when a collaborative-filtering source is present, "Popularity"
    when a popularity/store source is present, "Hybrid (CF + Popularity)" when
    both are, and "Other" when neither is found.
    """
    text = str(hit_sources)
    from_cf = any(tag in text for tag in ("ItemCF", "UserCF"))
    from_popularity = any(tag in text for tag in ("Popular", "StoreRecall"))

    if from_cf:
        return "Hybrid (CF + Popularity)" if from_popularity else "CF"
    return "Popularity" if from_popularity else "Other"

In [None]:
def build_reco_output(
    rerank_df: pd.DataFrame,
    clients: pd.DataFrame,
    products: pd.DataFrame,
    stocks: pd.DataFrame,
    k: int = 10,
    score_col: str = "pred",
    client_country_col: str = CLIENT_COUNTRY_COL,
    stock_country_col: str = STOCK_COUNTRY_COL,
    stock_qty_col: str = STOCK_QTY_COL,
    product_meta_cols: list = PRODUCT_META_COLS
) -> pd.DataFrame:
    """
    Turn re-ranked candidates into the final top-k recommendation table.

    For each client the ``k`` highest-scoring candidates are kept and numbered 1..k.
    Each row is tagged with its recall strategy, an in-stock flag for the client's
    country and the available product metadata.

    Args:
        rerank_df: Scored candidates with ``ClientID``, ``ProductID`` and ``score_col``
            (``HitSources`` is optional).
        clients: Client table; ``client_country_col`` is used when present.
        products: Product table; columns listed in ``product_meta_cols`` are used when present.
        stocks: Stock table with ``ProductID``, ``stock_country_col`` and ``stock_qty_col``.
        k: Number of recommendations kept per client.
        score_col: Column holding the model score.
        client_country_col: Country column in ``clients``.
        stock_country_col: Country column in ``stocks``.
        stock_qty_col: Quantity column in ``stocks``.
        product_meta_cols: Candidate metadata columns to copy from ``products``.

    Returns:
        DataFrame with ClientID, Recommended_ProductID, "Rank (1-k)", Score, Strategy,
        In_Stock_Flag, Product_Metadata and the metadata columns that were found.
        In_Stock_Flag is NaN when the stock or client-country columns are unavailable.
    """
    df = rerank_df.copy()

    df["ClientID"] = df["ClientID"].astype(str)
    df["ProductID"] = df["ProductID"].astype(str)

    # Sort by score, keep the top-k rows per client and number them 1..k
    df = df.sort_values(["ClientID", score_col], ascending=[True, False])
    df = df.groupby("ClientID").head(k).copy()
    df["Rank (1-k)"] = df.groupby("ClientID").cumcount() + 1

    # Strategy
    if "HitSources" in df.columns:
        df["Strategy"] = df["HitSources"].apply(map_strategy)
    else:
        df["Strategy"] = "Unknown"

    # ClientCountry
    has_client_country = client_country_col in clients.columns
    if has_client_country:
        tmp_clients = clients[["ClientID", client_country_col]].copy()
        tmp_clients["ClientID"] = tmp_clients["ClientID"].astype(str)
        tmp_clients[client_country_col] = tmp_clients[client_country_col].astype(str)
        # One row per client: a duplicated ClientID would duplicate recommendation
        # rows in the left join and break the 1..k numbering.
        tmp_clients = tmp_clients.drop_duplicates(subset="ClientID")
        # Drop any same-named column already in df so the merge cannot create _x/_y suffixes
        df = df.drop(columns=[client_country_col], errors="ignore")
        df = df.merge(tmp_clients, on="ClientID", how="left")
    else:
        df[client_country_col] = np.nan

    # In_Stock_Flag: total stock per (ProductID, country), matched on the client's country
    has_stock_cols = (stock_qty_col in stocks.columns) and (stock_country_col in stocks.columns)
    if has_stock_cols and has_client_country:
        tmp_stocks = stocks[["ProductID", stock_country_col, stock_qty_col]].copy()
        tmp_stocks["ProductID"] = tmp_stocks["ProductID"].astype(str)
        tmp_stocks[stock_country_col] = tmp_stocks[stock_country_col].astype(str)
        tmp_stocks[stock_qty_col] = pd.to_numeric(tmp_stocks[stock_qty_col], errors="coerce").fillna(0)

        tmp_stocks = tmp_stocks.groupby(["ProductID", stock_country_col], as_index=False)[stock_qty_col].sum()
        # Private column names so the join cannot collide with columns already in df
        tmp_stocks = tmp_stocks.rename(columns={
            stock_country_col: "_stock_country",
            stock_qty_col: "_stock_qty",
        })

        df = df.merge(
            tmp_stocks,
            left_on=["ProductID", client_country_col],
            right_on=["ProductID", "_stock_country"],
            how="left"
        )
        df["In_Stock_Flag"] = (df["_stock_qty"].fillna(0) > 0).astype(int)
    else:
        # Without a client country (or stock columns) availability cannot be determined;
        # joining an all-NaN float key against string countries would raise in pandas.
        df["In_Stock_Flag"] = np.nan

    # Product metadata: Universe / Category / Price (whichever exist in products)
    meta_cols_exist = [c for c in product_meta_cols if c in products.columns]
    tmp_products = products[["ProductID"] + meta_cols_exist].copy()
    tmp_products["ProductID"] = tmp_products["ProductID"].astype(str)
    # One row per product, for the same fan-out reason as the client join
    tmp_products = tmp_products.drop_duplicates(subset="ProductID")

    df = df.drop(columns=meta_cols_exist, errors="ignore")
    df = df.merge(tmp_products, on="ProductID", how="left")

    meta_fields = [c for c in product_meta_cols if c in df.columns]
    df["Product_Metadata"] = df[meta_fields].to_dict(orient="records") if meta_fields else None

    out = df.rename(columns={
        "ProductID": "Recommended_ProductID",
        score_col: "Score"
    })[
        [
            "ClientID",
            "Recommended_ProductID",
            "Rank (1-k)",
            "Score",
            "Strategy",
            "In_Stock_Flag",
            "Product_Metadata"
        ] + meta_fields
    ].copy()

    return out

In [None]:
# Build the top-30 recommendation table from the XGBoost-scored validation candidates
final_out_xgb = build_reco_output(
    rerank_df=rerank_val_xgb,
    clients=clients,
    products=products,
    stocks=stocks,
    k=30,
    score_col="pred_xgb"
)

# Preview the first rows
final_out_xgb.head()

Unnamed: 0,ClientID,Recommended_ProductID,Rank (1-k),Score,Strategy,In_Stock_Flag,Product_Metadata,Universe,Category
0,1000031093718265133,2061358648378984723,1,0.508023,Hybrid (CF + Popularity),1,"{'Universe': 'Women', 'Category': 'Football'}",Women,Football
1,1000031093718265133,4228376913636946446,2,0.412605,Hybrid (CF + Popularity),1,"{'Universe': 'Women', 'Category': 'Football'}",Women,Football
2,1000031093718265133,8085696973862495176,3,0.408949,Hybrid (CF + Popularity),1,"{'Universe': 'Men', 'Category': 'Football'}",Men,Football
3,1000031093718265133,6448349970795474714,4,0.407192,Hybrid (CF + Popularity),1,"{'Universe': 'Women', 'Category': 'Football'}",Women,Football
4,1000031093718265133,4977027123522759904,5,0.401817,Hybrid (CF + Popularity),1,"{'Universe': 'Women', 'Category': 'Tennis'}",Women,Tennis


In [None]:
# Export the XGBoost recommendations to CSV (row index omitted)
final_out_xgb.to_csv("recommendations_xgb_top30.csv", index=False)

In [None]:
# Build the top-30 recommendation table from the LightGBM-scored validation candidates
final_out_lgb = build_reco_output(
    rerank_df=rerank_val_lgb,
    clients=clients,
    products=products,
    stocks=stocks,
    k=30,
    score_col="pred_lgb"
)
# Preview the first rows
final_out_lgb.head()

Unnamed: 0,ClientID,Recommended_ProductID,Rank (1-k),Score,Strategy,In_Stock_Flag,Product_Metadata,Universe,Category
0,1000031093718265133,2061358648378984723,1,0.449168,Hybrid (CF + Popularity),1,"{'Universe': 'Women', 'Category': 'Football'}",Women,Football
1,1000031093718265133,8085696973862495176,2,0.424145,Hybrid (CF + Popularity),1,"{'Universe': 'Men', 'Category': 'Football'}",Men,Football
2,1000031093718265133,4977027123522759904,3,0.401349,Hybrid (CF + Popularity),1,"{'Universe': 'Women', 'Category': 'Tennis'}",Women,Tennis
3,1000031093718265133,6448349970795474714,4,0.385281,Hybrid (CF + Popularity),1,"{'Universe': 'Women', 'Category': 'Football'}",Women,Football
4,1000031093718265133,4228376913636946446,5,0.348852,Hybrid (CF + Popularity),1,"{'Universe': 'Women', 'Category': 'Football'}",Women,Football


In [None]:
# Export the LightGBM recommendations to CSV (row index omitted)
final_out_lgb.to_csv("recommendations_lgb_top30.csv", index=False)