#XGBoost Ranker Recommender System

In [None]:
# import library
import os
import sys
import gc

import random
from datetime import datetime
from tqdm import notebook
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
from typing import List
import xgboost as xgb
from typing import Optional, Tuple


##Prepare data

In [None]:
# Raw CSV inputs (data from Kaggle).
transaction_pth = "/content/transactions_train.csv"
article_pth = "/content/articles.csv"
customer_pth = "/content/customers.csv"

# Outputs of the feature-extraction step.
adv_item_feature_pth = "/content/item_features.parquet"
adv_user_feature_pth = "/content/cust_features"


In [None]:
def _prefix_zero(ids):
    """Return ``ids`` converted to strings with a literal '0' prepended."""
    return '0' + ids.astype(str)


# Customer features: integer-encode the two categorical columns as int8 codes.
user_features = pd.read_parquet(adv_user_feature_pth)
_categorical_cols = ['club_member_status', 'fashion_news_frequency']
_category_codes = user_features[_categorical_cols].apply(
    lambda column: pd.factorize(column)[0]
)
user_features[_categorical_cols] = _category_codes.astype('int8')
user_features = user_features.reset_index()
customer_df = pd.read_csv(customer_pth)

# Article metadata and article features, keyed by the '0'-prefixed string id.
article_df = pd.read_csv(article_pth)
article_df['article_id'] = _prefix_zero(article_df['article_id'])
item_features = pd.read_parquet(adv_item_feature_pth).reset_index()
item_features['article_id'] = _prefix_zero(item_features['article_id'])

# Transactions: parse dates, align article ids with the tables above, and
# number the weeks backwards from the most recent transaction date so that
# the final 7 days are week 104.
transaction_df = pd.read_csv(transaction_pth)
transaction_df['t_dat'] = pd.to_datetime(transaction_df['t_dat'])
transaction_df['article_id'] = _prefix_zero(transaction_df['article_id'])
_days_before_last = (transaction_df['t_dat'].max() - transaction_df['t_dat']).dt.days
transaction_df['week'] = 104 - _days_before_last // 7


##NegativeSampling

In [None]:
from typing import Optional, Tuple
import pandas as pd


class NegativeSampling:
    """Assemble a ranking dataset of real purchases plus negative candidates.

    The largest ``week`` in ``transaction_df`` is held out as the validation
    week and the weeks just before it form the training window.  Candidate
    (customer, article) rows come from two sources:

    * every purchase in the training window, re-dated to the customer's next
      purchase week (or to the validation week when there is none);
    * the first 12 entries of the previous week's sales ranking.

    ``transaction_df`` must provide the columns ``customer_id``,
    ``article_id``, ``price`` and ``week``.

    Attributes:
        transaction_df: The frame passed to the constructor (not copied).
        train_trans: Transactions inside the training window.
        valid_trans: Transactions of the validation week.
        valid_week: Largest value of the ``week`` column.
        bestsellers_previous_week: ``None`` until
            ``create_data_with_neg_sample`` has run, afterwards the bestseller
            table that call used.
    """

    def __init__(self, transaction_df: pd.DataFrame, train_inteval: int = 10):
        """Split ``transaction_df`` into training and validation transactions.

        Args:
            transaction_df: Transaction log, one row per purchased article.
            train_inteval: Width of the look-back window in weeks (the
                spelling is kept because callers pass it by keyword).  Weeks
                ``w`` with ``valid_week - train_inteval < w < valid_week``
                are used for training.
        """
        self.transaction_df = transaction_df
        self.valid_week = transaction_df.week.max()
        self.train_trans, self.valid_trans = self._train_valid_split(
            transaction_df, train_inteval
        )
        # Filled in by create_data_with_neg_sample(); defined here so the
        # attribute always exists.
        self.bestsellers_previous_week: Optional[pd.DataFrame] = None

    def _train_valid_split(
        self, transaction_df: pd.DataFrame, train_interval: int
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Return ``(train_trans, valid_trans)`` for the given window width.

        The validation part is the last week; the training part is every
        earlier week that is strictly greater than
        ``last_week - train_interval``.
        """
        last_week = transaction_df.week.max()
        is_last_week = transaction_df.week == last_week
        in_window = transaction_df.week > last_week - train_interval

        valid_trans = transaction_df[is_last_week]
        train_trans = transaction_df[~is_last_week & in_window]
        return train_trans, valid_trans

    def create_data_with_neg_sample(
        self,
        extra_user_features: Optional[pd.DataFrame] = None,
        extra_item_features: Optional[pd.DataFrame] = None,
    ) -> pd.DataFrame:
        """Combine purchases and candidates into one labelled frame.

        Args:
            extra_user_features: Optional frame left-joined on ``customer_id``.
            extra_item_features: Optional frame left-joined on ``article_id``.

        Returns:
            A frame sorted by ``week`` then ``customer_id`` with a fresh
            index.  ``purchased`` is 1 for real training purchases and 0 for
            generated candidates; ``bestseller_rank`` is the article's rank
            in the previous week's bestseller table, or 999 when it is not in
            that table.  Rows of the earliest week are dropped because no
            previous-week bestseller table exists for it.

        Side effects:
            Stores the bestseller table in ``self.bestsellers_previous_week``.
        """
        purchases = self.train_trans.copy()
        purchases["purchased"] = 1

        candidates_last_purchase = self._find_last_purchase()
        candidates_bestsellers, bestsellers_previous_week = self._find_bestsellers()
        self.bestsellers_previous_week = bestsellers_previous_week

        # Purchases come first so that drop_duplicates (keep="first") keeps
        # the positive row when a candidate duplicates a real purchase.
        data = pd.concat(
            [purchases, candidates_last_purchase, candidates_bestsellers],
            ignore_index=True,
        )
        # Candidate rows carry no label yet.  Assign the filled column back
        # instead of the chained ``data.purchased.fillna(..., inplace=True)``,
        # which is a no-op under pandas Copy-on-Write.
        data["purchased"] = data["purchased"].fillna(0)
        data = data.drop_duplicates(["customer_id", "article_id", "week"])
        data = pd.merge(
            data,
            bestsellers_previous_week[["week", "article_id", "bestseller_rank"]],
            on=["week", "article_id"],
            how="left",
        )
        # Fill before filtering so the assignment targets the merge result
        # itself rather than a boolean-mask slice of it.
        data["bestseller_rank"] = data["bestseller_rank"].fillna(999)
        data = data[data.week != data.week.min()]

        if extra_item_features is not None:
            data = pd.merge(data, extra_item_features, on="article_id", how="left")
        if extra_user_features is not None:
            data = pd.merge(data, extra_user_features, on="customer_id", how="left")

        data = data.sort_values(["week", "customer_id"]).reset_index(drop=True)
        return data

    def _find_last_purchase(self) -> pd.DataFrame:
        """Re-date each training purchase to the customer's next purchase week.

        Returns:
            A copy of ``train_trans`` whose ``week`` column holds, per row,
            the next week (in ascending week order) in which the same
            customer bought anything, or ``valid_week`` if there is none.
        """
        # Only weeks from the training window onwards are ever looked up, and
        # the successor of such a week can only be a later week, so older
        # history does not influence the mapping.
        first_train_week = self.train_trans.week.min()
        history = self.transaction_df[self.transaction_df.week >= first_train_week]
        weeks_by_customer = history.groupby("customer_id")["week"].unique()

        next_week_by_customer = {}
        for c_id, weeks in weeks_by_customer.items():
            # unique() keeps order of appearance; sort so the successor of a
            # week is correct even when the rows are not in date order.
            ordered = sorted(weeks)
            successors = ordered[1:] + [self.valid_week]
            next_week_by_customer[c_id] = dict(zip(ordered, successors))

        candidates_last_purchase = self.train_trans.copy()
        candidates_last_purchase["week"] = [
            next_week_by_customer[c_id][week]
            for c_id, week in zip(
                self.train_trans["customer_id"], self.train_trans["week"]
            )
        ]
        return candidates_last_purchase

    def _find_bestsellers(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Build bestseller candidates from the previous week's sales.

        Returns:
            ``(candidates_bestsellers, bestsellers_previous_week)``.  The
            second frame has one row per (week, article) with
            ``bestseller_rank`` and the mean ``price``, where ``week`` is the
            week *after* the one the sales were counted in.  The first frame
            pairs those articles with every customer active in that week,
            plus every training customer for the validation week, and has no
            ``bestseller_rank`` column.
        """
        mean_price = self.train_trans.groupby(["week", "article_id"])["price"].mean()
        # Per week: count sales per article, dense-rank the counts (1 = most
        # sold) and keep the first 12 entries of the descending count order.
        sales = (
            self.train_trans.groupby("week")["article_id"]
            .value_counts()
            .groupby("week")
            .rank(method="dense", ascending=False)
            .groupby("week")
            .head(12)
            .rename("bestseller_rank")
            .astype("int8")
        )

        bestsellers_previous_week = pd.merge(
            sales, mean_price, on=["week", "article_id"]
        ).reset_index()
        # Shift by one so that a row's week is the week it is a candidate for.
        bestsellers_previous_week.week += 1

        # One row per (week, customer); the first transaction supplies the
        # remaining non-article columns for the generated candidates.
        unique_transactions = (
            self.train_trans.groupby(["week", "customer_id"])
            .head(1)
            .drop(columns=["article_id", "price"])
        ).copy()

        candidates_bestsellers = pd.merge(
            unique_transactions,
            bestsellers_previous_week,
            on="week",
        )

        # Every customer seen in training also gets the bestsellers as
        # candidates for the validation week.
        valid_set_transactions = unique_transactions.drop_duplicates(
            "customer_id"
        ).reset_index(drop=True)
        valid_set_transactions["week"] = self.valid_week

        candidates_bestsellers_valid_week = pd.merge(
            valid_set_transactions, bestsellers_previous_week, on="week"
        )

        candidates_bestsellers = pd.concat(
            [candidates_bestsellers, candidates_bestsellers_valid_week]
        )
        candidates_bestsellers = candidates_bestsellers.drop(
            columns="bestseller_rank"
        )

        return candidates_bestsellers, bestsellers_previous_week


##Preprocessing

In [None]:
# Preprocessing: split the transactions (train_inteval=10) and build the
# purchases-plus-candidates table with both feature sets attached.
neg_sampling = NegativeSampling(transaction_df, train_inteval=10)
data = neg_sampling.create_data_with_neg_sample(
    extra_item_features=item_features,
    extra_user_features=user_features,
)

In [None]:
# Train / validation split: rows of the held-out week form the validation set.
valid_week = neg_sampling.valid_week
_in_valid_week = data.week == valid_week
train = data[~_in_valid_week]
valid = (
    data[_in_valid_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

# Label, identifier and raw transaction columns that are not model features.
_non_feature_cols = [
    'purchased', 't_dat', 'price', 'sales_channel_id',
    'customer_id', 'article_id', 'week',
]
train_X = train.drop(columns=_non_feature_cols)
train_y = train['purchased']
valid_X = valid.drop(columns=_non_feature_cols)

##Training

####XGBRanker model


In [None]:
# Query-group sizes for the ranker: number of rows per (week, customer).
train_baskets = (
    train
    .groupby(['week', 'customer_id'])['article_id']
    .count()
    .values
)

# Model hyper-parameters
xgb_ranker = xgb.XGBRanker(eta=0.5, max_depth=10, n_estimators=100)

# Training
xgb_ranker.fit(train_X, train_y, group=train_baskets)

##Prediction & Measurement

###Define the MAP@k metric functions

In [None]:
def apk(actual, predicted, k=12):
    """Return the average precision at ``k`` for one set of predictions.

    Args:
        actual: Collection of relevant items.
        predicted: Ordered sequence of predicted items, best first.
        k: Number of leading predictions that are scored.

    Returns:
        0.0 when ``actual`` is empty; otherwise the sum of precision values
        at each rank where a new relevant item appears, divided by
        ``min(len(actual), k)``.
    """
    # Only the first k predictions are scored.
    top_k = predicted[:k] if len(predicted) > k else predicted

    if not actual:
        return 0.0

    hits = 0.0
    precision_sum = 0.0
    already_seen = []
    for rank, item in enumerate(top_k, start=1):
        # Count an item only if it is relevant and was not predicted at an
        # earlier rank.
        if item in actual and item not in already_seen:
            hits += 1.0
            # Precision at this rank.
            precision_sum += hits / rank
        already_seen.append(item)

    # Average over the number of items that could have been hit.
    return precision_sum / min(len(actual), k)


def mapk(
    measure_df: pd.DataFrame,
    pred_col: str = "prediction",
    ground_true_col: str = "ground_true",
    k=12,
):
    """Return the mean average precision at ``k`` over the rows of a frame.

    Args:
        measure_df: Frame with one row per evaluated query.
        pred_col: Column holding each row's ordered list of predictions.
        ground_true_col: Column holding each row's list of relevant items.
        k: Cut-off forwarded to ``apk`` for every row.

    Returns:
        ``np.mean`` of the per-row ``apk`` scores (``nan`` for an empty
        frame, as ``np.mean`` of an empty list).
    """
    pred_list = measure_df[pred_col].to_list()
    ground_true_list = measure_df[ground_true_col].to_list()
    # Forward the caller's k; it used to be hard-coded to 12 here, which
    # silently ignored the parameter.
    apks = [
        apk(g_true, pred, k=k)
        for pred, g_true in zip(pred_list, ground_true_list)
    ]
    return np.mean(apks)

###Make predictions and evaluate the model

####XGBRanker model

In [None]:
# Evaluation frame with ONE row per customer.  `valid` holds one row per
# candidate, so taking its customer_id column as-is would repeat each
# customer once per candidate and weight the mean AP by candidate count.
valid_sub = valid[["customer_id"]].drop_duplicates().reset_index(drop=True)

# Score every validation candidate.
valid['preds'] = xgb_ranker.predict(valid_X)

# Per customer: candidate article ids ordered from highest to lowest score.
c_id2predicted_article_ids = (
    valid
    .sort_values(['customer_id', 'preds'], ascending=False)
    .groupby('customer_id')['article_id'].apply(list).to_dict()
)

# At most 12 articles per customer, stored as a space-separated string.
preds = [
    ' '.join(str(a_id) for a_id in c_id2predicted_article_ids.get(c_id, [])[:12])
    for c_id in valid_sub.customer_id
]
valid_sub['prediction'] = preds

# Ground truth: the articles each customer bought in the validation week.
valid_ground_true = neg_sampling.valid_trans.groupby(
    'customer_id', as_index=False
).agg(ground_true=('article_id', list))

# NOTE: the inner join keeps only customers that have both candidates and
# validation-week purchases, so the score is MAP over those customers only.
valid_measure_df = valid_sub[['customer_id', 'prediction']].merge(
    valid_ground_true, on='customer_id', how='inner'
)
valid_measure_df['prediction'] = [
    pred.split(' ') for pred in valid_measure_df['prediction']
]
valid_mapk_xgb = mapk(valid_measure_df, pred_col='prediction', ground_true_col='ground_true', k=12)
valid_mapk_xgb

0.03707456781871971