In [1]:
from Question1 import *
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from lightgbm.sklearn import LGBMRanker
from scipy.sparse import csr_matrix

In [2]:
BASE_PATH = '../Data/'
transactions = pd.read_parquet(BASE_PATH + 'transactions_train.parquet')
customers = pd.read_parquet(BASE_PATH + 'customers.parquet')
articles = pd.read_parquet(BASE_PATH + 'articles.parquet')
sample_submission = pd.read_csv(BASE_PATH + 'sample_submission.csv')

In [30]:
# Candidate generation of Radek notebook
def get_data(data, test_week):
    ### repurchase
    # each week is seen as a basket
    # the items bought in one basket, will be example for the next basket
    # the items bought in the last basket, will be candidates for the test basket
    candidates_last_purchase = data.copy()
    c2weeks = data.groupby('customer_id')['week'].unique()
    
    c2weeks2shifted_weeks = {}
    for c_id, weeks in c2weeks.items():
        shifted_weeks = weeks[1:].tolist() + [test_week]
        c2weeks2shifted_weeks[c_id] = dict(zip(weeks, shifted_weeks))

    candidates_last_purchase['week'] = [
        c2weeks2shifted_weeks[c_id][week]
        for c_id, week in zip(data['customer_id'], data['week'])
    ]

    ### bestseller
    # if a user bought an item in a given week, the 12 most popular items in the previous week are example for that week
    # the best selling items in the last week are candidates for all users
    mean_price = data \
        .groupby(['week', 'article_id'])['price'].mean()
    sales = data \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(12).rename('bestseller_rank').astype('int8')
    bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
    bestsellers_previous_week.week += 1
    unique_transactions = data \
        .groupby(['week', 'customer_id']) \
        .head(1) \
        .drop(columns=['article_id', 'price']) \
        .copy()
    candidates_bestsellers = pd.merge(
        unique_transactions,
        bestsellers_previous_week,
        on='week',
    )
    test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
    test_set_transactions.week = test_week
    candidates_bestsellers_test_week = pd.merge(
        test_set_transactions,
        bestsellers_previous_week,
        on='week'
    )
    candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
    candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

    ### combine
    d = data.copy()
    d['purchased'] = True
    
    result = pd.concat([
        d, candidates_last_purchase, candidates_bestsellers
    ])
    result.purchased.fillna(False, inplace=True)
    result.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

    result = pd.merge(
        result,
        bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
        on=['week', 'article_id'],
        how='left'
    )
    result = result[result.week != result.week.min()]
    result.bestseller_rank.fillna(999, inplace=True)

    result.sort_values(['week', 'customer_id'], inplace=True)
    result.reset_index(drop=True, inplace=True)
    return result

def get_examples(data, test_week):
    data = get_data(data, test_week)
    return data[data.week != test_week]

def get_candidates(data, test_week):
    data = get_data(data, test_week)
    return data[data.week == test_week]

def add_features(data, columns_to_use = None):
    if not columns_to_use:
        columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id','perceived_colour_master_id', 'department_no', 'index_code','index_group_no', 'section_no', 'garment_group_no','score','price'
        ]

    result = data
    result = pd.merge(result, customers, how='left', on='customer_id')
    result = pd.merge(result, articles, how='left', on='article_id')

    result['score'] = result.apply(get_score,axis=1).fillna(0)
    
    return result[columns_to_use]

def get_score(entry):
    """Method that returns the user-item score given a dataframe row containing columns [customer_id,article_id]"""
    try:
        return ui_score.loc[entry['customer_id'], entry['article_id']]
    except KeyError:
        return None

def recall(predictions, purchases, k=12):
    def calculate_recall(row):
        intersect_count = len(set(row['prediction'][:k]).intersection(row['purchases']))
        return intersect_count / min(len(row['purchases']), k) if len(row['purchases']) > 0 else 0

    result = pd.merge(purchases, predictions, on="customer_id", how="inner")
    result['recall'] = result.apply(calculate_recall, axis=1)

    return result['recall'].mean()

In [4]:
def get_sim(recmodel,purchase_sparse):
    """Method to get the dot product of an item similarity matrix with the articles frequency of every user"""
    s2 = cosine_similarity(recmodel.articles_latent_matrix, recmodel.articles_latent_matrix)
    return purchase_sparse.dot(s2)

def apply_filter(scores, filter_matrix):
    """Method to apply the filter that prunes already purchased articles for every user. It's done in chunks to be less RAM heavy"""
    chunk_size = 10000
    num_rows, num_cols = scores.shape
    result = np.zeros((num_rows, num_cols))
    
    for i in range(0, num_rows, chunk_size):
        chunk_end = min(i + chunk_size, num_rows)
        ui_chunk = scores.iloc[i:chunk_end].values
        filter_chunk = filter_matrix[i:chunk_end]
        result[i:chunk_end] = np.multiply(ui_chunk, filter_chunk)
    
    return pd.DataFrame(result, index=scores.index, columns=scores.columns)


def get_useritem_data(recmodel):
    """Method that returns the user-item interaction matrix with scores"""
    itemcf_transactions['article_id'] = itemcf_transactions['article_id'].astype(int)
    purchase_counts = itemcf_transactions.groupby(['customer_id', 'article_id']).size().rename('count').reset_index().sort_values('article_id') # article frequency matrix 
    
    user_to_index = {user_id: index for index, user_id in enumerate(purchase_counts['customer_id'].unique())}
    article_to_index = {article_id: index for index, article_id in enumerate(purchase_counts['article_id'].unique())}
    
    row_indices = purchase_counts['customer_id'].map(user_to_index).values
    col_indices = purchase_counts['article_id'].map(article_to_index).values
    spdata = purchase_counts['count'].values
    
    # sparse matrix to preserve RAM
    purchase_counts_sparse = csr_matrix((spdata, (row_indices, col_indices)), shape=(len(user_to_index), len(article_to_index)), dtype=int)
    
    # dataframe of the user-item matrix
    result = pd.DataFrame(get_sim(recmodel,purchase_counts_sparse), index=user_to_index.keys(), columns=article_to_index.keys())
    
    # create a matrix containing 1's for items not bought by the user and 0 for item's that were already purchased
    purchase_counts_sparse = csr_matrix((np.ones_like(spdata), (row_indices, col_indices)), shape=(len(user_to_index), len(article_to_index)), dtype=int).toarray()
    filter_matrix = 1 - purchase_counts_sparse
    del purchase_counts
    
    return apply_filter(result,filter_matrix)

In [5]:
### split into training and testing
# one week is used for testing
# a number of weeks leading up to the test week are used to train the ranker
test_week = 104
num_training_weeks = 10
testing_weeks = np.arange(test_week-num_training_weeks, test_week)
train_data = transactions[transactions.week.isin(testing_weeks)].reset_index(drop=True)

## CF preprocessing
We take only the articles bought more than 10 times in the training weeks

In [6]:
itemcf_transactions = train_data.copy().drop(['sales_channel_id', 'price', 'week'], axis=1)
most_bought_articles = itemcf_transactions['article_id'].value_counts()[lambda x: x > 10].index
itemcf_transactions = itemcf_transactions[itemcf_transactions['article_id'].isin(most_bought_articles)]
itemcf_transactions['purchased'] = 1
itemcf_transactions

Unnamed: 0,t_dat,customer_id,article_id,purchased
0,2020-07-08,857913002275398,599580068,1
1,2020-07-08,857913002275398,776237011,1
2,2020-07-08,857913002275398,844294001,1
3,2020-07-08,1658289241058394,877773001,1
4,2020-07-08,3828854365940846,507883009,1
...,...,...,...,...
2809228,2020-09-15,18446630855572834764,568601045,1
2809229,2020-09-15,18446630855572834764,568601045,1
2809230,2020-09-15,18446630855572834764,898713001,1
2809231,2020-09-15,18446630855572834764,898713001,1


### Generate negative candidates

### Train CF model using SGD algorithm
I've imported a pre-trained model, the same one as in notebook RQ1

In [22]:
import pickle
with open('output/60_1000.pickle','rb') as file:
    rec = pickle.load(file)

Calculate the user-item interaction matrix by using the data from the CF model

In [23]:
%%time
ui_score = get_useritem_data(rec)

CPU times: total: 1min 14s
Wall time: 8min 45s


In [24]:
ui_score.describe()

Unnamed: 0,108775044,111565001,111586001,111593001,111609001,120129001,120129014,123173001,126589010,129085001,...,947168001,947509001,947934001,949198001,949551001,949551002,952267001,953450001,953763001,956217002
count,437279.0,437279.0,437279.0,437279.0,437279.0,437279.0,437279.0,437279.0,437279.0,437279.0,...,437279.0,437279.0,437279.0,437279.0,437279.0,437279.0,437279.0,437279.0,437279.0,437279.0
mean,0.002714,-0.000307,0.001153,-0.010573,-0.00539,-0.004359,0.008906,-0.000444,-0.005152,0.005313,...,0.001483,-0.002138,0.007459,0.007834,0.003945,-0.002832,0.000413,-0.008401,-0.011353,-0.008325
std,0.094877,0.093943,0.093916,0.092221,0.09447,0.093073,0.094876,0.093714,0.092812,0.091845,...,0.090319,0.091073,0.091034,0.093892,0.092851,0.093297,0.091268,0.093498,0.094684,0.094458
min,-4.041761,-2.336787,-2.26662,-2.672708,-3.181316,-3.203716,-3.051396,-1.850657,-3.577077,-1.818405,...,-1.91689,-2.474271,-1.313247,-5.925495,-2.478896,-3.269933,-2.090865,-2.854247,-2.34447,-2.678586
25%,-0.041688,-0.044187,-0.042833,-0.051062,-0.047232,-0.046882,-0.037306,-0.044035,-0.047596,-0.038898,...,-0.041013,-0.044818,-0.036344,-0.037021,-0.04022,-0.045832,-0.042934,-0.049267,-0.053492,-0.050177
50%,0.001708,-0.000307,0.001084,-0.006233,-0.002644,-0.002759,0.004634,2.8e-05,-0.003742,0.003415,...,0.000748,-0.002022,0.005084,0.005189,0.002342,-0.001402,-3.9e-05,-0.004557,-0.007284,-0.005382
75%,0.046932,0.043255,0.044577,0.034864,0.039801,0.040188,0.050241,0.043988,0.039108,0.046876,...,0.043246,0.041077,0.048694,0.04966,0.045987,0.041485,0.043731,0.037388,0.035096,0.037598
max,2.346311,2.59837,4.525209,2.467519,1.743278,4.603321,2.622148,2.758256,1.375848,3.8768,...,2.308177,1.567481,3.755734,3.107385,2.288731,2.717287,1.894666,2.868778,1.717103,2.326091


Interesting to see that more than 75% of the articles have a mediocre user-item score, when only a few have a score higher than 1.

These scores are also the same for the scores in RQ1

In [31]:
### assemble training data (positive + negative examples)
# each example has at least a customer_id, article_id and whether it was purchased or not (positive/negative)
# add_features extracts and adds features to the examples
train_examples = get_examples(train_data, test_week)

columnsCF = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id','perceived_colour_master_id', 'department_no', 'index_code','index_group_no', 'section_no', 'garment_group_no','score','price']
X_trainCF = add_features(train_examples,columnsCF)
Y_trainCF = train_examples['purchased']

columnsPop = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id','perceived_colour_master_id', 'department_no', 'index_code','index_group_no', 'section_no', 'garment_group_no','bestseller_rank','price'
        ]
X_trainPop = add_features(train_examples,columnsPop)
Y_trainPop = train_examples['purchased']

In [32]:
### fit collaborative filtering ranker
# training_groups tells LGBM that each (week, customer_id) combination is a seperate basket
# !!! it is important that the training_examples are sorted according to week, customer_id for this to work
rankerCF = LGBMRanker(
    force_row_wise=True,
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
train_groups = train_examples.groupby(['week', 'customer_id'])['article_id'].count().values
rankerCF.fit(X_trainCF, Y_trainCF, group=train_groups)
print_importance(rankerCF, X_trainCF.columns)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.112795
[LightGBM] [Info] Total Bins 1297
[LightGBM] [Info] Number of data points in the train set: 11557594, number of used features: 13
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
                         score 0.99563
                         price 0.00319
                    article_id 0.00049
              garment_group_no 0.00028
                 department_no 0.00016
               product_type_no 0.00015
                    section_no 0.00006
             colour_group_code 0.00004
                index_group_no 0.00000
                    index_code 0.00000
    perceived_colour_master_id 0.00000
     perceived_colour_value_id 0.00000
       graphical_appearance_no 0.00000


In [33]:
### fit popularity ranker 
# training_groups tells LGBM that each (week, customer_id) combination is a seperate basket
# !!! it is important that the training_examples are sorted according to week, customer_id for this to work
rankerPop = LGBMRanker(
    force_row_wise=True,
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
train_groups = train_examples.groupby(['week', 'customer_id'])['article_id'].count().values
rankerPop.fit(X_trainPop, Y_trainPop, group=train_groups)
print_importance(rankerPop, X_trainPop.columns)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.090095
[LightGBM] [Info] Total Bins 1056
[LightGBM] [Info] Number of data points in the train set: 11557594, number of used features: 13
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 13
               bestseller_rank 0.97353
                         price 0.02543
     perceived_colour_value_id 0.00031
                    article_id 0.00027
       graphical_appearance_no 0.00022
              garment_group_no 0.00008
                    section_no 0.00008
                 department_no 0.00005
               product_type_no 0.00004
                index_group_no 0.00000
                    index_code 0.00000
    perceived_colour_master_id 0.00000
             colour_group_code 0.00000


In [None]:
%%time
### test
# candidates are generated similarly to the examples, only we don't know whether they are purchased
# the same features are extracted and added
# each candidate is scored by the ranker and predictions are generated using the highest scoring candidates
test_candidates = get_candidates(train_data, test_week)
X_test = add_features(test_candidates)

In [38]:
def get_predictions_dataframe(scored_candidates,k=12):
    """Method to get a dataframe with predictions from the ranker.predict() function"""
    return (
        scored_candidates.sort_values(["customer_id", "score"], ascending=False)
        .groupby("customer_id")
        .head(k)
        .groupby("customer_id", as_index=False)
        .article_id.apply(list)
        .rename(columns={"article_id": "prediction"})[["customer_id", "prediction"]]
    )

predsCF = rankerCF.predict(X_test)
predsPop = rankerPop.predict(X_test)

predictions5050 = test_candidates.copy()
predictions5050["score"] = predsCF * 0.5 + predsPop * 0.5

predictions8020 = test_candidates.copy()
predictions8020["score"] = predsCF * 0.8 + predsPop * 0.2

predictions2080 = test_candidates.copy()
predictions2080["score"] = predsCF * 0.2 + predsPop * 0.8

predictions5050 = get_predictions_dataframe(predictions5050)
predictions8020 = get_predictions_dataframe(predictions8020)
predictions2080 = get_predictions_dataframe(predictions2080)

In [39]:
### evaluate
# get ground truth data for test week
purchases = get_purchases(transactions[transactions.week == test_week])

# fill missing prediction for customers in test set with popular items in last week
# only for customers in test set because only those are evaluated
popular = transactions[transactions.week == test_week-1].article_id.value_counts().head(12).index.values

predictions5050 = fill_missing_predictions(predictions5050, purchases.customer_id, popular)
predictions8020 = fill_missing_predictions(predictions8020, purchases.customer_id, popular)
predictions2080 = fill_missing_predictions(predictions2080, purchases.customer_id, popular)

In [40]:
# calculate score
score5050 = mean_average_precision(predictions5050, purchases, 12)
score8020 = mean_average_precision(predictions8020, purchases, 12)
score2080 = mean_average_precision(predictions2080, purchases, 12)
print(f"MAP@12:\n\t-50/50: {score5050}\n\t-80/20: {score8020}\n\t-20/80: {score2080}")

MAP@12:
	-50/50: 0.025306594568960544
	-80/20: 0.0256529535993688
	-20/80: 0.024910298083689143


In [41]:
 # calculate recall
recall5050 = recall(predictions5050, purchases, 12)
recall8020 = recall(predictions8020, purchases, 12)
recall2080 = recall(predictions2080, purchases, 12)
print(f"recall@12:\n\t-50/50: {recall5050}\n\t-80/20: {recall8020}\n\t-20/80: {recall2080}")

recall@12:
	-50/50: 0.051982025779573025
	-80/20: 0.05200284381145161
	-20/80: 0.05195246510515268
