In [1]:
from Question1 import *
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from lightgbm.sklearn import LGBMRanker
from scipy.sparse import csr_matrix

In [2]:
BASE_PATH = '../Data/'
transactions = pd.read_parquet(BASE_PATH + 'transactions_train.parquet')
customers = pd.read_parquet(BASE_PATH + 'customers.parquet')
articles = pd.read_parquet(BASE_PATH + 'articles.parquet')
sample_submission = pd.read_csv(BASE_PATH + 'sample_submission.csv')

In [3]:
# Candidate generation of Radek notebook
def get_data(data, test_week):
    ### repurchase
    # each week is seen as a basket
    # the items bought in one basket, will be example for the next basket
    # the items bought in the last basket, will be candidates for the test basket
    candidates_last_purchase = data.copy()
    c2weeks = data.groupby('customer_id')['week'].unique()
    
    c2weeks2shifted_weeks = {}
    for c_id, weeks in c2weeks.items():
        shifted_weeks = weeks[1:].tolist() + [test_week]
        c2weeks2shifted_weeks[c_id] = dict(zip(weeks, shifted_weeks))

    candidates_last_purchase['week'] = [
        c2weeks2shifted_weeks[c_id][week]
        for c_id, week in zip(data['customer_id'], data['week'])
    ]

    ### bestseller
    # if a user bought an item in a given week, the 12 most popular items in the previous week are example for that week
    # the best selling items in the last week are candidates for all users
    mean_price = data \
        .groupby(['week', 'article_id'])['price'].mean()
    sales = data \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(12).rename('bestseller_rank').astype('int8')
    bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
    bestsellers_previous_week.week += 1
    unique_transactions = data \
        .groupby(['week', 'customer_id']) \
        .head(1) \
        .drop(columns=['article_id', 'price']) \
        .copy()
    candidates_bestsellers = pd.merge(
        unique_transactions,
        bestsellers_previous_week,
        on='week',
    )
    test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
    test_set_transactions.week = test_week
    candidates_bestsellers_test_week = pd.merge(
        test_set_transactions,
        bestsellers_previous_week,
        on='week'
    )
    candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
    candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

    ### combine
    d = data.copy()
    d['purchased'] = True
    
    result = pd.concat([
        d, candidates_last_purchase, candidates_bestsellers
    ])
    result.purchased.fillna(False, inplace=True)
    result.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

    result = pd.merge(
        result,
        bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
        on=['week', 'article_id'],
        how='left'
    )
    result = result[result.week != result.week.min()]
    result.bestseller_rank.fillna(999, inplace=True)

    result.sort_values(['week', 'customer_id'], inplace=True)
    result.reset_index(drop=True, inplace=True)
    return result

def get_examples(data, test_week):
    data = get_data(data, test_week)
    return data[data.week != test_week]

def get_candidates(data, test_week):
    data = get_data(data, test_week)
    return data[data.week == test_week]

def add_features(data):
    columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id','perceived_colour_master_id', 'department_no', 'index_code','index_group_no', 'section_no', 'garment_group_no', 'price','score'
    ]

    result = data
    result = pd.merge(result, customers, how='left', on='customer_id')
    result = pd.merge(result, articles, how='left', on='article_id')

    result['score'] = result.apply(get_score,axis=1).fillna(0)
    
    return result[columns_to_use]

def get_score(entry):
    try:
        return ui_score.loc[entry['customer_id'], entry['article_id']]
    except KeyError:
        return None

In [4]:
def get_sim(recmodel,purchase_sparse):
    s2 = cosine_similarity(recmodel.articles_latent_matrix, recmodel.articles_latent_matrix)
    return purchase_sparse.dot(s2)

def apply_filter(scores, filter_matrix):
    chunk_size = 10000
    num_rows, num_cols = scores.shape
    result = np.zeros((num_rows, num_cols))
    
    for i in range(0, num_rows, chunk_size):
        chunk_end = min(i + chunk_size, num_rows)
        ui_chunk = scores.iloc[i:chunk_end].values
        filter_chunk = filter_matrix[i:chunk_end]
        result[i:chunk_end] = np.multiply(ui_chunk, filter_chunk)
    
    return pd.DataFrame(result, index=scores.index, columns=scores.columns)


def get_useritem_data(recmodel):
    itemcf_transactions['article_id'] = itemcf_transactions['article_id'].astype(int)
    purchase_counts = itemcf_transactions.groupby(['customer_id', 'article_id']).size().rename('count').reset_index().sort_values('article_id')
    
    user_to_index = {user_id: index for index, user_id in enumerate(purchase_counts['customer_id'].unique())}
    article_to_index = {article_id: index for index, article_id in enumerate(purchase_counts['article_id'].unique())}
    
    row_indices = purchase_counts['customer_id'].map(user_to_index).values
    col_indices = purchase_counts['article_id'].map(article_to_index).values
    spdata = purchase_counts['count'].values
    
    purchase_counts_sparse = csr_matrix((spdata, (row_indices, col_indices)), shape=(len(user_to_index), len(article_to_index)), dtype=int)
    result = pd.DataFrame(get_sim(recmodel,purchase_counts_sparse), index=user_to_index.keys(), columns=article_to_index.keys())
    
    purchase_counts_sparse = csr_matrix((np.ones_like(spdata), (row_indices, col_indices)), shape=(len(user_to_index), len(article_to_index)), dtype=int).toarray()
    filter_matrix = 1 - purchase_counts_sparse
    del purchase_counts
    
    return apply_filter(result,filter_matrix)

In [5]:
### split into training and testing
# one week is used for testing
# a number of weeks leading up to the test week are used to train the ranker
test_week = 104
num_training_weeks = 10
testing_weeks = np.arange(test_week-num_training_weeks, test_week)
train_data = transactions[transactions.week.isin(testing_weeks)].reset_index(drop=True)

In [6]:
np.random.seed(100)

itemcf_transactions = train_data.copy().drop(['sales_channel_id', 'price', 'week'], axis=1)
most_bought_articles = itemcf_transactions['article_id'].value_counts()[lambda x: x > 10].index
itemcf_transactions = itemcf_transactions[itemcf_transactions['article_id'].isin(most_bought_articles)]
itemcf_transactions['purchased'] = 1

negative_samples = pd.DataFrame({
    'article_id': np.random.choice(itemcf_transactions.article_id.unique(), itemcf_transactions.shape[0]),
    'customer_id': np.random.choice(itemcf_transactions.customer_id.unique(), itemcf_transactions.shape[0]),
    'purchased': np.zeros(itemcf_transactions.shape[0])
})

rec = ItemCF(itemcf_transactions, negative_samples, num_components=1000)
rec.fit(n_epochs=2)

Epoch: 0


100%|██████████| 5490618/5490618 [01:09<00:00, 78701.67it/s]


0.4835677681836136
Epoch: 1


100%|██████████| 5490618/5490618 [01:10<00:00, 78055.67it/s]

0.27997658968282624





In [7]:
%%time
ui_score = get_useritem_data(rec)

CPU times: total: 1min 16s
Wall time: 11min 29s


In [8]:
### assemble training data (positive + negative examples)
# each example has at least a customer_id, article_id and whether it was purchased or not (positive/negative)
# add_features extracts and adds features to the examples
train_examples = get_examples(train_data, test_week)
X_train = add_features(train_examples)
Y_train = train_examples['purchased']

### fit ranker
# training_groups tells LGBM that each (week, customer_id) combination is a seperate basket
# !!! it is important that the training_examples are sorted according to week, customer_id for this to work
ranker = LGBMRanker(
    force_row_wise=True,
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)
train_groups = train_examples.groupby(['week', 'customer_id'])['article_id'].count().values
ranker.fit(X_train, Y_train, group=train_groups)
print_importance(ranker, X_train.columns)



[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.112795
[LightGBM] [Info] Total Bins 1297
[LightGBM] [Info] Number of data points in the train set: 11557594, number of used features: 13
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
                         score 0.99563
                         price 0.00319
                    article_id 0.00049
              garment_group_no 0.00028
                 department_no 0.00016
               product_type_no 0.00015
                    section_no 0.00006
             colour_group_code 0.00004
                index_group_no 0.00000
                    index_code 0.00000
    perceived_colour_master_id 0.00000
     perceived_colour_value_id 0.00000
       graphical_appearance_no 0.00000


In [9]:
%%time
### test
# candidates are generated similarly to the examples, only we don't know whether they are purchased
# the same features are extracted and added
# each candidate is scored by the ranker and predictions are generated using the highest scoring candidates
test_candidates = get_candidates(train_data, test_week)
X_test = add_features(test_candidates)
predictions = get_predictions(test_candidates, X_test, ranker, 12)

### evaluate
if test_week < transactions.week.max() + 1:
    # get ground truth data for test week
    purchases = get_purchases(transactions[transactions.week == test_week])
    
    # fill missing prediction for customers in test set with popular items in last week
    # only for customers in test set because only those are evaluated
    popular = transactions[transactions.week == test_week-1].article_id.value_counts().head(12).index.values
    predictions = fill_missing_predictions(predictions, purchases.customer_id, popular)
    
    # calculate score
    score = mean_average_precision(predictions, purchases, 12)
    print(score)

### submit
else:
    # fill missing predictions for all customers with popular items in last week
    # all customers because we don't know which ones will be evaluated
    popular = transactions[transactions.week == test_week-1].article_id.value_counts().head(12).index.values
    predictions = fill_missing_predictions(predictions, customers.customer_id, popular)

    # write submission
    sub = create_submission(predictions,sample_submission)
    sub.to_csv('output/' + 'sub1.csv.gz', index=False)

0.025965239693955543
CPU times: total: 48.1 s
Wall time: 3min 36s


In [10]:
print(testing_weeks)

[ 94  95  96  97  98  99 100 101 102 103]


In [19]:
def recall(predictions, purchases, k=12):
    def calculate_recall(row):
        intersect_count = len(set(row['prediction'][:k]).intersection(row['purchases']))
        return intersect_count / min(len(row['purchases']), k) if len(row['purchases']) > 0 else 0

    result = pd.merge(purchases, predictions, on="customer_id", how="inner")
    result['recall'] = result.apply(calculate_recall, axis=1)

    return result['recall'].mean()

In [20]:
recall(predictions,purchases,12)

0.05234812642001266