In [1]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import json
import os

os.chdir('../../../data/')

In [15]:
articles_cols = [
    'article_id', 
    'product_code',
    'product_type_no', 
    'graphical_appearance_no', 
    'colour_group_code', 
    'perceived_colour_value_id',
    'perceived_colour_master_id', 
    'department_no', 
    'index_code',
    'index_group_no', 
    'section_no', 
    'garment_group_no'
    ]

In [16]:
# Read compressed data
transactions = pd.read_pickle('compressed_data/transactions_train.pkl')
customers = pd.read_pickle('compressed_data/customers.pkl')
articles = pd.read_pickle('compressed_data/articles.pkl')[articles_cols]

# Calculate week, where 0 is first week of data and 104 is last week of data
transactions['week'] = 104 - (transactions.t_dat.max() - transactions.t_dat).dt.days // 7

print('First week num: ', transactions.week.min(), '\nLast week num: ', transactions.week.max(), '\n')

avg_age = np.mean(customers['age'].astype('float32'))
customers['age'].fillna(avg_age.astype(np.float16), inplace=True)

First week num:  0 
Last week num:  104 



In [17]:
test_week = transactions.week.max() + 1

train_weeks = range(test_week - 10, test_week)

transactions_train = transactions[transactions.week.isin(train_weeks)]
transaction_test = transactions[transactions.week == test_week]

## Radek's candidates

In [18]:
radek_transactions = transactions[transactions.week > transactions.week.max() - 10]

c2weeks = radek_transactions.groupby('customer_id')['week'].unique()

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

candidates_last_purchase = radek_transactions.copy()

weeks = []
for i, (c_id, week) in enumerate(zip(radek_transactions['customer_id'], radek_transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

# bestseller
mean_price = radek_transactions.groupby(['week', 'article_id'])['price'].mean()
sales = radek_transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

unique_transactions = radek_transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

test_set_transactions = unique_transactions\
    .drop_duplicates('customer_id')\
    .reset_index(drop=True)

test_set_transactions.week = test_week

candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

# Concatenate data with test data
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [19]:
candidates_bestsellers.to_csv('candidates/radek_bestsellers.csv', index=False)
bestsellers_previous_week.to_csv('candidates/radek_bestsellers_previous_week.csv', index=False)

In [20]:
candidates_last_purchase.to_csv('candidates/radek_last_purchase.csv', index=False)

## Seasonal candidates -- items bought in previous years during similar period to test week (mid september)

In [35]:
seasonal_trans = transactions[(transactions.t_dat.dt.month == 9) & (transactions.t_dat.dt.year.isin([2019, 2018]))]
seasonal_trans['year'] = transactions['t_dat'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seasonal_trans['year'] = transactions['t_dat'].dt.year


In [52]:
seasonal_candidates = seasonal_trans.groupby(['customer_id', 'year'])['article_id'].value_counts()\
    .groupby(['customer_id', 'year']).rank(method='dense', ascending=False) \
    .groupby(['customer_id', 'year']).head(12)\
    .reset_index()\
    .drop(columns=['count'])

seasonal_candidates.to_csv('candidates/baskets_seasonal.csv')

seasonal_candidates.sample(5)

Unnamed: 0,customer_id,year,article_id
966940,882764,2019,782451007
627637,573121,2018,597763001
696181,636208,2019,532578028
478954,436933,2019,806617001
1299094,1187924,2019,728162002


In [33]:
best_seasonal_items = seasonal_trans.groupby(['year'])['article_id'].value_counts()\
    .groupby('year').rank(method='dense', ascending=False)\
    .groupby('year').head(12).rename('seasonal_rank').astype('int8')\
    .reset_index()

best_seasonal_items.to_csv('candidates/best_seasonal.csv')

best_seasonal_items

Unnamed: 0,year,article_id,seasonal_rank
0,2018,539723005,1
1,2018,685687003,2
2,2018,685687002,3
3,2018,685687004,4
4,2018,685687001,5
5,2018,399223001,6
6,2018,573716012,7
7,2018,692454002,8
8,2018,562245001,9
9,2018,610776002,10


## Diversity based -- select items from categories user has not interacted with


In [5]:
# A function to create candidates based on the most popular items in the given category
def not_interacted_with_candidates(t, a, articles_col, k=2):

    # Get unique values of given category
    group_unique_values = a[articles_col].unique()
    group_df = pd.merge(t, a[['article_id', articles_col]])

    # Get k most popular articles in given category
    popular_by_group = group_df.groupby(articles_col)['article_id'].value_counts()\
        .groupby(articles_col).head(k).reset_index()
    popular_by_group = popular_by_group[['article_id', articles_col]]

    # Not interacted category for each customer
    not_interacted_with = group_df.groupby('customer_id')[articles_col].unique()\
        .apply(lambda x: np.setdiff1d(group_unique_values, x))\
        .explode().reset_index()
    
    # Join to create recommendation based on lack of interaction
    candidates = pd.merge(not_interacted_with, popular_by_group, on=articles_col)

    return candidates[['customer_id', 'article_id']]

# Item similarities
 
Run this only once, because it takes a long time to calculate similarity loop as there is a lot of data.

In [78]:
# Normalize data for cosine similarities
df = articles.set_index('article_id')
df = df.drop(columns=['index_code'])
df = (df - df.mean()) / df.std()

In [155]:
# # # For each item, get top 5 most similar items
# # # Only 5, because full matrix was too much data for kernel to handle
# X = df.to_numpy()
# articles_arr = df.index.values
# sims = {}
# for i, row in tqdm(zip(articles_arr, X)):
#     top_n_sim = cosine_similarity(row.reshape(1, -1), X).argsort()[:, -6:-1]
#     article_ids = articles_arr[top_n_sim].reshape(-1)
#     sims[i] = article_ids

0it [00:00, ?it/s]

105542it [32:09, 54.71it/s]


In [180]:
# sims = {int(i):[int(x) for x in v] for i,v in sims.items()}
# with open('../../../data/item_similarities.json', 'w') as f:
#     json.dump(sims, f)

In [181]:
with open('item_similarities.json', 'r') as f:
    sims = json.load(f)

## Items similar to purchased but not actually purchased

In [206]:
purchased_by_user = transactions_train.groupby('customer_id')['article_id'].apply(list)

In [190]:
result = []
for cid, bought in tqdm(purchased_by_user.items()):
    similar_to_bought = []
    for bought_item in bought:
        similar_to_bought += sims[str(bought_item)]
    sim_not_bought = [i for i in similar_to_bought if i not in bought][:10]
    result.append(sim_not_bought)

439368it [00:03, 124024.79it/s]


In [204]:
sim_not_bought = pd.DataFrame({
    'customer_id':purchased_by_user.index.values,
    'sim_not_bought':result
}).explode('sim_not_bought')
sim_not_bought

Unnamed: 0,customer_id,sim_not_bought
0,0,568808003
0,0,560183001
0,0,578487015
0,0,565668003
0,0,565788005
...,...,...
439367,1371977,851108003
439367,1371977,834021003
439367,1371977,824764009
439367,1371977,824764007


In [205]:
sim_not_bought.to_csv('candidates/similar_not_bought.csv')

## User based collaborative filtering
Did not work due to the size of data -- tried many combinations but all of them were crashing as it was too much to handle.

In [207]:
# agg_articles = transactions_train.article_id.value_counts().reset_index()
# agg_articles_top = agg_articles[agg_articles['count']>=2000]
# agg_articles_top

In [208]:
# transactions_train_top = transactions_train[transactions_train.article_id.isin(agg_articles_top.article_id)]

In [209]:
# matrix = transactions_train_top\
#     .groupby(['customer_id', 'article_id']).agg(num_purchased=('article_id', 'count'))\
#     .reset_index()\
#     .pivot_table(index='customer_id', columns='article_id', values='num_purchased')\
#     .fillna(0)
# matrix

In [210]:
# import operator

# def similar_users(user_id, matrix, k=10):
#     # create a df of just the current user
#     user = matrix[matrix.index == user_id]
    
#     # and a df of all other users
#     other_users = matrix[matrix.index != user_id]
    
#     # calc cosine similarity between user and each other user
#     similarities = cosine_similarity(user,other_users)[0].tolist()
    
#     # create list of indices of these users
#     indices = other_users.index.tolist()
    
#     # create key/values pairs of user index and their similarity
#     index_similarity = dict(zip(indices, similarities))
    
#     # sort by similarity
#     index_similarity_sorted = sorted(index_similarity.items(), key=operator.itemgetter(1))
#     index_similarity_sorted.reverse()
    
#     # grab k users off the top
#     top_users_similarities = index_similarity_sorted[:k]
#     users = [u[0] for u in top_users_similarities]
    
#     return users

# def recommend_item(user_index, similar_user_indices, matrix, items=5):
    
#     # load vectors for similar users
#     similar_users = matrix[matrix.index.isin(similar_user_indices)]
#     # calc avg ratings across the 3 similar users
#     similar_users = similar_users.sum(axis=0)
#     # convert to dataframe so its easy to sort and filter
#     similar_users_df = pd.DataFrame(similar_users, columns=['mean'])
    
    
#     # load vector for the current user
#     user_df = matrix[matrix.index == user_index]
#     # transpose it so its easier to filter
#     user_df_transposed = user_df.transpose()
#     # rename the column as 'rating'
#     user_df_transposed.columns = ['rating']
#     # remove any rows without a 0 value. item not watched yet
#     user_df_transposed = user_df_transposed[user_df_transposed['rating']==0]
#     # generate a list of items the user has not seen
#     items_unseen = user_df_transposed.index.tolist()
    
#     # filter avg ratings of similar users for only item the current user has not seen
#     similar_users_df_filtered = similar_users_df[similar_users_df.index.isin(items_unseen)]
#     # order the dataframe
#     similar_users_df_ordered = similar_users_df_filtered.sort_values(by=['mean'], ascending=False)
#     # grab the top n item   
#     top_n_item = similar_users_df_ordered.head(items)
#     top_n_item_indices = top_n_item.index.tolist()
#     # lookup these item in the other dataframe to find names
#     item_information = articles[articles['article_id'].isin(top_n_item_indices)]
    
#     return top_n_item_indices #items
    
# user_candidates = {}
# for user in tqdm(transactions_train_top.customer_id.unique()):
#     user_candidates[user] = recommend_item(4, similar_users(4, matrix), matrix, items=10)



In [211]:
# user_candidates = {int(k):v for k, v in user_candidates.items()}

In [212]:
# import json
# with open('../../../data/candidates/user_based_cf_candidates_v1.json', 'w') as f:
#     json.dump(user_candidates, f)