In [1]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k



In [2]:
# Load the processed customer, article and transaction tables.
# article_id is forced to str in both tables that carry it (the ids printed
# further down in this notebook start with a leading zero).
article_id_as_str = {'article_id': str}

users = pd.read_csv("../../../data/processed_data/customers.csv")
items = pd.read_csv(
    "../../../data/processed_data/articles.csv",
    dtype=article_id_as_str,
)
train = pd.read_csv(
    "../../../data/processed_data/transactions.csv",
    parse_dates=['t_dat'],
    dtype=article_id_as_str,
)

In [3]:
# Vocabulary of user feature names handed to Dataset.fit: one "column:value"
# string for every distinct value of every user attribute column, grouped by
# column in the order listed here.
user_feature_columns = ['Active', 'club_member_status', 'fashion_news_frequency', 'age_group', 'sex', 'baby']
uf = [
    str(column) + ":" + str(value)
    for column in user_feature_columns
    for value in users[column].unique()
]

In [4]:
# Attribute columns of `items` that are turned into item features.
item_features = ["product_code","product_type_no","graphical_appearance_no","colour_group_code","section_no","garment_group_no","season","sex","target_age_groupe"]

# Vocabulary of item feature names handed to Dataset.fit: one "column:value"
# string for every distinct value of every column above, grouped by column.
fi = [
    str(column) + ":" + str(value)
    for column in item_features
    for value in items[column].unique()
]

In [5]:
# Register every customer id, article id and feature name with the LightFM
# Dataset so that it can assign internal indices to them.
dataset = Dataset()
dataset.fit(
    users=users['customer_id'],
    items=items['article_id'],
    user_features=uf,
    item_features=fi,
)

# interactions_shape() gives (number of users, number of items); the message
# below labels the second number "topics".
n_users, n_items = dataset.interactions_shape()
print(f'Number of users: {n_users}, Number of topics: {n_items}.')

Number of users: 1362281, Number of topics: 104547.


In [6]:
def feature_colon_value(features, values):
    """Join feature names and values into ``"name:value"`` strings.

    Names and values are paired positionally and each side is converted with
    ``str``; pairing stops at the end of the shorter input.

    Parameters
    ----------
    features : iterable
        Feature (column) names.
    values : iterable
        Values, in the same order as ``features``.

    Returns
    -------
    list of str
        One ``"name:value"`` string per pair.
    """
    return [f"{name!s}:{value!s}" for name, value in zip(features, values)]

In [7]:
# User attribute columns that are encoded as features.
features = ['Active', 'club_member_status', 'fashion_news_frequency', 'age_group', 'sex', 'baby']

# One list of "column:value" strings per row of `users`, in row order.
feature_list = [
    feature_colon_value(features, list(row))
    for row in users[features].values
]

In [8]:
# One list of "column:value" strings per row of `items`, in row order.
# NOTE(review): `item_features` must still be the list of column names here.
# A later cell rebinds that name to the result of build_item_features, so this
# cell cannot be re-run after it without first re-running the cell that
# defines the column list.
feature_list1 = [
    feature_colon_value(item_features, list(row))
    for row in items[item_features].values
]

In [9]:
# Pair every id with its list of feature strings and let the Dataset turn the
# pairs into feature matrices; normalize=False is passed for both.
user_features = dataset.build_user_features(
    list(zip(users.customer_id, feature_list)),
    normalize=False,
)
# NOTE(review): this rebinds `item_features`, which up to this point held the
# list of item column names, to the matrix returned by build_item_features.
# Re-running the earlier cells that index `items` with it will fail afterwards.
# NOTE(review): neither matrix is passed to model.fit / precision_at_k /
# model.predict further down in this notebook — confirm that is intended.
item_features = dataset.build_item_features(
    list(zip(items.article_id, feature_list1)),
    normalize=False,
)

In [11]:
def _interaction_pairs(frame):
    """Return the 2nd and 3rd columns of `frame` (by position) as an array of rows."""
    # NOTE(review): positional selection — presumably customer_id and
    # article_id; confirm against the column order of transactions.csv.
    return frame.iloc[:, 1:3].values


# Time-based split on t_dat: rows up to and including 2020-9-15 are used for
# training, rows from 2020-9-16 through 2020-9-22 (both bounds inclusive) for
# validation.
in_train_window = train.t_dat <= '2020-9-15'
in_val_window = (train.t_dat >= '2020-9-16') & (train.t_dat <= '2020-9-22')
train_set = train[in_train_window]
val_set = train[in_val_window]

interactions, weights = dataset.build_interactions(_interaction_pairs(train_set))
val_interactions, val_weights = dataset.build_interactions(_interaction_pairs(val_set))
print(interactions.shape, val_interactions.shape)

(1362281, 104547) (1362281, 104547)


In [32]:
# LightFM model trained with the WARP loss, 500 latent components and a fixed
# RandomState seed, for 200 epochs on the training interactions.
# NOTE(review): only `interactions` is passed to fit; the user/item feature
# matrices built earlier in the notebook are not used — confirm this is
# intended.
model = LightFM(
    no_components=500,
    loss='warp',
    learning_rate=0.01,
    random_state=np.random.RandomState(42),
)
model.fit(interactions, epochs=200, num_threads=6, verbose=True)

Epoch: 100%|██████████| 200/200 [14:24:22<00:00, 259.31s/it]  


<lightfm.lightfm.LightFM at 0x14e11dc30>

In [33]:
# precision_at_k returns an array of precision@12 scores for the validation
# interactions; it is averaged into a single number.
precision_scores = precision_at_k(model, val_interactions, k=12)
val_precision = precision_scores.mean()

print(val_precision)

0.0045179557


In [None]:
import pickle

# Persist the trained model. The with-block guarantees the file handle is
# closed (and the pickle fully flushed to disk) even if pickling raises.
with open('../weights/lightfm4.pth', 'wb') as fh:
    pickle.dump(model, fh)

In [26]:
import pickle

# Restore a pickled model from disk, replacing whatever `model` is currently
# bound to.
# NOTE(review): this reads lightfm3.pth, whereas the save cell in this notebook
# writes lightfm4.pth. Run top to bottom, this cell would replace the freshly
# trained model before predictions are made — confirm which weights are meant.
# pickle.load can execute arbitrary code; only load files you trust.
with open('../weights/lightfm3.pth', mode="rb") as weights_file:
    model = pickle.load(weights_file)

In [14]:
# dataset.mapping() returns four dicts, unpacked here in order: user ids,
# user features, item ids, item features.
uid_map, ufeature_map, iid_map, ifeature_map = dataset.mapping()

# Reverse lookups (mapped value -> original key) for users and items.
inv_uid_map = dict(zip(uid_map.values(), uid_map.keys()))
inv_iid_map = dict(zip(iid_map.values(), iid_map.keys()))


def lfn_user(x):
    """Return the entry of `uid_map` stored under customer id `x`."""
    return uid_map[x]


# Mapped index of every distinct customer id in `users`, in order of appearance.
test_X_m = [lfn_user(customer_id) for customer_id in users.customer_id.unique()]

In [34]:
import tqdm

# Item indices in the order they are scored. Built once, outside the loop:
# the candidate items are the same for every user.
all_item_ids = np.fromiter(iid_map.values(), dtype=np.int32, count=len(iid_map))

# customer id -> space-separated string of the 12 highest-scoring article ids.
preds = {}
for usr_ in tqdm.tqdm(test_X_m, total=len(test_X_m)):
    # Score every item for this user. np.full builds the repeated user index
    # directly as an int32 array instead of going through a Python list of
    # len(iid_map) elements on every iteration.
    m_opt = model.predict(np.full(all_item_ids.shape[0], usr_, dtype=np.int32), all_item_ids)
    # argsort yields positions into all_item_ids; translate them back to item
    # indices so the lookup stays correct whatever order iid_map.values() has.
    pred = all_item_ids[np.argsort(-m_opt)[:12]]
    preds[inv_uid_map[usr_]] = ' '.join([inv_iid_map[p] for p in pred]).strip()

100%|██████████| 1362281/1362281 [13:34:18<00:00, 27.88it/s]  


In [35]:
sub = pd.read_csv('../../../data/sample_submission.csv')
# Replace the sample prediction for every customer that has an entry in
# `preds`. Customers without one keep the prediction already present in the
# sample file, so the column never ends up holding a customer_id.
sub['prediction'] = sub['customer_id'].map(preds).fillna(sub['prediction'])
sub.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601006 0841260003 0568597006 0656719005 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0599580024 0599580049 0590928022 0811835004 03...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0351484002 0723529001 0663713001 0859139002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0732413001 0742079001 0730683001 0757303012 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0399061015 0698286003 0707704003 0692721005 07...


In [36]:
# Write the submission frame to CSV without the DataFrame index column.
submission_path = '../../../data/submissions/lightfm_submission4.csv'
sub.to_csv(submission_path, index=False)