In [43]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import json
import warnings
import scipy.sparse as sp
from collections import defaultdict
import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings('ignore')

In [70]:
train = pd.read_pickle('./data/joins.pkl')

In [72]:
tops_ts = train[~train.ts.isna()].element_uid.values
tops = train.element_uid.value_counts()
top_down = tops[tops < 100].index.values.tolist()
top_down = list(set(top_down) & set(tops_ts))

In [73]:
(train.ts.max() - train.ts.min()) / 858183.3333333334

2.9999977657440606

In [106]:
len(top_down)

2940

In [76]:
with open('./data/test_users.json', 'r') as f:
    test_users = set(json.load(f)['users'])

In [77]:
%%time
transactions = pd.read_csv('./data/transactions.csv',
    dtype={
        'element_uid': np.uint16,
        'user_uid': np.uint32,
        'consumption_mode': 'category',
        'ts': np.float64,
        'watched_time': np.uint64,
        'device_type': np.uint8,
        'device_manufacturer': np.uint8
    }
)

Wall time: 59.9 s


In [110]:
len(test_users)

50000

In [109]:
train[train.user_uid.isin(test_users)].user_uid.nunique()

50000

In [112]:
train.target.value_counts()

1.0    5849491
0.0     494453
Name: target, dtype: int64

In [134]:
train = train[(train.target == 1) | train.user_uid.isin(test_users)]


In [135]:
interaction = (~train.target.isna() + train.target.fillna(0)) \
+ (~train.is_bookmarks.isna() + train.is_bookmarks.fillna(0)) \
+ (~train.rating.isna())

In [136]:
filtered_elements = defaultdict(set)

for user_uid, element_uid in tqdm.tqdm(transactions.loc[:, ['user_uid', 'element_uid']].values):
    if user_uid not in test_users:
        continue
    filtered_elements[user_uid].add(element_uid)

100%|██████████| 9643012/9643012 [00:17<00:00, 546855.46it/s]


In [137]:
train['user_uid'] = train['user_uid'].astype('category')
train['element_uid'] = train['element_uid'].astype('category')

ratings_matrix = sp.coo_matrix(
    (interaction.values.astype(np.float32),
        (
            train['element_uid'].cat.codes.copy(),
            train['user_uid'].cat.codes.copy()
        )
    )
).tocsr()

In [143]:
ratings_matrix.nnz

1461936

In [144]:
sparsity = ratings_matrix.nnz / (ratings_matrix.shape[0] * ratings_matrix.shape[1])
print('Sparsity: %.6f' % sparsity)

Sparsity: 0.000340


In [145]:
from implicit.nearest_neighbours import TFIDFRecommender

model = TFIDFRecommender(1000)
model.fit(ratings_matrix)

100%|██████████| 8942/8942 [00:00<00:00, 22011.17it/s]


In [146]:
ratings_matrix_T = ratings_matrix.T.tocsr()

In [196]:
s = pd.Series([4128, 3757, 4154, 1227, 1911, 549, 4082, 9341, 3791, 5003])

s.astype('category').cat.categories


Int64Index([549, 1227, 1911, 3757, 3791, 4082, 4128, 4154, 5003, 9341], dtype='int64')

In [199]:
user_uid_to_cat = dict(zip(
    train['user_uid'].cat.categories,
    range(len(train['user_uid'].cat.categories))
))

In [205]:
len(set(user_uid_to_cat.keys()))

480696

In [207]:
range(len(train['element_uid'].cat.categories))

range(0, 8942)

In [148]:
element_uid_to_cat = dict(zip(
    train['element_uid'].cat.categories,
    range(len(train['element_uid'].cat.categories))
))

In [149]:
filtered_elements_cat = {k: [element_uid_to_cat.get(x, None) for x in v] for k, v in filtered_elements.items()}

In [150]:
ratings_matrix.shape

(8942, 480668)

In [151]:
result = {}

for user_uid in tqdm.tqdm_notebook(test_users):
    # transform user_uid to model's internal user category
    try:
        user_cat = user_uid_to_cat[user_uid]
    except LookupError:
        continue
    
    # perform inference
    recs = model.recommend(
        user_cat,
        ratings_matrix_T,
        N=100,
        filter_already_liked_items=True,
        filter_items=filtered_elements_cat.get(user_uid, set())
    )
    
    # drop scores and transform model's internal elelemnt category to element_uid for every prediction
    # also convert np.uint64 to int so it could be json serialized later
    user_res = [int(train['element_uid'].cat.categories[i]) for i, _ in recs]
    
    user_res_set = set(user_res)
    for movie in top_down:
        if movie in user_res_set:
            user_res.remove(movie)
            user_res_set.remove(movie)
            
    result[user_uid] = user_res[:20]

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))




In [152]:
(np.array(list(map(len, result.values())))).min()

8

In [153]:
with open('data/interaction_result.json', 'w') as f:
    json.dump(result, f)

In [131]:
len(result)

50000

In [132]:
len(test_users)

50000