# Recurrent Neural Network based Recommender System



## Data Loading

In [1]:
import numpy as np 
import pandas as pd

In [9]:

# Read the raw transaction log (path is relative to the notebook's working directory)
# and preview the first rows.
df_train = pd.read_csv(filepath_or_buffer='./data/transactions.csv')
df_train.head(n=5)

Unnamed: 0,element_uid,user_uid,consumption_mode,ts,watched_time,device_type,device_manufacturer
0,3336,5177,S,44305180.0,4282,0,50
1,481,593316,S,44305180.0,2989,0,11
2,4128,262355,S,44305180.0,833,0,50
3,6272,74296,S,44305180.0,2530,0,99
4,5543,340623,P,44305180.0,6282,0,50


In [11]:
# Order transactions chronologically; the per-user sequences built later rely on row order.
df_train = df_train.sort_values(by='ts')
df_train = df_train.reset_index(drop=True)

In [23]:
# Preview the first rows of the time-sorted frame.
df_train.head(n=5)

Unnamed: 0,element_uid,user_uid,consumption_mode,ts,watched_time,device_type,device_manufacturer,categ_id
0,6189,283774,S,41730630.0,19586,0,11,5022
1,6099,59148,S,41730630.0,6831,0,50,4947
2,8888,50431,S,41730630.0,5763,0,11,7222
3,8436,458827,S,41730630.0,8360,0,50,6848
4,2252,180823,S,41730630.0,2503,0,11,1808


In [13]:
import os
import json

# Directory holding test_users.json.
# NOTE(review): the transactions CSV is read from './data/' — confirm both locations are intended.
DATA_PATH = '.'
test_users_path = os.path.join(DATA_PATH, 'test_users.json')
with open(test_users_path) as users_file:
    # Keep the ids as a set: they are used for membership tests and intersections later.
    test_users = set(json.load(users_file)['users'])

## Data Preprocessing

In [15]:
# The 20 most frequent element_uids, used as the fallback recommendation list.
top_20 = df_train['element_uid'].value_counts().iloc[:20].index.tolist()

In [16]:
# Encode element_uid as a categorical and derive a dense 1-based id from its codes;
# id 0 stays free so it can serve as the padding value for the sequences.
df_train['element_uid'] = df_train['element_uid'].astype('category')
df_train['categ_id'] = df_train['element_uid'].cat.codes + 1

In [24]:
# Preview the frame with the new categ_id column.
df_train.head(n=5)

Unnamed: 0,element_uid,user_uid,consumption_mode,ts,watched_time,device_type,device_manufacturer,categ_id
0,6189,283774,S,41730630.0,19586,0,11,5022
1,6099,59148,S,41730630.0,6831,0,50,4947
2,8888,50431,S,41730630.0,5763,0,11,7222
3,8436,458827,S,41730630.0,8360,0,50,6848
4,2252,180823,S,41730630.0,2503,0,11,1808


In [25]:
# Forward map: raw element_uid -> 1-based model id (same numbering as categ_id).
element_uid_to_cat = {
    uid: code
    for code, uid in enumerate(df_train.element_uid.cat.categories, start=1)
}

# Inverse map: 1-based model id -> raw element_uid.
cat_to_element_uid = dict(enumerate(df_train.element_uid.cat.categories, start=1))

# Id 0 is the padding value; if the model ever ranks it, translate it to a real film.
# NOTE(review): 2714 is hard-coded as "the most popular film" — confirm it matches top_20[0].
cat_to_element_uid[0] = 2714

In [26]:
import tqdm
from collections import defaultdict

# For every test user, collect the set of element_uids that appear in their transactions.
filtered_elements = defaultdict(set)

for user_uid, element_uid in tqdm.tqdm(df_train[['user_uid', 'element_uid']].values):
    if user_uid in test_users:
        filtered_elements[user_uid].add(element_uid)

100%|██████████| 9643012/9643012 [00:13<00:00, 716007.56it/s]


In [27]:
# Same per-user collections, translated from raw element_uid to model ids.
# An element_uid missing from the map would become None.
filtered_elements_cat = {
    user_uid: [element_uid_to_cat.get(uid) for uid in seen]
    for user_uid, seen in filtered_elements.items()
}

In [30]:

# Register tqdm's progress_apply on pandas objects.
tqdm.tqdm.pandas()

# One list of model ids per user, in the row order of df_train.
purchases = (
    df_train
    .groupby('user_uid')['categ_id']
    .progress_apply(list)
)

100%|██████████| 499663/499663 [00:48<00:00, 10249.28it/s]


In [31]:
# Preview the first per-user sequences.
purchases.head(n=5)

user_uid
0    [4973, 6631, 1046, 1249, 378, 5862, 6645, 1604...
1    [1348, 3342, 1802, 6687, 5687, 4975, 4406, 563...
2                              [617, 7127, 1535, 7480]
3    [6904, 7127, 7113, 1125, 3818, 3407, 7562, 362...
5                  [668, 4206, 2181, 1272, 2730, 1547]
Name: categ_id, dtype: object

In [32]:
# Summary statistics of the per-user sequence length (computed once, reported three ways)
history_lengths = purchases.apply(len)
print('Median: {}\nMean: {}\nMax: {}'.format(
    history_lengths.median(), history_lengths.mean(), history_lengths.max()))

Median: 10.0
Mean: 19.299031547262857
Max: 1156


In [33]:
# Train only on users with 5 or more watched films
purchases2use = purchases.loc[purchases.apply(len) >= 5]

In [36]:
# Sequence length for the user with user_uid 1 (label lookup on the user_uid index).
len(purchases2use.loc[1])

20

In [38]:
# Scratch run of the slicing logic on a single user.
maxlen = 18
X, y = [], []

def slice_purchase(seq, num_slices):
    """Append (context, target) training pairs cut from the tail of ``seq``.

    For each offset 1 .. num_slices - 1 the target is the offset-th item from the
    end and the context is the (at most ``maxlen``) items immediately before it.
    Results are appended to the module-level lists ``X`` and ``y``.
    """
    for offset in range(1, num_slices):
        window_end = -offset
        X.append(seq[window_end - maxlen: window_end])
        y.append(seq[window_end])

slice_purchase(purchases2use[1], 11)

In [None]:
# Last 18 items of the user with user_uid 1, for comparison with the slices above.
purchases2use.loc[1][-18:]

In [14]:
# Maximum number of items kept in one context window of X.
# NOTE(review): the recorded outputs below report a width of 18, which suggests the
# cell was last executed with maxlen = 18 — confirm which value is intended.
maxlen = 40 # Length of purchases in X
X = []
y = []

def slice_purchase(seq, num_slices):
    """Append (context, target) training pairs cut from the tail of ``seq``.

    For each offset ``i`` in 1 .. num_slices - 1 the target is ``seq[-i]`` and the
    context is the (at most ``maxlen``) items immediately before it. Pairs are
    appended to the module-level lists ``X`` and ``y``.

    Offsets are clamped to ``len(seq) - 1`` so every context holds at least one
    item; without the clamp an oversized ``num_slices`` first yields an empty
    context and then raises IndexError on ``seq[-i]``.

    Args:
        seq: ordered list of item ids for one user.
        num_slices: exclusive upper bound on the offset, i.e. at most
            ``num_slices - 1`` pairs are produced.
    """
    stop = min(num_slices, len(seq))
    for i in range(1, stop):
        # A start index before the beginning of seq is clamped by slicing,
        # so short histories simply give shorter contexts.
        X.append(seq[-(i+maxlen): -i])
        y.append(seq[-i])
        
# Longer histories contribute more training pairs. Each (bound, count) entry means:
# sequences of length <= bound are sliced with num_slices = count; the first matching
# bound wins and anything longer than the last bound uses 23.
for seq in tqdm.tqdm(purchases2use):
    history_len = len(seq)
    num_slices = next(
        (
            count
            for bound, count in ((5, 2), (6, 3), (8, 4), (12, 6), (16, 8), (20, 11), (26, 16))
            if history_len <= bound
        ),
        23,
    )
    slice_purchase(seq, num_slices)

100%|██████████| 396121/396121 [00:10<00:00, 36590.30it/s]


In [15]:
# Number of contexts and targets; the two must match.
(len(X), len(y))

(3932916, 3932916)

In [16]:
# Distribution of context lengths before padding.
lens = list(map(len, X))
(max(lens), min(lens), np.mean(lens), np.median(lens))

(18, 4, 14.428601577048683, 18.0)

In [17]:
from keras.preprocessing.sequence import pad_sequences

# Zero-pad every context to maxlen so the batch is rectangular. Padding and truncation
# are spelled out as 'pre' (the Keras defaults), matching the left-padding with zeros
# that the prediction section applies by hand.
X = pad_sequences(X, maxlen=maxlen, padding='pre', truncating='pre')
y = np.array(y)
(X.shape, y.shape)

Using TensorFlow backend.


((3932916, 18), (3932916,))

## Let's define the model architecture

In [18]:
from keras.layers import Input, Embedding, SpatialDropout1D, CuDNNLSTM, Dropout, Dense
from keras.models import Model

# Let's set random seed
import tensorflow as tf
# Seed NumPy and TensorFlow with the same fixed value.
np.random.seed(42)
tf.set_random_seed(42)

In [19]:
# Number of distinct model ids, plus one for the padding id 0.
df_train['categ_id'].nunique() + 1

8297

In [20]:
# Vocabulary size: distinct model ids plus one slot for the padding id 0.
max_features = df_train.categ_id.unique().size + 1
# Dimensionality of the learned item embeddings.
embed_size = 64

def lstm128():
    """Build and compile the next-item prediction network.

    Architecture: integer sequence of length ``maxlen`` -> Embedding(max_features,
    embed_size) -> SpatialDropout1D(0.05) -> CuDNNLSTM(128) -> Dropout(0.02) ->
    Dense(max_features, softmax).

    Returns:
        A compiled Keras ``Model`` using sparse categorical cross-entropy loss,
        the rmsprop optimizer and sparse categorical accuracy as metric.
    """
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size)(inp)
    x = SpatialDropout1D(0.05)(x)
    # Only the final hidden state is needed to predict the next item.
    # The keyword is `return_sequences`; the previous `return_purchases` is not a
    # Keras argument and made layer construction fail.
    x = CuDNNLSTM(128, return_sequences=False)(x)
    x = Dropout(0.02)(x)
    # One probability per model id (including the padding id 0).
    outp = Dense(max_features, activation="softmax")(x)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop',
                  metrics=['sparse_categorical_accuracy'])
    return model

In [None]:
# Train the film recommender; 1% of the samples are held out for validation.
model = lstm128()
model.fit(
    x=X,
    y=y,
    batch_size=8192,
    epochs=25,
    verbose=True,
    validation_split=0.01,
    shuffle=True,
)

## Prediction

In [23]:
# Keep each user's last maxlen items and left-pad shorter histories with the padding id 0,
# so every row has exactly maxlen entries.
purchases_test = purchases.apply(
    lambda history: [0] * max(maxlen - len(history), 0) + history[-maxlen:]
)

In [24]:
# Test users that have at least one transaction, in a deterministic (sorted) order.
test_users_in_purchases = sorted(test_users.intersection(purchases_test.index))

In [25]:
# Stack the padded histories of those users into one array, one row per user.
X_test = np.array(purchases_test.loc[test_users_in_purchases].tolist())

In [None]:
result = {}

for user_uid, x in tqdm.tqdm(zip(test_users_in_purchases, X_test), total=len(test_users_in_purchases)):
    # Score every model id for this single user (batch of one).
    scores = model.predict(x[np.newaxis, :])[0]
    # Zero out the items this user already has in the transaction log.
    scores[filtered_elements_cat[user_uid]] = 0
    # Ids of the 20 highest remaining scores, best first.
    best_cats = np.argsort(-scores)[:20]

    # Translate model ids back to element_uid and cast to plain int so the
    # result can be JSON-serialized later.
    result[user_uid] = [int(cat_to_element_uid[cat]) for cat in best_cats]

In [None]:
# Test users without a model prediction fall back to the globally most popular items.
for test_user in test_users:
    result.setdefault(test_user, top_20)

In [None]:
# Persist the recommendations and show how many users were answered.
with open('answer.json', mode='w') as answer_file:
    json.dump(result, fp=answer_file)

len(result)