<a href="https://colab.research.google.com/github/MariaZharova/test_rec_systems/blob/main/Our_microsoft_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import re
import sys
import os
import scrapbook as sb
from tempfile import TemporaryDirectory
import numpy as np
import pandas as pd 

from collections import defaultdict
import category_encoders as ce
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.datasets.amazon_reviews import get_review_data
from recommenders.datasets.split_utils import filter_k_core

# Transformer Based Models
from recommenders.models.sasrec.model import SASREC
from recommenders.models.sasrec.ssept import SSEPT

# Sampler for sequential prediction
from recommenders.models.sasrec.sampler import WarpSampler
from recommenders.models.sasrec.util import SASRecDataSet

# Evaluation
from recommenders.evaluation.python_evaluation import precision_at_k

In [2]:
! pip install scrapbook category_encoders recommenders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scrapbook
  Downloading scrapbook-0.5.0-py3-none-any.whl (34 kB)
Collecting category_encoders
  Downloading category_encoders-2.5.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 4.9 MB/s 
[?25hCollecting recommenders
  Downloading recommenders-1.1.0-py3-none-manylinux1_x86_64.whl (335 kB)
[K     |████████████████████████████████| 335 kB 53.7 MB/s 
Collecting papermill
  Downloading papermill-2.3.4-py3-none-any.whl (37 kB)
Collecting pandera[strategies]>=0.6.5
  Downloading pandera-0.9.0-py3-none-any.whl (197 kB)
[K     |████████████████████████████████| 197 kB 64.6 MB/s 
Collecting lightfm<2,>=1.15
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 54.0 MB/s 
Collecting pyyaml<6,>=5.4.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 k

In [13]:
my_data = pd.read_csv('internship_clickstream_data.csv')
print(my_data.shape) # (7458216, 8)
my_data = my_data.iloc[:20000]
print(my_data.shape)
my_data.head()

(1457271, 8)
(20000, 8)


Unnamed: 0,timestamp,hit_id,uid,platform,event_name,screen,offer_id,ptn_dadd
0,2022-06-29 01:04:03,4b45e714d01842a7,16650505,ios,OpenOfferScreen,SearchResultsList,274266785.0,2022-06-29
1,2022-06-29 01:06:10,e688e3349b35430f,92346837,android,OpenOfferScreen,MapScreen,270671363.0,2022-06-29
2,2022-06-29 01:08:48,97c52e7a2e574f44,0bf81f10-ee3a-4543-a9ee-2bd12b4e4ec6,android,OpenOfferScreen,Undefined,272968099.0,2022-06-29
3,2022-06-29 01:09:39,d52e99bc7f7f4db8,84081279,ios,OpenOfferScreen,SearchResultsList,268313499.0,2022-06-29
4,2022-06-29 01:12:50,d49bf3352f64401c,0bf81f10-ee3a-4543-a9ee-2bd12b4e4ec6,android,OpenOfferScreen,Undefined,255933042.0,2022-06-29


In [15]:
my_data.dropna(inplace=True)
my_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19604 entries, 0 to 19999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   timestamp   19604 non-null  object 
 1   hit_id      19604 non-null  object 
 2   uid         19604 non-null  object 
 3   platform    19604 non-null  object 
 4   event_name  19604 non-null  object 
 5   screen      19604 non-null  object 
 6   offer_id    19604 non-null  float64
 7   ptn_dadd    19604 non-null  object 
dtypes: float64(1), object(7)
memory usage: 1.3+ MB


In [16]:
# encode, start with 1
offer_encoder = {off: ind for ind, off in enumerate(my_data['offer_id'].unique())}
my_data['offer_id_enc'] = my_data['offer_id'].map(offer_encoder) + 1
uid_encoder = {uid: ind for ind, uid in enumerate(my_data['uid'].unique())}
my_data['uid_enc'] = my_data['uid'].map(uid_encoder) + 1

# sort by user id and time iteraction
my_data['timestamp'] = pd.to_datetime(my_data['timestamp'])
my_data.sort_values(by=['uid_enc', 'timestamp'], inplace=True)
my_data.head(10)

Unnamed: 0,timestamp,hit_id,uid,platform,event_name,screen,offer_id,ptn_dadd,offer_id_enc,uid_enc
6509,2022-06-28 04:53:23,0c2b184e62f04368,16650505,ios,OpenOfferScreen,MapScreen,249093727.0,2022-06-28,6070,1
6513,2022-06-28 04:54:23,289db0a4dfc344bf,16650505,ios,OpenOfferScreen,MapScreen,274743850.0,2022-06-28,6074,1
6532,2022-06-28 04:59:12,21df94e71e4b4df7,16650505,ios,OpenOfferScreen,MapScreen,264617112.0,2022-06-28,6093,1
0,2022-06-29 01:04:03,4b45e714d01842a7,16650505,ios,OpenOfferScreen,SearchResultsList,274266785.0,2022-06-29,1,1
1,2022-06-29 01:06:10,e688e3349b35430f,92346837,android,OpenOfferScreen,MapScreen,270671363.0,2022-06-29,2,2
573,2022-06-30 04:02:29,cce4c352289c4ab1,92346837,android,OpenOfferScreen,MapScreen,274001335.0,2022-06-30,551,2
2,2022-06-29 01:08:48,97c52e7a2e574f44,0bf81f10-ee3a-4543-a9ee-2bd12b4e4ec6,android,OpenOfferScreen,Undefined,272968099.0,2022-06-29,3,3
4,2022-06-29 01:12:50,d49bf3352f64401c,0bf81f10-ee3a-4543-a9ee-2bd12b4e4ec6,android,OpenOfferScreen,Undefined,255933042.0,2022-06-29,5,3
3,2022-06-29 01:09:39,d52e99bc7f7f4db8,84081279,ios,OpenOfferScreen,SearchResultsList,268313499.0,2022-06-29,4,4
5,2022-06-29 01:19:44,c5d68cf595514deb,045d4ebe-c599-4db0-9923-3b0eee03eaa8,android,OpenOfferScreen,NewBuildingOffersList,269300331.0,2022-06-29,6,5


In [17]:
# create .txt file for input to model
my_data[['offer_id_enc',	'uid_enc']].to_csv('out.txt', sep="\t", header=False, index=False)

In [18]:
# create specia; data format for SAS
data = SASRecDataSet(filename='out.txt', col_sep='\t')
# split into train, test and validation
data.split()

In [19]:
# model variables
num_epochs = 5
batch_size = 128
RANDOM_SEED = 100  # Set None for non-deterministic result

lr = 0.001             # learning rate
maxlen = 50            # maximum sequence length for each user
num_blocks = 2         # number of transformer blocks
hidden_units = 100     # number of units in the attention calculation
num_heads = 1          # number of attention heads
dropout_rate = 0.1     # dropout rate
l2_emb = 0.0           # L2 regularization coefficient
num_neg_test = 100     # number of negative examples per positive example
model_name = 'ssept'  # 'sasrec' or 'ssept'


In [20]:
# sample negative examples
sampler = WarpSampler(data.user_train, data.usernum, data.itemnum, batch_size=batch_size, maxlen=maxlen, n_workers=3)

In [21]:
model = SASREC(item_num=data.itemnum,
               seq_max_len=maxlen,
               num_blocks=num_blocks,
               embedding_dim=hidden_units,
               attention_dim=hidden_units,
               attention_num_heads=num_heads,
               dropout_rate=dropout_rate,
               conv_dims = [100, 100],
               l2_reg=l2_emb,
               num_neg_test=num_neg_test)

In [22]:
with Timer() as train_time:
    t_test = model.train(data, sampler, num_epochs=1, batch_size=200, lr=lr, val_epoch=6)




epoch: 1, test (NDCG@10: 0.675666701962054, HR@10: 0.7068965517241379)


In [25]:
t_test # встроенно оценивает ndcg@10 (Normalized discounted cumulative gain) и Hit@10

(0.675666701962054, 0.7068965517241379)

In [71]:
print(data.usernum)
print(model.num_neg_test)

17980
100


In [81]:
import random
from tqdm import tqdm

def get_predictions(data):
    """
        Модифицированный метод evaluation класса SASREC,
        главный результат - получаем предсказания для всех пользователей из датасета
    """
    usernum = data.usernum # max № of user
    itemnum = data.itemnum # max № of item
    train = data.user_train
    valid = data.user_valid
    test = data.user_test

    pred_dict = {}
    all_inputs = {}
    
    # насэмплим рандомных 10000 пользователей (или меньше, если их разнообразие небольшое:))
    if usernum > 10000:
        users = random.sample(range(1, usernum + 1), 10000)
    else:
        users = range(1, usernum + 1)
    
    # для каждого пользователя делаем оценку
    for u in tqdm(users, ncols=70, leave=False, unit="b"):

        if len(train[u]) < 1 or len(test[u]) < 1: # если для текущего пользователя нет ничего в train или test => continue
            continue
        # для input_seq
        seq = np.zeros([model.seq_max_len], dtype=np.int32)
        idx = model.seq_max_len - 1
        seq[idx] = valid[u][0]
        idx -= 1
        for i in reversed(train[u]): 
            seq[idx] = i
            idx -= 1
            if idx == -1: # если нет в train и valid => break
                break
        # для candidate
        rated = set(train[u]) # то, что оценил пользователь, из train'a
        rated.add(0)
        item_idx = [test[u][0]] # первым в последовательность помещаем тык из теста
        for _ in range(model.num_neg_test): # размер последовательностей получается фиксированный, задаётся в параметрах модели при инициализации
                                            # https://github.com/microsoft/recommenders/blob/main/examples/00_quick_start/sasrec_amazon.ipynb
            t = np.random.randint(1, itemnum + 1)
            while t in rated:
                t = np.random.randint(1, itemnum + 1) # генерим рандомно (?)
            item_idx.append(t)

        inputs = {}
        inputs["user"] = np.expand_dims(np.array([u]), axis=-1) # просто номер пользователя
        inputs["input_seq"] = np.array([seq]) # входная последовательность что тыкнул пользователь - ИЗ TRAIN И VALID!
        inputs["candidate"] = np.array([item_idx]) # объявки, для которых будем вычислять логиты
        all_inputs[inputs["user"][0][0]] = inputs # словарик для всех inputs

        # добавляем каждого пользователя в словарик предиктов
        pred_dict[inputs["user"][0][0]] = model.predict(inputs)

    return pred_dict, all_inputs

In [82]:
pred_dict, inputs = get_predictions(data)



In [83]:
# посмотрим, что input_seq - это то, что нажал пользователь из train и valid
display(inputs[4910]) # user_id для примера
print("\n")
print("Из train и valid:", data.user_train[4910], data.user_valid[4910])
print("Из test:", data.user_test[4910])

{'candidate': array([[10150, 10213,  3446,  1508,   986,  8025,   716,  4030,  3987,
           606,  1454,  5848,   744,  9603,   668,  2434,  3792,   241,
          6374,  7214,  3212, 10342,  5577,  4895,  7185,  4743,  6529,
          8599, 10480,  2924,  8039,  7764,  2966,  5807,  1502,  5442,
          6341,  7813,  4870,  5801,  6580,  8081,  3784, 10187,  5409,
           829,  2053,  3782,  9930,  1813,  7329,  6495,  3827,  8584,
          4067,  7398,  5519,  8239,  7717,  9766,   662,  7111,   390,
          4353,  8818,  1912,  4843,   606,  8499,  9222,  1847,  1741,
           280,  1150,  3557,  8079,  7434, 10245,  1900,  5336,  5211,
          5301,  1436,  5256,  8644,  1704,   240,  3692,  8428,  8840,
          2158,  8917,  6181,  5534,  2349,  4694,  2825,  2826,  6091,
          2847,  8540]]),
 'input_seq': array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
      



Из train и valid: [2906] [3408]
Из test: [10150]


In [85]:
print(len(pred_dict)) # осталось столько пользователей (для которых было что-то в train && test)
print(inputs[4910]['candidate'].shape) # всего 100 штук, сколько и заказывали в num_neg_test при создании модели

106
(1, 101)


In [80]:
# посмотреть на пример что в pred_dict
pred_dict[17546]

<tf.Tensor: shape=(1, 101), dtype=float32, numpy=
array([[ 1.7349093 , -0.2380237 ,  0.2768051 , -0.8058723 , -0.3629349 ,
         0.4124116 , -0.6360039 , -0.33366135,  0.11179381, -0.48332027,
         0.0552212 ,  0.08918025, -0.23705333,  0.04897553, -0.25260627,
        -0.09259616,  0.11503328,  0.08109427, -0.558318  , -0.6279186 ,
         0.16111474, -0.00711066,  0.17731744, -0.48438722, -0.1569303 ,
        -0.02135493,  0.03264721, -0.32391164,  0.61671615,  0.10933504,
        -0.19701293, -0.37415287, -0.13526347, -0.13832693,  0.05130062,
         0.33480537, -0.5105399 , -0.12325121, -0.5080016 , -0.07319078,
         0.33501405, -0.08954404, -0.16972607,  0.46379918,  0.17316495,
        -0.99398756,  0.07218778, -0.18870717, -0.40597832, -0.2768316 ,
         0.22513886,  0.228473  ,  0.0598809 ,  0.32786965, -0.06045763,
         0.14715154, -0.04422484, -0.1393615 , -0.32939407,  0.2582508 ,
         0.1938943 , -0.06687552, -0.22473651,  0.19400313,  0.20952146,
 

In [107]:
# нужно связать значения логитов и кандидатов, выбрать самые вероятные значения и проверить, были ли они в тесте
# сразу расчёт precision@k
k = 5
fin_prec = 0
for key in pred_dict.keys():
  tmp = np.stack((pred_dict[key][0], inputs[key]['candidate'][0]), axis=-1)
  tmp = tmp[tmp[:, 0].argsort()] # сортировка по вероятностям (логитам)
  topk = tmp[-k:, 1].astype(int)
  # проверим, есть ли этот топk в тесте
  tmp_prec = 0
  for val in topk:
    if val in data.user_test[key]:
      tmp_prec += 1/k
  fin_prec += tmp_prec

fin_prec /= len(pred_dict.keys())
print("ОТВЕТ", fin_prec)

ОТВЕТ 0.13018867924528285


In [101]:
np.array(pred_dict[4910][0])#.shape

array([-0.67173696,  1.2501149 , -0.13802752, -0.22608724,  0.67211986,
       -0.00418808,  0.4270192 ,  0.21722561, -0.06826862,  0.15682304,
        0.13290925, -0.6735867 , -0.18091708,  0.00767247, -0.2485054 ,
       -0.33663225, -0.65954155, -0.2612238 , -0.22935894,  0.5137502 ,
       -0.35419914,  0.16644403, -0.06033632, -0.6661259 ,  0.19264936,
        0.4808402 , -0.7009594 , -0.46484882,  0.31086144, -0.0777406 ,
       -0.12569988,  0.24556428, -0.61868757,  0.10692924,  0.12175551,
       -0.33630934, -0.14961345, -0.6079842 ,  0.19489548, -0.53407997,
        0.44796312,  0.18142511, -0.5976605 , -0.13316722,  0.12533325,
        0.0065351 ,  0.27622864,  0.4820524 , -0.13178283,  0.08281772,
        0.13179572,  0.48131338,  0.17510441,  0.17328207, -0.2854422 ,
       -0.19312422, -0.09309757,  0.01821036,  0.3021482 , -0.08094405,
        0.27525285, -0.0200531 , -0.08224851, -0.313321  , -0.18752854,
        0.38367504,  0.08240104,  0.15682304,  0.31643727, -0.24

In [97]:
inputs[4910]['candidate'][0]#.shape

array([10150, 10213,  3446,  1508,   986,  8025,   716,  4030,  3987,
         606,  1454,  5848,   744,  9603,   668,  2434,  3792,   241,
        6374,  7214,  3212, 10342,  5577,  4895,  7185,  4743,  6529,
        8599, 10480,  2924,  8039,  7764,  2966,  5807,  1502,  5442,
        6341,  7813,  4870,  5801,  6580,  8081,  3784, 10187,  5409,
         829,  2053,  3782,  9930,  1813,  7329,  6495,  3827,  8584,
        4067,  7398,  5519,  8239,  7717,  9766,   662,  7111,   390,
        4353,  8818,  1912,  4843,   606,  8499,  9222,  1847,  1741,
         280,  1150,  3557,  8079,  7434, 10245,  1900,  5336,  5211,
        5301,  1436,  5256,  8644,  1704,   240,  3692,  8428,  8840,
        2158,  8917,  6181,  5534,  2349,  4694,  2825,  2826,  6091,
        2847,  8540])

In [105]:
data.user_test

{6070: [6404],
 6074: [],
 6093: [],
 1: [],
 2: [],
 551: [],
 3: [],
 5: [],
 4: [],
 6: [],
 7: [],
 8: [],
 9: [],
 10: [],
 494: [],
 11: [],
 13: [8065],
 491: [],
 12: [],
 482: [],
 14: [],
 15: [],
 16: [],
 17: [],
 18: [],
 19: [],
 703: [],
 20: [],
 488: [],
 489: [],
 21: [],
 22: [],
 23: [],
 24: [],
 3461: [],
 1674: [],
 25: [],
 3459: [],
 3560: [],
 26: [],
 27: [],
 28: [],
 29: [],
 30: [],
 31: [],
 33: [],
 57: [],
 59: [],
 102: [],
 106: [],
 113: [],
 32: [],
 34: [],
 35: [],
 37: [],
 38: [],
 48: [],
 50: [],
 52: [],
 53: [],
 55: [],
 36: [],
 39: [],
 66: [],
 40: [],
 41: [],
 42: [],
 480: [],
 483: [],
 43: [],
 45: [],
 44: [],
 481: [],
 486: [],
 487: [],
 46: [],
 479: [],
 484: [],
 47: [],
 49: [],
 51: [],
 54: [],
 56: [],
 58: [],
 75: [],
 93: [],
 60: [],
 70: [],
 61: [],
 62: [],
 63: [],
 98: [],
 64: [],
 65: [],
 67: [],
 74: [],
 104: [],
 68: [],
 81: [],
 69: [],
 73: [],
 80: [],
 17363: [],
 71: [],
 72: [],
 91: [],
 76: [],
 77