In [9]:
import pandas as pd
import numpy as np
import json

## Общая предобработка

In [2]:
# Read the three gzip-compressed clickstream CSV parts into separate frames.
clickstream_parts = [
    'internship_clickstream_data_1.gzip',
    'internship_clickstream_data_2.gzip',
    'internship_clickstream_data_3.gzip',
]
data1, data2, data3 = [
    pd.read_csv(part_path, compression='gzip') for part_path in clickstream_parts
]
print(data1.shape, data2.shape, data3.shape)

(30000000, 8) (30000000, 8) (22463346, 8)


In [3]:
# A (user, offer) pair is kept only once; the first occurrence in file order wins.
pair_key = ['uid', 'offer_id']
# Exclusive upper bound for click timestamps.
cutoff = pd.to_datetime('2022-07-10 00:00:00.020000')

# Step 1: drop duplicate (uid, offer_id) pairs inside each part.
# NOTE(review): this happens before the date filter, so which duplicate survives
# depends on row order in the files - confirm this is intended.
for part in (data1, data2, data3):
    part.drop_duplicates(subset=pair_key, inplace=True)
print(data1.shape, data2.shape, data3.shape)

# Step 2: parse timestamps and keep only rows strictly before the cutoff.
for part in (data1, data2, data3):
    part['timestamp'] = pd.to_datetime(part['timestamp'])
data1 = data1[data1['timestamp'] < cutoff]
data2 = data2[data2['timestamp'] < cutoff]
data3 = data3[data3['timestamp'] < cutoff]
print(data1.shape, data2.shape, data3.shape)

# Step 3: stack the parts and drop pairs that repeat across parts.
data = pd.concat([data1, data2, data3])
print('after concating:', data.shape)
data.drop_duplicates(subset=pair_key, inplace=True)
print('after drop duplicates:', data.shape)

(22585345, 8) (22559623, 8) (17427910, 8)
(5602387, 8) (5352304, 8) (3440925, 8)
after concating: (14395616, 8)
after drop duplicates: (12581794, 8)


In [4]:
# Keep only clicks on offers whose id is listed in the embedding id file.
embedding_ID = np.load('embeddings_ID.npy')
has_embedding = data['offer_id'].isin(embedding_ID)
data = data[has_embedding]

In [5]:
# Iteratively keep only offers AND users that occur in more than MIN_INTERACTIONS
# rows. Removing rare offers can push a user below the threshold (and vice versa),
# so the two filters are repeated until a full pass removes nothing.
MIN_INTERACTIONS = 5
while True:
    rows_before = len(data)

    # Count each key once per pass instead of recomputing value_counts repeatedly.
    offer_counts = data['offer_id'].value_counts()
    frequent_offers = offer_counts[offer_counts > MIN_INTERACTIONS].index
    data = data[data['offer_id'].isin(frequent_offers)]

    uid_counts = data['uid'].value_counts()
    frequent_uids = uid_counts[uid_counts > MIN_INTERACTIONS].index
    data = data[data['uid'].isin(frequent_uids)]

    # Both filters only ever remove rows, so an unchanged row count means every
    # remaining offer and user already satisfies the threshold.
    if len(data) == rows_before:
        break
print(data.shape)

(8663647, 8)


## Для обычной модели SASRec

In [6]:
# Build dense integer codes for offers and users, numbered in order of first
# appearance. The dicts hold 0-based positions; the stored columns are shifted
# by +1 so that codes start at 1 (SASRec requirement per the original note).
offer_values = data['offer_id'].unique()
offer_encoder = dict(zip(offer_values, range(len(offer_values))))
uid_values = data['uid'].unique()
uid_encoder = dict(zip(uid_values, range(len(uid_values))))

data = data.assign(
    offer_id_enc=data['offer_id'].map(offer_encoder) + 1,
    uid_enc=data['uid'].map(uid_encoder) + 1,
)

# Order each user's clicks chronologically.
data = data.sort_values(by=['uid_enc', 'timestamp'])
print(data.shape)
data.head(5)

(8663647, 10)


Unnamed: 0,timestamp,hit_id,uid,platform,event_name,screen,offer_id,ptn_dadd,offer_id_enc,uid_enc
13567692,2022-07-03 10:31:36.281,172f4ac946ff4677,49913337,ios,OpenOfferScreen,RecommendationsScreen,271821092,2022-07-03,29929,1
13567695,2022-07-03 10:31:53.808,57e6bd3399b94d2f,49913337,ios,OpenOfferScreen,RecommendationsScreen,271133138,2022-07-03,7028,1
60022,2022-07-03 10:32:56.753,b2071702a85d48e3,49913337,ios,OpenOfferScreen,RecommendationsScreen,273896418,2022-07-03,4712,1
29665848,2022-07-03 10:33:42.214,64508d9634914757,49913337,ios,OpenOfferScreen,RecommendationsScreen,274877165,2022-07-03,45057,1
13567714,2022-07-03 10:36:08.711,85aa6497a7484120,49913337,ios,OpenOfferScreen,RecommendationsScreen,274931249,2022-07-03,19744,1


In [7]:
# Number of distinct encoded offers - needed as a model parameter.
data['offer_id_enc'].nunique()

388607

In [8]:
# Model input file: one "<uid_enc>\t<offer_id_enc>" line per click,
# no header and no index column.
sasrec_pairs = data[['uid_enc', 'offer_id_enc']]
sasrec_pairs.to_csv('out.txt', sep='\t', header=False, index=False)

## Для модели SASRec с эмбеддингами

In [11]:
# Discard the codes built for the plain model; they are rebuilt below.
data = data.drop(columns=['uid_enc', 'offer_id_enc'])

In [12]:
# Load the full embedding table and show its (rows, columns) shape.
embeddings_path = 'embeddings_num_only.npy'
embeddings = np.load(embeddings_path)
embeddings.shape

(2778005, 25)

In [13]:
def filter_embed(embeds, offers):
    """
    Keep only the embedding rows whose offer id occurs in ``offers``.

    INPUT:
        embeds - 2-D numpy array; column 0 holds the offer id (it is cast to
                 int before the comparison), the remaining columns are passed
                 through untouched.
        offers - list or numpy array of offer ids to keep.
    OUTPUT:
        A new array with the selected rows of ``embeds`` in their original
        order, with all columns and the dtype of ``embeds``.
    """
    emb_id = embeds[:, 0].astype(int)
    # Select rows directly with the boolean mask; there is no need to append
    # the mask as an extra column and strip it again, which copied the whole array.
    keep = np.isin(emb_id, offers)
    return embeds[keep]
# Distinct offer ids still present in the click data; np.isin inside
# filter_embed needs a list/array rather than a set.
unique_offer_ids = set(data['offer_id'])
offers = list(unique_offer_ids)
embeddings = filter_embed(embeddings, offers)

In [14]:
# Offer code = 1-based row position of the offer in the filtered embeddings
# array, so code k corresponds to embeddings[k - 1].
# NOTE(review): the dict keys are the numpy scalars taken from embeddings[:, 0];
# if that array is float the keys are floats - confirm this is what the
# consumers of the mapping expect.
offer_encoder = {
    offer: code for code, offer in enumerate(embeddings[:, 0], start=1)
}

# User codes are numbered in order of first appearance; the dict is 0-based
# and the stored column is shifted by +1.
uid_values = data['uid'].unique()
uid_encoder = dict(zip(uid_values, range(len(uid_values))))

data = data.assign(
    offer_id_enc=data['offer_id'].map(offer_encoder),
    uid_enc=data['uid'].map(uid_encoder) + 1,
    timestamp=pd.to_datetime(data['timestamp']),
)

# Order each user's clicks chronologically.
data = data.sort_values(by=['uid_enc', 'timestamp'])
print(data.shape)
data.head(5)

(8663647, 10)


Unnamed: 0,timestamp,hit_id,uid,platform,event_name,screen,offer_id,ptn_dadd,offer_id_enc,uid_enc
13567692,2022-07-03 10:31:36.281,172f4ac946ff4677,49913337,ios,OpenOfferScreen,RecommendationsScreen,271821092,2022-07-03,161485,1
13567695,2022-07-03 10:31:53.808,57e6bd3399b94d2f,49913337,ios,OpenOfferScreen,RecommendationsScreen,271133138,2022-07-03,183091,1
60022,2022-07-03 10:32:56.753,b2071702a85d48e3,49913337,ios,OpenOfferScreen,RecommendationsScreen,273896418,2022-07-03,347437,1
29665848,2022-07-03 10:33:42.214,64508d9634914757,49913337,ios,OpenOfferScreen,RecommendationsScreen,274877165,2022-07-03,109799,1
13567714,2022-07-03 10:36:08.711,85aa6497a7484120,49913337,ios,OpenOfferScreen,RecommendationsScreen,274931249,2022-07-03,200947,1


In [15]:
# Show the ten smallest distinct offer codes.
distinct_codes = set(data['offer_id_enc'])
sorted(distinct_codes)[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [16]:
# Model input file for the embedding variant: one "<uid_enc>\t<offer_id_enc>"
# line per click, no header and no index column.
sasrec_emb_pairs = data[['uid_enc', 'offer_id_enc']]
sasrec_emb_pairs.to_csv('out_e.txt', sep='\t', header=False, index=False)

In [17]:
# Persist the filtered embedding array.
filtered_embeddings_path = 'embeddings_filtered_num_only.npy'
np.save(filtered_embeddings_path, embeddings)

In [18]:
# Shape of the filtered embedding array. In the recorded run the row count
# (388607) equals the number of distinct encoded offers counted earlier.
embeddings.shape

(388607, 25)

In [19]:
# Store the offer-id -> code mapping next to the filtered embeddings.
# NOTE(review): JSON object keys are always strings; float keys would be written
# like "123.0" - confirm the format the reader of this file expects.
mapping_path = 'mapping_withEmb.json'
with open(mapping_path, mode='w') as mapping_file:
    json.dump(offer_encoder, mapping_file)