In [12]:
import pandas as pd
import numpy as np

## Общая предобработка

In [3]:
# Read the three gzip-compressed CSV parts of the clickstream log into separate
# frames and print the (rows, columns) shape of each one.
clickstream_parts = (
    'internship_clickstream_data_1.gzip',
    'internship_clickstream_data_2.gzip',
    'internship_clickstream_data_3.gzip',
)
data1, data2, data3 = [
    pd.read_csv(part_path, compression='gzip') for part_path in clickstream_parts
]
print(data1.shape, data2.shape, data3.shape)

(30000000, 8) (30000000, 8) (22463346, 8)


In [4]:
# Keep one row per (uid, offer_id) pair. Each part is de-duplicated on its own
# first, then the parts are stacked and de-duplicated again to remove pairs
# that repeat across different parts. drop_duplicates is called with its
# default settings, so the first row of every pair (in file order) is kept.
pair_key = ['uid', 'offer_id']
data1 = data1.drop_duplicates(subset=pair_key)
data2 = data2.drop_duplicates(subset=pair_key)
data3 = data3.drop_duplicates(subset=pair_key)
print(data1.shape, data2.shape, data3.shape)

data = pd.concat([data1, data2, data3])
print('after concating:', data.shape)
data = data.drop_duplicates(subset=pair_key)
print('after drop duplicates:', data.shape)

(22585345, 8) (22559623, 8) (17427910, 8)
after concating: (62572878, 8)
after drop duplicates: (52774654, 8)


In [5]:
# Keep only offers AND users that occur in more than MIN_CLICKS rows.
# Dropping rare offers can push a user below the threshold (and the other way
# round), so both filters are repeated until every remaining offer and user
# satisfies the threshold at the same time.
MIN_CLICKS = 5
while True:
    # Count once per pass and reuse the result for both the stop test and the
    # filter instead of re-scanning the whole frame for every expression.
    offer_counts = data['offer_id'].value_counts()
    uid_counts = data['uid'].value_counts()
    if (offer_counts > MIN_CLICKS).all() and (uid_counts > MIN_CLICKS).all():
        break
    offer_ids = offer_counts.index[offer_counts > MIN_CLICKS]
    data = data[data['offer_id'].isin(offer_ids)]
    # The offer filter removed rows, so user counts have to be refreshed.
    uid_counts = data['uid'].value_counts()
    uids = uid_counts.index[uid_counts > MIN_CLICKS]
    data = data[data['uid'].isin(uids)]
print(data.shape)

(48045898, 8)


## Для обычной модели SASRec

In [13]:
# Keep only the rows whose offer_id occurs in the array stored in
# 'embeddings_ID.npy'.
embedding_ID = np.load('embeddings_ID.npy')
has_embedding = data['offer_id'].isin(embedding_ID)
data = data[has_embedding]

In [15]:
# Repeat the activity filter: the embedding filter above removed rows, so some
# offers/users may have dropped to MIN_CLICKS rows or fewer again.
MIN_CLICKS = 5
while True:
    # Count once per pass and reuse the result for the stop test and the filter.
    offer_counts = data['offer_id'].value_counts()
    uid_counts = data['uid'].value_counts()
    if (offer_counts > MIN_CLICKS).all() and (uid_counts > MIN_CLICKS).all():
        break
    offer_ids = offer_counts.index[offer_counts > MIN_CLICKS]
    data = data[data['offer_id'].isin(offer_ids)]
    # The offer filter removed rows, so user counts have to be refreshed.
    uid_counts = data['uid'].value_counts()
    uids = uid_counts.index[uid_counts > MIN_CLICKS]
    data = data[data['uid'].isin(uids)]
print(data.shape)

# Encode offers and users as consecutive integers starting at 1 (a SASRec
# requirement). pd.factorize numbers values in order of first appearance,
# which is the same numbering as enumerate(Series.unique()), and does it in a
# single pass. The *_encoder dicts (raw id -> 0-based code) are kept for any
# later lookups.
# Assumes neither column contains NaN at this point; factorize would give such
# rows the code 0 after the +1 shift.
offer_codes, offer_values = pd.factorize(data['offer_id'].to_numpy())
offer_encoder = {off: ind for ind, off in enumerate(offer_values)}
data['offer_id_enc'] = offer_codes + 1
uid_codes, uid_values = pd.factorize(data['uid'].to_numpy())
uid_encoder = {uid: ind for ind, uid in enumerate(uid_values)}
data['uid_enc'] = uid_codes + 1

# Order every user's clicks chronologically: sort by user, then by time.
data['timestamp'] = pd.to_datetime(data['timestamp'])
data.sort_values(by=['uid_enc', 'timestamp'], inplace=True)
print(data.shape)
data.head(5)

(47998759, 8)
(47998759, 10)


Unnamed: 0,timestamp,hit_id,uid,platform,event_name,screen,offer_id,ptn_dadd,offer_id_enc,uid_enc
27178479,2022-07-11 16:33:51.215,ed5a74a0255e4554,82662998,android,OpenOfferScreen,SearchResultsList,272458509,2022-07-11,92297,1
2667250,2022-07-11 16:35:28.448,cc6ee700d12b4996,82662998,android,OpenOfferScreen,SearchResultsList,269087803,2022-07-11,19263,1
2890965,2022-07-11 16:37:37.318,46eb5561d012447f,82662998,android,OpenOfferScreen,SearchResultsList,275502080,2022-07-11,86722,1
6670181,2022-07-11 16:55:04.947,5bf9a13d64fb4773,82662998,android,OpenOfferScreen,SearchResultsList,275170855,2022-07-11,1936,1
4294839,2022-07-13 20:05:53.175,7b60cb84c9634378,82662998,android,OpenOfferScreen,Undefined,275743706,2022-07-13,424152,1


In [18]:
# Number of distinct encoded offers; this value is used as a model parameter.
# Series.nunique() counts in pandas directly instead of boxing every row into
# a Python set first.
data['offer_id_enc'].nunique()

1252955

In [16]:
# Write the model input file: one tab-separated line per click holding the
# encoded offer id followed by the encoded user id, without header or index.
# NOTE(review): the offer column is written before the user column; confirm
# this is the order the model's loader expects.
model_input = data[['offer_id_enc', 'uid_enc']]
model_input.to_csv('out.txt', sep='\t', header=False, index=False)

## Для модели SASRec с эмбеддингами

In [20]:
# Remove the encodings built for the plain model; the cells below create new
# 'offer_id_enc' / 'uid_enc' columns for the embedding-based model.
data = data.drop(columns=['offer_id_enc', 'uid_enc'])

In [21]:
# Load the embedding matrix; later cells read its first column as the offer id.
embeddings_path = 'embeddings_BIG.npy'
embeddings = np.load(embeddings_path)

In [22]:
def filter_embed(embeds, offers):
    """
        Keep only the embedding rows whose offer id is present in `offers`.

        INPUT:
            embeds - 2-D numpy array; column 0 holds the offer id (it is cast
                     to int for the comparison), the other columns are left
                     untouched
            offers - offer ids to keep: list, tuple, numpy array, set or
                     frozenset
        OUTPUT: new array with the matching rows of `embeds`, in their
                original order and with the original dtype and column count
    """
    # np.isin treats a set as one opaque object and would match nothing, so
    # sets are turned into a list first.
    if isinstance(offers, (set, frozenset)):
        offers = list(offers)
    emb_id = embeds[:, 0].astype(int)
    keep = np.isin(emb_id, offers)
    # Index with the boolean mask directly: no temporary copy of the whole
    # matrix with an extra mask column, and the result is a contiguous array.
    return embeds[keep]
# Distinct offer ids still present in the clickstream. Series.unique() removes
# duplicates inside pandas instead of building a Python set over every row.
offers = data['offer_id'].unique().tolist()
# Keep only the embedding rows that belong to those offers.
embeddings = filter_embed(embeddings, offers)

In [23]:
# Encode each offer as the 1-based row position of its vector in the filtered
# embedding matrix. Ids are cast to int, as filter_embed does, so the encoder
# keys have the same type as the offer_id values being looked up.
embedding_offer_ids = embeddings[:, 0].astype(int)
offer_encoder = dict(zip(embedding_offer_ids, range(1, embeddings.shape[0] + 1)))
data['offer_id_enc'] = data['offer_id'].map(offer_encoder)
# An offer without an embedding row maps to NaN, which would silently turn the
# column into floats and corrupt the exported model input; fail loudly instead.
unmapped_rows = data['offer_id_enc'].isna()
if unmapped_rows.any():
    raise ValueError(
        f"{int(unmapped_rows.sum())} rows have an offer_id with no row in `embeddings`"
    )

# Encode users as consecutive integers starting at 1, numbered in order of
# first appearance (same numbering as enumerate(Series.unique())). The
# uid_encoder dict (raw uid -> 0-based code) is kept for any later lookups.
# Assumes 'uid' contains no NaN; factorize would give such rows the code 0.
uid_codes, uid_values = pd.factorize(data['uid'].to_numpy())
uid_encoder = {uid: ind for ind, uid in enumerate(uid_values)}
data['uid_enc'] = uid_codes + 1

# Order every user's clicks chronologically: sort by user, then by time.
data['timestamp'] = pd.to_datetime(data['timestamp'])
data.sort_values(by=['uid_enc', 'timestamp'], inplace=True)
print(data.shape)
data.head(5)

(47998759, 10)


Unnamed: 0,timestamp,hit_id,uid,platform,event_name,screen,offer_id,ptn_dadd,offer_id_enc,uid_enc
27178479,2022-07-11 16:33:51.215,ed5a74a0255e4554,82662998,android,OpenOfferScreen,SearchResultsList,272458509,2022-07-11,1035779,1
2667250,2022-07-11 16:35:28.448,cc6ee700d12b4996,82662998,android,OpenOfferScreen,SearchResultsList,269087803,2022-07-11,872827,1
2890965,2022-07-11 16:37:37.318,46eb5561d012447f,82662998,android,OpenOfferScreen,SearchResultsList,275502080,2022-07-11,739787,1
6670181,2022-07-11 16:55:04.947,5bf9a13d64fb4773,82662998,android,OpenOfferScreen,SearchResultsList,275170855,2022-07-11,169438,1
4294839,2022-07-13 20:05:53.175,7b60cb84c9634378,82662998,android,OpenOfferScreen,Undefined,275743706,2022-07-13,616483,1


In [25]:
# Sanity check: the ten smallest distinct encoded offer ids. De-duplicating
# with Series.unique() avoids building a Python set over every row.
np.sort(data['offer_id_enc'].unique())[:10].tolist()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [26]:
# Write the input file for the embedding-based model: one tab-separated line
# per click holding the encoded offer id followed by the encoded user id,
# without header or index.
# NOTE(review): the offer column is written before the user column; confirm
# this is the order the model's loader expects.
model_input = data[['offer_id_enc', 'uid_enc']]
model_input.to_csv('out_e.txt', sep='\t', header=False, index=False)

In [27]:
# Persist the filtered embedding matrix. Its row order defines the offer
# encoding used above (row i corresponds to offer_id_enc == i + 1), so it must
# be saved exactly as it is.
filtered_embeddings_path = 'embeddings_filtered_for_BIG.npy'
np.save(filtered_embeddings_path, embeddings)