In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import swifter
from sklearn.utils import shuffle

In [2]:
import utils

In [3]:
import torch
from torch import nn
from transformers import AutoTokenizer
from transformers import AutoModel,AutoConfig

In [4]:
# Render every column when displaying DataFrames instead of truncating wide frames.
pd.set_option("display.max_columns",None)

import warnings
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides library deprecation warnings (e.g. from pandas/numpy);
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [5]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
pretrained_model_name = "albert-base-v2"
# Load the tokenizer for that checkpoint; add_prefix_space=False is forwarded to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name, add_prefix_space=False)

In [6]:
# Separator token string of the loaded tokenizer.
# NOTE(review): not referenced again in the visible part of this notebook.
sep_token = tokenizer.sep_token

### 1) Read data

In [7]:
# Load the row-level dataset artifact through the project helper utils.open_object.
# NOTE(review): the .pkl extension suggests a pickle file - only load artifacts from trusted sources.
df_dataset = utils.open_object("./artifacts/df_dataset.pkl")

In [8]:
# Number of leading rows kept for this run.
test_size = 10000
# df_dataset = shuffle(df_dataset).head(test_size)
# Keep only the first `test_size` rows (deterministic, unlike the shuffled variant above).
# NOTE(review): a row-count cut can truncate the last (user_id, sequence_id, label) group,
# so that group's final row may not be its real last view - confirm this is acceptable.
df_dataset = df_dataset.head(test_size)

In [9]:
# df_dataset.head(20)

In [10]:
# Load the series table artifact; its column names define `series_features` further down.
df_series = utils.open_object("./artifacts/series_table.pkl")

In [11]:
# Preview the first rows of the series table.
df_series.head()

Unnamed: 0,product_series_cms_id,cp_name,product_lang_name,product_cat_name,sri_des
0,754,36,2,25,The Third Charm: Love may be found from someth...
1,939,24,2,25,Golden Garden: Four people from very different...
2,1389,60,2,16,Royal Tramp: This story revolves the adventure...
3,1225,35,2,26,Along with The Gods: The Last 49 Days: The thr...
4,1051,43,2,9,Another Miss Oh (Tagalog): May dalawang Oh Hae...


### 2) Aggregate historical view features

In [12]:
# Small sample (the first rows of the dataset) used to try out the aggregation steps below.
example = df_dataset.head()

In [13]:
# Column names of the series table, as a set for fast membership checks.
series_features = set(df_series.columns)

In [14]:
# Display the set of series-level column names for inspection.
series_features

{'cp_name',
 'product_cat_name',
 'product_lang_name',
 'product_series_cms_id',
 'sri_des'}

In [15]:
# example.groupby(group_key)

In [16]:
# Load the saved numeric scaler artifact; only its stored feature names are used in the visible cells.
numeric_scaler = utils.open_object("artifacts/numeric_scaler.pkl")

In [17]:
# Column names recorded on the scaler (feature_names_in_); these are the numeric columns
# averaged per group in get_aggregated_feature_dict.
numeric_features = list(numeric_scaler.feature_names_in_)

In [18]:
# Load the saved per-column value-to-index mapping artifact.
category_value_map_dict = utils.open_object("./artifacts/col_value_to_index_dict.pkl")

In [19]:
# Names of the categorical columns - presumably the keys of the mapping loaded above (confirm it is a dict).
# The misspelled name is referenced by get_aggregated_feature_dict, so it is kept as is.
catergory_features = list(category_value_map_dict)

In [20]:
# Text columns to aggregate; 'eps_des' is currently disabled (see the commented-out variant).
# text_features = ['eps_des','sri_des']
text_features = ['sri_des']

In [21]:
# Scratch input for process_text: column name -> list of that column's values in the sample.
agg_text_feature_dict = example[text_features].to_dict(orient="list")

In [22]:
# len(list(agg_text_feature_dict['eps_des']))

In [23]:
def process_text(agg_text_feature_dict, max_tokens=512):
    """Build "history" and "next item" text features from per-row text lists.

    For every field, the last entry of its list is treated as the next item
    and all earlier entries as the viewing history. History texts are
    de-duplicated (first occurrence wins), every text is truncated to an equal
    share of ``max_tokens``, and the history is joined into a single string in
    which each text is prefixed with ``"<field> <position>: "``.

    The budget is counted in space-separated words (``str.split(" ")``), not
    in tokenizer tokens.

    Args:
        agg_text_feature_dict: mapping ``field -> list of texts`` in row order,
            e.g. the result of ``DataFrame.to_dict(orient="list")``. Missing
            scalar values (``None`` / ``NaN``) are treated as empty strings;
            other non-string values are converted with ``str``.
        max_tokens: total word budget shared by the unique history texts and
            the next text. Defaults to 512, the previously hard-coded value.

    Returns:
        dict with one ``"next_<field>"`` and one ``"hist_<field>"`` string per
        input field. Both are ``""`` when the field's list is empty.
    """
    agg_join_text_feature_dict = {}
    for field, list_text in agg_text_feature_dict.items():
        # Missing descriptions arrive as None/NaN, which have no .split();
        # normalise them up front so one bad row cannot abort the whole run.
        texts = [
            text if isinstance(text, str) else ("" if pd.isna(text) else str(text))
            for text in list_text
        ]

        if not texts:
            # Nothing to split into history / next item.
            agg_join_text_feature_dict["next_" + field] = ""
            agg_join_text_feature_dict["hist_" + field] = ""
            continue

        next_text = texts[-1]
        # dict.fromkeys keeps first-seen order, so this matches the manual
        # "append if not already present" loop without the quadratic scan.
        hist_text_unique = list(dict.fromkeys(texts[:-1]))

        # Equal share of the budget for each unique history text plus the next text.
        num_tokens = max_tokens // (len(hist_text_unique) + 1)

        next_text = " ".join(next_text.split(" ")[:num_tokens])
        hist_text = " ".join(
            f"{field} {index + 1}: " + " ".join(text.split(" ")[:num_tokens])
            for index, text in enumerate(hist_text_unique)
        )

        agg_join_text_feature_dict["next_" + field] = next_text
        agg_join_text_feature_dict["hist_" + field] = hist_text
    return agg_join_text_feature_dict

In [24]:
def get_aggregated_feature_dict(example):
    """Collapse one group's rows into a single flat feature record.

    Relies on the notebook-level ``numeric_features``, ``catergory_features``,
    ``text_features`` and ``series_features`` collections and on
    ``process_text``.

    Args:
        example: DataFrame holding the rows of one group in view order; the
            last row is treated as the next item by the categorical and text
            aggregation below.

    Returns:
        dict combining, in this update order:
          * the column-wise mean of every numeric feature (over all rows),
          * one list per categorical feature - the full list for columns that
            belong to ``series_features``, the list without its last element
            otherwise,
          * the ``next_<field>`` / ``hist_<field>`` strings from ``process_text``.
    """
    # Explicit axis=0: np.mean(DataFrame) forwards axis=None, which pandas >= 2.0
    # reduces over both axes to a single scalar (no .to_dict()); older versions
    # only emitted a FutureWarning for it.
    # NOTE(review): the mean includes the final row, whereas non-series categorical
    # features drop it - confirm that this asymmetry is intended.
    agg_numeric_feature_dict = example[numeric_features].mean(axis=0).to_dict()

    category_lists = example[catergory_features].to_dict(orient="list")
    # Series-level columns keep every row; all other categorical columns drop the last row.
    agg_catergory_feature_dict = {
        column: values if column in series_features else values[:-1]
        for column, values in category_lists.items()
    }

    agg_text_feature_dict = process_text(example[text_features].to_dict(orient="list"))

    all_agg_feature_dict = {}
    all_agg_feature_dict.update(agg_numeric_feature_dict)
    all_agg_feature_dict.update(agg_catergory_feature_dict)
    all_agg_feature_dict.update(agg_text_feature_dict)

    return all_agg_feature_dict

In [25]:
# Columns that identify one training example; the dataset is grouped by these below.
group_key = ['user_id','sequence_id','label']

In [26]:
# Build one flat feature record per (user_id, sequence_id, label) group.
dataset_dict_list = []
for (user_id, seq_id, label), view_history in tqdm(df_dataset.groupby(group_key)):
    # Group keys are written last so they override any same-named aggregated feature.
    aggregated_feature_dict = {
        **get_aggregated_feature_dict(view_history),
        'user_id': user_id,
        'sequence_id': seq_id,
        'label': label,
    }
    dataset_dict_list.append(aggregated_feature_dict)

100%|██████████| 1667/1667 [00:01<00:00, 1220.81it/s]


In [27]:
# %timeit df_agg_dataset_tmp = df_dataset.groupby(group_key).apply(lambda x:get_aggregated_feature_dict(x)).reset_index()

In [28]:
# Assemble the per-group records into a DataFrame (one row per record).
df_agg_dataset = pd.DataFrame(dataset_dict_list)

In [29]:
# Persist the aggregated dataset through the project helper utils.save_object.
utils.save_object("./artifacts/df_agg_dataset.pkl",df_agg_dataset)