In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import swifter
from sklearn.utils import shuffle

In [2]:
import utils

In [3]:
import torch
from torch import nn
from transformers import AutoTokenizer
from transformers import AutoModel,AutoConfig

In [4]:
# Remove the column cap so wide DataFrames are displayed with every column.
pd.set_option("display.max_columns",None)

import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this also hides library deprecation/FutureWarnings — consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

In [5]:
# pretrained_model_name = "albert-base-v2"
# tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name, add_prefix_space=False)

In [6]:
# sep_token = tokenizer.sep_token

### 1) Read data

In [7]:
# Load the dataset that is later grouped by user_id / sequence_id / label.
# Presumably a pickled DataFrame written by an earlier pipeline step — verify
# against utils.open_object; only load artifacts from a trusted source.
df_dataset = utils.open_object("./artifacts/df_dataset.pkl")

In [8]:
len(df_dataset)

2671872

In [9]:
# test_size = 10000
# # df_dataset = shuffle(df_dataset).head(test_size)
# df_dataset = df_dataset.head(test_size)

In [10]:
# df_dataset.head(20)

In [11]:
# Load the series table; its column names are used below to decide which
# categorical features describe the series itself.
df_series = utils.open_object("./artifacts/series_table.pkl")

In [12]:
df_series.head()

Unnamed: 0,product_series_cms_id,cp_name,product_lang_name,product_cat_name,sri_des
0,754,19,0,27,The Third Charm: Love may be found from someth...
1,939,13,0,27,Golden Garden: Four people from very different...
2,1389,64,0,31,Royal Tramp: This story revolves the adventure...
3,1225,11,0,8,Along with The Gods: The Last 49 Days: The thr...
4,1051,62,0,37,Another Miss Oh (Tagalog): May dalawang Oh Hae...


### 2) Aggregate historical view features

In [13]:
# Small slice (first five rows) used to prototype the aggregation inputs below.
example = df_dataset.head()

In [14]:
# Column names of the series table, kept as a set for fast membership tests.
series_features = set(df_series.columns)

In [15]:
series_features

{'cp_name',
 'product_cat_name',
 'product_lang_name',
 'product_series_cms_id',
 'sri_des'}

In [16]:
# example.groupby(group_key)

In [17]:
# Load the saved scaler artifact; in this notebook it is only used to recover
# the list of numeric feature names.
numeric_scaler = utils.open_object("artifacts/numeric_scaler.pkl")

In [18]:
# Names of the numeric columns, taken from the scaler's `feature_names_in_`
# (presumably a fitted scikit-learn scaler that was fit on a DataFrame — TODO confirm).
numeric_features = list(numeric_scaler.feature_names_in_)

In [19]:
# Load the saved column -> value-index mapping; its keys define the
# categorical feature columns used below.
category_value_map_dict = utils.open_object("./artifacts/col_value_to_index_dict.pkl")

In [20]:
# Categorical feature names = keys of the mapping (assuming it is a dict).
# The "catergory" spelling is kept because later cells reference this exact name.
catergory_features = list(category_value_map_dict)

In [21]:
# Free-text columns to aggregate. Only the series description is active; the
# commented-out variant below additionally includes 'eps_des'.
# text_features = ['eps_des','sri_des']
text_features = ['sri_des']

In [22]:
# Prototype input for process_text: {text column -> list of values} built from
# the `example` slice, the same structure get_aggregated_feature_dict builds per group.
agg_text_feature_dict  = example[text_features].to_dict("list")

In [23]:
# len(list(agg_text_feature_dict['eps_des']))

In [24]:
def _truncate_words(text, limit):
    """Keep at most ``limit`` space-separated words of ``text``."""
    return " ".join(text.split(" ")[:limit])


def process_text(agg_text_feature_dict, max_tokens=512):
    """Split each text field into a joined "history" string and a "next" string.

    For every field, the last list entry is treated as the next item's text and
    all earlier entries as the history. Duplicate history texts are dropped
    (first occurrence kept, order preserved). Each remaining history text and
    the next text are truncated to the same number of space-separated words,
    ``max_tokens // (number_of_unique_history_texts + 1)``. The history texts
    are then joined into one string, each prefixed with ``"<field> <n>: "``.

    Parameters
    ----------
    agg_text_feature_dict : dict
        Maps a field name to a list of text strings, ordered so that the last
        element is the "next" text.
    max_tokens : int, default 512
        Total word budget shared between the unique history texts and the next
        text. Words are split on single spaces; they are not tokenizer tokens,
        and the ``"<field> <n>: "`` prefixes are added on top of this budget.

    Returns
    -------
    dict
        For each input field, the keys ``"next_<field>"`` and
        ``"hist_<field>"`` mapped to the processed strings.

    Raises
    ------
    IndexError
        If a field's list is empty (there is no "next" text to take).
    """
    agg_join_text_feature_dict = {}
    for field, list_text in agg_text_feature_dict.items():
        # dict.fromkeys de-duplicates in O(n) while keeping first-seen order.
        hist_text_unique = list(dict.fromkeys(list_text[:-1]))
        next_text = list_text[-1]

        # Equal share of the budget for every unique history text plus the next text.
        num_tokens = max_tokens // (len(hist_text_unique) + 1)

        hist_text = " ".join(
            f"{field} {index+1}: " + _truncate_words(text, num_tokens)
            for index, text in enumerate(hist_text_unique)
        )

        agg_join_text_feature_dict["next_"+field] = _truncate_words(next_text, num_tokens)
        agg_join_text_feature_dict["hist_"+field] = hist_text
    return agg_join_text_feature_dict

In [25]:
def get_aggregated_feature_dict(example):
    """Collapse the rows of one group into a single flat feature record.

    Reads the notebook-level ``numeric_features``, ``catergory_features``,
    ``series_features`` and ``text_features`` lists and calls ``process_text``.

    Parameters
    ----------
    example : pandas.DataFrame
        Rows of one group; the last row is treated as the "next" item by the
        categorical and text handling below.

    Returns
    -------
    dict
        Numeric features as their column-wise mean, categorical features as
        lists of values, and the ``next_*`` / ``hist_*`` text entries produced
        by ``process_text``. Later groups overwrite earlier ones on key clashes
        (numeric, then categorical, then text).
    """
    # Column-wise mean, spelled as DataFrame.mean(axis=0) instead of np.mean(df):
    # np.mean forwards axis=None, which pandas >= 2.0 treats as "reduce over both
    # axes" and returns a scalar that has no .to_dict().
    # NOTE(review): the mean covers every row, including the last ("next") row,
    # whereas non-series categorical features drop it — confirm this is intended.
    agg_numeric_feature_dict = example[numeric_features].mean(axis=0).to_dict()

    category_lists = example[catergory_features].to_dict(orient="list")
    # Series-level columns keep all values (history + next item); every other
    # categorical column drops its last value.
    agg_catergory_feature_dict = {
        name: values if name in series_features else values[:-1]
        for name, values in category_lists.items()
    }

    agg_text_feature_dict = process_text(example[text_features].to_dict(orient="list"))

    return {
        **agg_numeric_feature_dict,
        **agg_catergory_feature_dict,
        **agg_text_feature_dict,
    }

In [26]:
# Columns identifying one training example; the aggregation loop unpacks the
# group key in exactly this order (user_id, sequence_id, label).
group_key = ['user_id','sequence_id','label']

In [27]:
# Build one aggregated record per (user_id, sequence_id, label) group.
dataset_dict_list = []
grouped_views = df_dataset.groupby(group_key)
for group_values, view_history in tqdm(grouped_views):
    user_id, seq_id, label = group_values
    aggregated_feature_dict = get_aggregated_feature_dict(view_history)
    # Attach the group identifiers after the aggregated features.
    aggregated_feature_dict.update(
        {'user_id': user_id, 'sequence_id': seq_id, 'label': label}
    )
    dataset_dict_list.append(aggregated_feature_dict)
#     break

100%|██████████| 445312/445312 [05:41<00:00, 1303.12it/s]


In [28]:
# %timeit df_agg_dataset_tmp = df_dataset.groupby(group_key).apply(lambda x:get_aggregated_feature_dict(x)).reset_index()

In [33]:
df_agg_dataset.head()

Unnamed: 0,episode_duration,device_first_visit_age,user_age,video_start_hour,video_end_hour,platform_name,user_type,subscription_source,plan_platform,resolution,subtitle,screen_mode,device_network_mode,video_streaming_mode,cp_name,product_cat_name,product_lang_name,product_series_cms_id,next_sri_des,hist_sri_des,user_id,sequence_id,label
0,0.459936,0.15625,0.991258,0.550725,0.572464,"[3, 3, 3, 3, 3]","[0, 0, 0, 0, 0]","[2, 2, 2, 2, 2]","[7, 7, 7, 7, 7]","[1, 1, 1, 1, 1]","[6, 6, 6, 6, 6]","[2, 1, 1, 2, 2]","[1, 1, 1, 1, 1]","[0, 0, 0, 2, 2]","[62, 62, 62, 43, 43, 36]","[37, 37, 37, 25, 25, 25]","[2, 2, 2, 2, 2, 2]","[508, 508, 508, 1094, 169, 984]",Beyond Evil: Police inspectors are often the m...,sri_des 1: The Return of Superman (2021): The ...,00189c7eddbe8fa8b0eb6cb6d27d4ee0,6,1
1,0.430993,0.15625,0.991258,0.565217,0.57971,"[3, 3, 3, 3, 3]","[0, 0, 0, 0, 0]","[2, 2, 2, 2, 2]","[7, 7, 7, 7, 7]","[1, 1, 1, 1, 1]","[6, 6, 6, 6, 6]","[1, 1, 2, 2, 2]","[1, 1, 1, 1, 1]","[0, 0, 2, 2, 2]","[62, 62, 43, 43, 36, 49]","[37, 37, 25, 25, 25, 39]","[2, 2, 2, 2, 2, 2]","[508, 508, 1094, 169, 984, 176]",House Hunters Asia S1: House Hunters Asia brin...,sri_des 1: The Return of Superman (2021): The ...,00189c7eddbe8fa8b0eb6cb6d27d4ee0,15,0
2,0.430993,0.15625,0.991258,0.565217,0.57971,"[3, 3, 3, 3, 3]","[0, 0, 0, 0, 0]","[2, 2, 2, 2, 2]","[7, 7, 7, 7, 7]","[1, 1, 1, 1, 1]","[6, 6, 6, 6, 6]","[1, 1, 2, 2, 2]","[1, 1, 1, 1, 1]","[0, 0, 2, 2, 2]","[62, 62, 43, 43, 36, 62]","[37, 37, 25, 25, 25, 25]","[2, 2, 2, 2, 2, 2]","[508, 508, 1094, 169, 984, 19]",Angel's Last Mission-Love: People just fall in...,sri_des 1: The Return of Superman (2021): The ...,00189c7eddbe8fa8b0eb6cb6d27d4ee0,17,0
3,0.403273,0.15625,0.991258,0.586957,0.601449,"[3, 3, 3, 3, 3]","[0, 0, 0, 0, 0]","[2, 2, 2, 2, 2]","[7, 7, 7, 7, 7]","[1, 1, 1, 1, 1]","[6, 6, 6, 6, 6]","[1, 2, 2, 2, 2]","[1, 1, 1, 1, 1]","[0, 2, 2, 2, 2]","[62, 43, 43, 36, 36, 36]","[37, 25, 25, 25, 25, 25]","[2, 2, 2, 2, 2, 2]","[508, 1094, 169, 984, 862, 862]",18 Again: Jung Da Jung was once a girl who was...,sri_des 1: The Return of Superman (2021): The ...,00189c7eddbe8fa8b0eb6cb6d27d4ee0,18,1
4,0.403273,0.15625,0.991258,0.586957,0.601449,"[3, 3, 3, 3, 3]","[0, 0, 0, 0, 0]","[2, 2, 2, 2, 2]","[7, 7, 7, 7, 7]","[1, 1, 1, 1, 1]","[6, 6, 6, 6, 6]","[1, 2, 2, 2, 2]","[1, 1, 1, 1, 1]","[0, 2, 2, 2, 2]","[62, 43, 43, 36, 36, 24]","[37, 25, 25, 25, 25, 25]","[2, 2, 2, 2, 2, 2]","[508, 1094, 169, 984, 862, 1002]",Kairos: Living a precarious life as a part-tim...,sri_des 1: The Return of Superman (2021): The ...,00189c7eddbe8fa8b0eb6cb6d27d4ee0,22,0


In [35]:
dict_agg_dataset = df_agg_dataset.to_dict("list")

In [29]:
df_agg_dataset = pd.DataFrame(dataset_dict_list)

In [30]:
utils.save_object("./artifacts/df_agg_dataset.pkl",df_agg_dataset)

In [37]:
utils.save_object("./artifacts/dataset_dict_list.pkl",dataset_dict_list)

In [31]:
!pwd

/Users/nlin/Desktop/Codes/local/VedioRecommender


In [32]:
pd.__version__

'1.4.3'