In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from utils import save_object,open_object
import random
from sklearn.utils import shuffle

In [2]:
pd.set_option("display.max_columns",None)

In [3]:
import warnings
warnings.filterwarnings('ignore')

### 1) read data

In [4]:
df = pd.read_csv("data/202108_test.csv")
len(df)

269272

In [5]:
len(set(df['user_id']))

1766

In [6]:
set(df['country'])

{'Singapore'}

### 2) Select Columns

In [7]:
series_features = ['product_cat_name','product_lang_name','product_group_series_name','product_series_cms_id']

In [8]:
num_feature_cols = ['episode_duration']
date_cols = ['device_first_visit_date','video_start_timestamp_hkt','video_end_timestamp_hkt','user_create_date']

In [9]:
text_cols = ['eps_des','sri_des']

In [10]:
catg_feature_col = ['platform_name','user_type','subscription_source',
               'plan_platform','resolution','subtitle','screen_mode',
               'device_network_mode','video_streaming_mode','cp_name'] + series_features

In [11]:
selected_cols = ['user_id'] + num_feature_cols + date_cols + text_cols + catg_feature_col

In [12]:
df = df[selected_cols]

In [13]:
df.head()

Unnamed: 0,user_id,episode_duration,device_first_visit_date,video_start_timestamp_hkt,video_end_timestamp_hkt,user_create_date,eps_des,sri_des,platform_name,user_type,subscription_source,plan_platform,resolution,subtitle,screen_mode,device_network_mode,video_streaming_mode,cp_name,product_cat_name,product_lang_name,product_group_series_name,product_series_cms_id
0,0ee9a5dcdfb58211cabaa69056abb395,4345,2016-11-26,2021-06-02 01:14:42,2021-06-02 01:14:57,2019-09-18 00:00:00,Young Jae says her farewell to Joon Young and ...,Love may be found from something very simple. ...,IOS Tablet,Premium,,,240p,English,portrait,Non-mobile,remote,jtbc,Korean Dramas,English,The Third Charm,19809
1,43d65e495417ac26a3cb7e47f3f3fed8,1507,2018-01-03,2021-05-26 22:37:39,2021-05-26 23:04:31,2017-11-21 00:00:00,Sabina accuses Dong Joo of stealing her toothb...,Four people from very different backgrounds wi...,IOS Tablet,Free,,,480p,English,landscape,Non-mobile,remote,MBC,Korean Dramas,English,Golden Garden,29142
2,4571e3c70557e41bc4a50d43b7755a37,1646,2019-11-19,2021-05-21 15:47:21,2021-05-21 16:10:41,2020-06-03 00:00:00,It's the anniversary of Pil Seung's parents' d...,Four people from very different backgrounds wi...,IOS Tablet,Free,,,480p,English,portrait,Non-mobile,remote,MBC,Korean Dramas,English,Golden Garden,29142
3,6bb61e3b7bce0931da574d19d1d82c88,2837,2021-06-05,2021-07-29 12:47:37,2021-07-29 12:55:35,,"Raised in a brothel, Wei Xiao Bao is quite a t...",This story revolves the adventure of Wei Xiao ...,Android Mobile,Anonymous,,,480p,English,portrait,Non-mobile,remote,Cathay Media Group,Chinese Dramas,English,Royal Tramp,32127
4,cb75cdaaba40d64c818722a857f01d0f,1620,2020-06-09,2021-06-03 23:17:34,2021-06-03 23:25:23,2018-06-24 00:00:00,Nam Hee has nightmares about losing her vision...,Four people from very different backgrounds wi...,IOS Tablet,Premium,IOS IAP,iPhone,480p,English,portrait,Non-mobile,remote,MBC,Korean Dramas,English,Golden Garden,29142


### 3) Preprocessing

In [14]:
df[catg_feature_col] = df[catg_feature_col].fillna('Unknown')

In [15]:
df["episode_duration"] = df["episode_duration"].fillna(int(np.mean(df["episode_duration"])))

In [16]:
today = pd.to_datetime("2022-08-01")

In [17]:
df['device_first_visit_date'] = pd.to_datetime(df['device_first_visit_date'])

In [18]:
df['device_first_visit_age'] = df['device_first_visit_date'].apply(lambda x: (today-x).days)

In [19]:
df['user_create_date'] = pd.to_datetime(df['user_create_date'])

In [20]:
df['user_age'] = df['user_create_date'].apply(lambda x: (today-x).days)

In [21]:
df['video_start_timestamp_hkt'] = pd.to_datetime(df['video_start_timestamp_hkt'])

In [22]:
df['video_start_hour'] = df['video_start_timestamp_hkt'].apply(lambda x: x.hour)

In [23]:
df['video_end_timestamp_hkt'] = pd.to_datetime(df['video_end_timestamp_hkt'])

In [24]:
df['video_end_hour'] = df['video_end_timestamp_hkt'].apply(lambda x: x.hour)

In [25]:
for col in df.columns:
    is_null = df[df[col].isnull()]
    if len(is_null)>0:
        print(col)

user_create_date
eps_des
sri_des
user_age


In [26]:
df['eps_des'] = df['eps_des'].fillna("No episode description")

In [27]:
df['sri_des'] = df['sri_des'].fillna("No Series description")

In [28]:
df['user_age']  = df['user_age'].fillna(int(np.mean(df['user_age']))) 

In [29]:
for col in df.columns:
    is_null = df[df[col].isnull()]
    if len(is_null)>0:
        print(col)

user_create_date


In [30]:
df.head()

Unnamed: 0,user_id,episode_duration,device_first_visit_date,video_start_timestamp_hkt,video_end_timestamp_hkt,user_create_date,eps_des,sri_des,platform_name,user_type,subscription_source,plan_platform,resolution,subtitle,screen_mode,device_network_mode,video_streaming_mode,cp_name,product_cat_name,product_lang_name,product_group_series_name,product_series_cms_id,device_first_visit_age,user_age,video_start_hour,video_end_hour
0,0ee9a5dcdfb58211cabaa69056abb395,4345,2016-11-26,2021-06-02 01:14:42,2021-06-02 01:14:57,2019-09-18,Young Jae says her farewell to Joon Young and ...,Love may be found from something very simple. ...,IOS Tablet,Premium,Unknown,Unknown,240p,English,portrait,Non-mobile,remote,jtbc,Korean Dramas,English,The Third Charm,19809,2074,1048.0,1,1
1,43d65e495417ac26a3cb7e47f3f3fed8,1507,2018-01-03,2021-05-26 22:37:39,2021-05-26 23:04:31,2017-11-21,Sabina accuses Dong Joo of stealing her toothb...,Four people from very different backgrounds wi...,IOS Tablet,Free,Unknown,Unknown,480p,English,landscape,Non-mobile,remote,MBC,Korean Dramas,English,Golden Garden,29142,1671,1714.0,22,23
2,4571e3c70557e41bc4a50d43b7755a37,1646,2019-11-19,2021-05-21 15:47:21,2021-05-21 16:10:41,2020-06-03,It's the anniversary of Pil Seung's parents' d...,Four people from very different backgrounds wi...,IOS Tablet,Free,Unknown,Unknown,480p,English,portrait,Non-mobile,remote,MBC,Korean Dramas,English,Golden Garden,29142,986,789.0,15,16
3,6bb61e3b7bce0931da574d19d1d82c88,2837,2021-06-05,2021-07-29 12:47:37,2021-07-29 12:55:35,NaT,"Raised in a brothel, Wei Xiao Bao is quite a t...",This story revolves the adventure of Wei Xiao ...,Android Mobile,Anonymous,Unknown,Unknown,480p,English,portrait,Non-mobile,remote,Cathay Media Group,Chinese Dramas,English,Royal Tramp,32127,422,1460.0,12,12
4,cb75cdaaba40d64c818722a857f01d0f,1620,2020-06-09,2021-06-03 23:17:34,2021-06-03 23:25:23,2018-06-24,Nam Hee has nightmares about losing her vision...,Four people from very different backgrounds wi...,IOS Tablet,Premium,IOS IAP,iPhone,480p,English,portrait,Non-mobile,remote,MBC,Korean Dramas,English,Golden Garden,29142,783,1499.0,23,23


### 4)  Features engineering

### numeric features

In [31]:
numeric_features = ['episode_duration','device_first_visit_age','user_age','video_start_hour','video_end_hour']

In [32]:
# import matplotlib.pyplot as plt

In [33]:
from sklearn.preprocessing import MinMaxScaler

In [34]:
scaler = MinMaxScaler()

In [35]:
scaler.fit(df[numeric_features])

In [36]:
df[numeric_features] = scaler.transform(df[numeric_features])

In [37]:
save_object("artifacts/numeric_scaler.pkl", scaler)

In [38]:
df['sri_des'] = df.apply(lambda x:f"{x['product_group_series_name']}: "+x['sri_des'],axis = 1) 

### category features

In [39]:
catg_feature_col  = ['platform_name','user_type','subscription_source','plan_platform','resolution',
                    'subtitle','screen_mode','device_network_mode','video_streaming_mode','cp_name',
                    "product_cat_name","product_lang_name","product_series_cms_id"]

In [40]:
col_value_to_index_dict = {}
for col in catg_feature_col:
    unique_value = set(df[col])
    value_dict = {val:index for (index,val) in enumerate(unique_value)}
    col_value_to_index_dict[col] = value_dict
    df[col] = df[col].map(value_dict)

In [41]:
save_object("./artifacts/col_value_to_index_dict.pkl",col_value_to_index_dict)

In [42]:
df_series = df[['product_series_cms_id','cp_name','product_lang_name','product_cat_name','sri_des']]

In [43]:
df_series = df_series.drop_duplicates()

In [44]:
df_series = df_series.reset_index(drop=True)

In [45]:
save_object("./artifacts/series_table.pkl",df_series)

### 6) Select Col Again

In [46]:
df = df[['user_id',
 'episode_duration',
 'video_start_timestamp_hkt',
 'sri_des',
 'platform_name',
 'user_type',
 'subscription_source',
 'plan_platform',
 'resolution',
 'subtitle',
 'screen_mode',
 'device_network_mode',
 'video_streaming_mode',
 'cp_name',
 'product_cat_name',
 'product_lang_name',
 'product_series_cms_id',
 'device_first_visit_age',
 'user_age',
 'video_start_hour',
 'video_end_hour']]

In [47]:
# example = df.head(6).copy()

# example.iloc[5]

In [48]:


# last_view = example.iloc[5].to_dict()

# negative_view = df_series.iloc[782].to_dict()

# last_view.update(negative_view)

# example.iloc[5]=last_view

# example.iloc[5]

In [49]:
save_object("./artifacts/features_table.pkl",df)

### 7)  Prepration training data

In [50]:
num_negative = 5 # negative sampling
window_size = 6

In [51]:
def get_user_historical_veiw_sequence(df_user,window_size = 6):
    
    n_slide = len(df_user)-window_size+1
    
    df_list = []
    
    df_user = df_user.sort_values(by='video_start_timestamp_hkt')
    
    sequence_id = 0
    
    for slide in range(n_slide):
        df_positive = df_user.iloc[slide:slide+window_size].copy()
        df_positive['sequence_id'] = sequence_id
        df_positive['label'] = 1
        
        future_series_ids = set(df_user[slide+window_size-1:]['product_series_cms_id'])
        
        # negative sampling
        negative_series_ids= np.random.choice(list(df_series.index),num_negative)
        
        for neg_id in negative_series_ids:
            if neg_id not in future_series_ids:
                
                sequence_id+=1
                
                negative_view = df_series.iloc[neg_id].to_dict()
                
                df_negative = df_positive.copy()
                df_negative['sequence_id'] = sequence_id
                
                last_view = df_negative.iloc[window_size-1].to_dict()
                last_view.update(negative_view)
                
                df_negative.iloc[window_size-1]=last_view
                df_negative['label'] = 0
                df_list.append(df_negative)
                   
        sequence_id+=1
        
        df_list.append(df_positive)
    # due to limit of memory    
    sample_ratio = 0.3
    sample_size = int(sample_ratio * len(df_list)) + 1
    df_sample_list = random.sample(df_list,sample_size)
    return pd.concat(df_sample_list)

In [52]:
user_list = shuffle(list(set(df['user_id'])))

In [53]:
sample_size = 0.5
sample_size = int(sample_size * len(user_list))
user_list = user_list[:sample_size]

In [54]:
# due to the out of memeory, we limit the size of each user view history
df_list = []
for user_id in tqdm(user_list):
    df_user = df[df['user_id']==user_id]
    if len(df_user)>=window_size:
        df_user = get_user_historical_veiw_sequence(df_user,window_size)
        df_list.append(df_user)

100%|██████████| 883/883 [12:48<00:00,  1.15it/s]


In [58]:
df_dataset = pd.concat(df_list)

In [59]:
save_object("./artifacts/df_dataset.pkl",df_dataset)

In [60]:
len(df_dataset)

1168680

In [61]:
from collections import Counter

In [62]:
Counter(df_dataset['label'])

Counter({0: 973758, 1: 194922})