In [1]:
import utils
import pandas as pd

In [2]:
import torch
from torch import nn
from transformers import AutoTokenizer
from transformers import AutoModel,AutoConfig

In [3]:
pd.set_option("display.max_columns",None)

import warnings
warnings.filterwarnings('ignore')

In [4]:
pretrained_model_name = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name, add_prefix_space=False)

In [5]:
sep_token = tokenizer.sep_token

### 1) read data

In [6]:
df_agg_dataset = utils.open_object("./artifacts/df_agg_dataset.pkl")

In [7]:
numeric_scaler = utils.open_object("artifacts/numeric_scaler.pkl")

In [8]:
numeric_features = list(numeric_scaler.feature_names_in_)

In [9]:
category_value_map_dict = utils.open_object("./artifacts/col_value_to_index_dict.pkl")

In [10]:
catergory_features = list(category_value_map_dict)

In [11]:
text_features = ['sri_des']

In [12]:
df_series = utils.open_object("./artifacts/series_table.pkl")

In [13]:
series_features = set(list(df_series.columns))


### 2) Dataset Preparation


In [14]:
from torch.utils.data import Dataset, DataLoader

In [15]:
class ViewDataSet(Dataset):
    """Map-style dataset that turns one row of the aggregated table into tensors.

    The class reads three notebook-level names that must exist before it is
    used: ``tokenizer`` (called on the two text columns), ``numeric_features``
    and ``catergory_features`` (lists of column names).
    """

    def __init__(self, df_agg_dataset):
        """Store the dataframe and the list of non-text columns to export.

        Args:
            df_agg_dataset: DataFrame containing the text columns
                ``next_sri_des`` and ``hist_sri_des``, every column named in
                ``numeric_features`` / ``catergory_features`` and ``label``.
        """
        self.df_agg_dataset = df_agg_dataset
        self.len = len(df_agg_dataset)
        self.non_text_features_label = numeric_features + catergory_features + ['label']

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return self.len

    def __getitem__(self, index):
        """Return a ``{name: tensor}`` dict for the row at position ``index``.

        The dict holds one entry per non-text column plus whatever keys the
        tokenizer returns for the (``next_sri_des``, ``hist_sri_des``) pair.
        """
        # Selecting with a one-element list keeps the result a DataFrame, so
        # every column below is a length-1 list ("batch of one row").
        data_item = self.df_agg_dataset.iloc[[index]]

        tokenized_inputs = tokenizer(
            text=data_item['next_sri_des'].tolist(),
            text_pair=data_item['hist_sri_des'].tolist(),
            add_special_tokens=True,
            max_length=512,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )

        features_dict = data_item[self.non_text_features_label].to_dict("list")
        features_dict.update(tokenized_inputs)

        # Drop only the leading "one row" axis. A bare torch.squeeze() would
        # also remove any other size-1 axis, e.g. it would turn a sequence
        # feature of length 1 into a 0-d scalar and lose its sequence axis.
        features_dict = {
            k: torch.squeeze(torch.tensor(v), 0) for k, v in features_dict.items()
        }

        return features_dict

In [16]:
view_dataset = ViewDataSet(df_agg_dataset)

In [17]:
example = view_dataset[2]

In [18]:
# example

In [19]:
view_dataset_loader = DataLoader(view_dataset, batch_size=18, shuffle=True)

In [20]:
# Grab a single batch to prototype the model pieces below. Unlike a
# `for ... break` loop, next() raises StopIteration on an empty loader instead
# of silently leaving `inputs` undefined (or stale from a previous run).
inputs = next(iter(view_dataset_loader))

### 3) Model

In [21]:
# config
# Load the checkpoint's configuration and attach the extra hyper-parameters
# and feature-name lists that the model cells below read back from it.
model_config = AutoConfig.from_pretrained(pretrained_model_name)
# NOTE(review): spelled `num_lables`; the Hugging Face config field is
# presumably `num_labels` -- confirm which spelling downstream code reads.
model_config.num_lables = 2
model_config.add_pooling_layer = False
# Width of the embedding tables for non-series categorical features.
# NOTE(review): ALBERT's config appears to define its own `embedding_size`
# (token-embedding width). If so, this assignment also changes the ALBERT model
# later built from this config -- confirm, and consider a distinct name.
model_config.embedding_size = 4
# Width of the embedding tables for categorical features listed in series_features.
model_config.series_embedding_size = 16
model_config.target_feature = 'product_series_cms_id' 
model_config.catergory_features = catergory_features
model_config.numeric_features = numeric_features
model_config.series_features = series_features
# Output width of the linear projection applied to the first-token BERT state.
model_config.bert_output_size = 32

In [22]:
class self:
    """Empty namespace deliberately named ``self``.

    Later cells assign attributes on it (``self.model_config``, ``self.bert``,
    ...) so that prototype statements can be written exactly as they would
    appear inside an ``nn.Module`` method.
    """
    pass

In [23]:
self.model_config = model_config

In [24]:
self.device = "cuda" if torch.cuda.is_available() else "cpu"
# self.catergory_features = self.model_config.catergory_features
# self.numeric_features = self.model_config.numeric_features
# self.series_features = self.model_config.series_features

#### bert

In [25]:
self.bert = AutoModel.from_config(model_config,add_pooling_layer = self.model_config.add_pooling_layer)

In [26]:
bert_encode = self.bert(input_ids=inputs['input_ids'],token_type_ids=inputs['token_type_ids'],
          attention_mask=inputs['attention_mask'])

In [27]:
self.bert_linear = nn.Linear(self.model_config.hidden_size,self.model_config.bert_output_size, bias=False)

In [28]:
bert_encode = bert_encode.last_hidden_state[:, 0]

bert_encode = self.bert_linear(bert_encode)

bert_encode.shape

torch.Size([18, 32])

#### embedding

In [29]:
# One embedding table per categorical feature, registered in a ModuleDict so
# the tables are tracked as sub-modules. Features listed in
# model_config.series_features use series_embedding_size; all others use
# embedding_size.
self.feature_embedding_dict = nn.ModuleDict()

for feature in catergory_features:
    vocab_size = len(category_value_map_dict[feature])
    if feature in self.model_config.series_features:
        embedding_dim = self.model_config.series_embedding_size
    else:
        embedding_dim = self.model_config.embedding_size

    category_embeddings = nn.Embedding(vocab_size, embedding_dim)
    # The previous call was uniform_(-0.5, -0.5): identical bounds set every
    # weight to exactly -0.5, so all categories shared the same vector.
    category_embeddings.weight.data.uniform_(-0.5, 0.5)
    self.feature_embedding_dict[feature] = category_embeddings

In [30]:
self.feature_embedding_dict

ModuleDict(
  (platform_name): Embedding(4, 4)
  (user_type): Embedding(3, 4)
  (subscription_source): Embedding(6, 4)
  (plan_platform): Embedding(38, 4)
  (resolution): Embedding(4, 4)
  (subtitle): Embedding(13, 4)
  (screen_mode): Embedding(3, 4)
  (device_network_mode): Embedding(3, 4)
  (video_streaming_mode): Embedding(3, 4)
  (cp_name): Embedding(65, 16)
  (product_cat_name): Embedding(40, 16)
  (product_lang_name): Embedding(3, 16)
  (product_series_cms_id): Embedding(1526, 16)
)

In [31]:
def mean_pool_concat_embedding(embeddings_value):
    """Summarise a sequence of embeddings as ``[mean(history), next]``.

    Args:
        embeddings_value: tensor indexed as (batch, step, dim). The last step
            is treated as the "next" item and all earlier steps as history.

    Returns:
        Tensor of shape (batch, 2 * dim): the mean over the history steps
        concatenated with the embedding of the last step.

    Note:
        With a single step the history slice is empty and its mean is NaN.
    """
    embeddings_hist_value = embeddings_value[:, :-1, :]
    # Take the last step from the full input. Slicing the history here would
    # return the second-to-last step and never look at the "next" item.
    embeddings_next_value = embeddings_value[:, -1, :]
    embeddings_hist_mean_value = torch.mean(embeddings_hist_value, dim=1)
    embeddings_output = torch.cat(
        (embeddings_hist_mean_value, embeddings_next_value), dim=1)

    return embeddings_output

In [32]:
# Embed each categorical feature of the batch and reduce it to one vector per
# sample: series features go through mean_pool_concat_embedding, every other
# feature is averaged over dim 1. The per-feature vectors are then joined
# along dim 1.
embedding_tensors_list = []
series_feature_names = self.model_config.series_features
for name in self.model_config.catergory_features:
    embedded = self.feature_embedding_dict[name](inputs[name])
    if name in series_feature_names:
        pooled = mean_pool_concat_embedding(embedded)
    else:
        pooled = torch.mean(embedded, dim=1)
    embedding_tensors_list.append(pooled)

embedding_features_tensors = torch.concat(embedding_tensors_list, dim=1)

In [33]:
embedding_features_tensors.shape

torch.Size([18, 164])

#### numeric

In [34]:
# Reshape every numeric feature of the batch to a column (-1, 1) and place the
# columns side by side along dim 1.
numeric_tensors_list = [
    inputs[name].view(-1, 1) for name in self.model_config.numeric_features
]

numeric_features_tesors = torch.concat(numeric_tensors_list, dim=1)

In [51]:
class MLP(nn.Module):
    """Feed-forward stack built from a list of layer widths.

    Each consecutive pair ``(hidden_sizes[i], hidden_sizes[i + 1])`` becomes a
    ``Linear -> LeakyReLU -> Dropout`` triple, including the final pair.
    """

    def __init__(self, hidden_sizes, dropout = 0.1) -> None:
        """Build the layer stack.

        Args:
            hidden_sizes: layer widths, input width first.
            dropout: drop probability used by every Dropout layer.
        """
        super().__init__()

        layers = []
        for fan_in, fan_out in zip(hidden_sizes[:-1], hidden_sizes[1:]):
            layers += [
                nn.Linear(in_features=fan_in, out_features=fan_out, bias=True),
                nn.LeakyReLU(),
                nn.Dropout(p=dropout),
            ]

        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        return self.mlp(x)
        

In [57]:
self.mlp  = MLP(hidden_sizes=[201,128,32,16,8])

In [55]:
all_features_input = torch.concat([bert_encode,embedding_features_tensors,numeric_features_tesors],dim=1)

In [56]:
all_features_input.shape

torch.Size([18, 201])

In [62]:
features_encode = self.mlp(all_features_input)

In [63]:
self.ranker = nn.Linear(in_features = 8,out_features  = 1,bias=True)

In [65]:
scores = self.ranker(features_encode)

In [68]:
torch.sigmoid(scores)

tensor([[0.4450],
        [0.4471],
        [0.4445],
        [0.4452],
        [0.4433],
        [0.4453],
        [0.4452],
        [0.4580],
        [0.4579],
        [0.4422],
        [0.4496],
        [0.4580],
        [0.4441],
        [0.4451],
        [0.4443],
        [0.4448],
        [0.4452],
        [0.4579]], grad_fn=<SigmoidBackward0>)

In [38]:
hidden_sizes = [201,64,32,8]

In [None]:
class 

In [None]:
self.bert_linear = nn.Linear(, bias=True)

In [None]:
self.

In [None]:
self.mlp = nn.ModuleList()

In [76]:
self.activation_fuct = nn.functional.gelu

In [None]:
class VedioRecommender(nn.Module):
    """Ranker that scores a sample from numeric, categorical and text features.

    Mirrors the step-by-step prototype cells above: numeric columns are used
    as-is, each categorical feature is embedded and pooled, the tokenized text
    pair is encoded by the transformer and projected, and the concatenation is
    fed through an MLP and a single-output linear "ranker".

    Relies on the notebook-level names ``category_value_map_dict`` (vocabulary
    per categorical feature) and ``MLP`` (defined in an earlier cell).

    NOTE(review): the encoder is built with ``AutoModel.from_config``, which
    presumably creates randomly initialised weights rather than loading the
    checkpoint -- confirm that this is intended.
    """

    def __init__(self, model_config):
        """Build all sub-modules from ``model_config``.

        Args:
            model_config: transformer config extended with ``catergory_features``,
                ``numeric_features``, ``series_features``, ``embedding_size``,
                ``series_embedding_size``, ``bert_output_size`` and
                ``add_pooling_layer``. An optional ``mlp_hidden_sizes`` overrides
                the default MLP widths ``(128, 32, 16, 8)``.
        """
        super().__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.model_config = model_config
        self.catergory_features = model_config.catergory_features
        self.numeric_features = model_config.numeric_features
        # These three were read through `self.` but never assigned before.
        self.series_features = model_config.series_features
        self.embedding_size = model_config.embedding_size
        self.series_embedding_size = model_config.series_embedding_size

        # Text encoder plus the projection of its first-token state.
        self.bert = AutoModel.from_config(model_config, add_pooling_layer=model_config.add_pooling_layer)
        self.bert_linear = nn.Linear(model_config.hidden_size, model_config.bert_output_size, bias=False)

        # ModuleDict (not a plain dict) so the tables are registered as
        # sub-modules: they then appear in parameters()/state_dict() and follow
        # .to(device).
        self.feature_embedding_dict = nn.ModuleDict()
        embedding_output_size = 0
        for feature in self.catergory_features:
            vocab_size = len(category_value_map_dict[feature])
            if feature in self.series_features:
                category_embeddings = nn.Embedding(vocab_size, self.series_embedding_size)
                # Series features contribute [mean(history), next].
                embedding_output_size += 2 * self.series_embedding_size
            else:
                category_embeddings = nn.Embedding(vocab_size, self.embedding_size)
                embedding_output_size += self.embedding_size

            # Bounds were (-0.5, -0.5), which set every weight to -0.5.
            category_embeddings.weight.data.uniform_(-0.5, 0.5)
            self.feature_embedding_dict[feature] = category_embeddings

        self.activation_fuct = nn.functional.gelu

        # Scoring head; default widths follow the prototype cell that builds
        # MLP(hidden_sizes=[201,128,32,16,8]) followed by a Linear(8, 1) ranker.
        mlp_input_size = (
            len(self.numeric_features) + embedding_output_size + model_config.bert_output_size
        )
        mlp_hidden_sizes = list(getattr(model_config, "mlp_hidden_sizes", (128, 32, 16, 8)))
        self.mlp = MLP(hidden_sizes=[mlp_input_size] + mlp_hidden_sizes)
        self.ranker = nn.Linear(in_features=mlp_hidden_sizes[-1], out_features=1, bias=True)

    def mean_pool_concatembedding(self, embeddings_value):
        """Return ``[mean(all steps but the last), last step]`` along dim 1.

        Args:
            embeddings_value: tensor indexed as (batch, step, dim).
        """
        embeddings_hist_value = embeddings_value[:, :-1, :]
        # The last step must come from the full input, not from the history
        # slice (which would yield the second-to-last step).
        embeddings_next_value = embeddings_value[:, -1, :]
        embeddings_hist_mean_value = torch.mean(embeddings_hist_value, dim=1)
        embeddings_output = torch.concat(
            (embeddings_hist_mean_value, embeddings_next_value), 1)

        return embeddings_output

    def forward(self, inputs):
        """Score a batch.

        Args:
            inputs: mapping with one tensor per numeric and categorical feature
                plus ``input_ids``, ``token_type_ids`` and ``attention_mask``.

        Returns:
            Tensor of raw scores with one column per sample row; apply
            ``torch.sigmoid`` to obtain values in (0, 1).
        """
        # numeric features: one column per feature
        numeric_tensors_list = [
            inputs[feature].view(-1, 1).float() for feature in self.numeric_features
        ]
        numeric_features_tesors = torch.concat(numeric_tensors_list, dim=1)

        # category embedding features
        embedding_tensors_list = []
        for feature in self.catergory_features:
            embedding_tensors = self.feature_embedding_dict[feature](inputs[feature])
            if feature in self.series_features:
                embedding_tensors = self.mean_pool_concatembedding(embedding_tensors)
            else:
                embedding_tensors = torch.mean(embedding_tensors, dim=1)
            embedding_tensors_list.append(embedding_tensors)

        embedding_features_tensors = torch.concat(embedding_tensors_list, dim=1)

        # text features: first-token hidden state, projected
        bert_encode = self.bert(input_ids=inputs['input_ids'], token_type_ids=inputs['token_type_ids'],
                                attention_mask=inputs['attention_mask'])
        bert_encode = bert_encode.last_hidden_state[:, 0]
        bert_encode = self.bert_linear(bert_encode)

        # These are tensors, so they are joined with torch, not pandas.
        all_features = torch.concat(
            (numeric_features_tesors, embedding_features_tensors, bert_encode), dim=1)

        features_encode = self.mlp(all_features)
        return self.ranker(features_encode)
        
        
        
        
        

In [None]:
d

In [164]:
platform_name_embeddings_layer = nn.Embedding(len(category_value_map_dict['platform_name']), model_config.embedding_size)

In [165]:
platform_name_embeddings_value = platform_name_embeddings_layer(platform_name)

In [166]:
def mean_pool_concatembedding(embeddings_value):
    """Return ``[mean(history), next]`` for a sequence of embeddings.

    Args:
        embeddings_value: tensor indexed as (batch, step, dim); the last step
            is the "next" item, all earlier steps form the history.

    Returns:
        Tensor of shape (batch, 2 * dim).
    """
    embeddings_hist_value = embeddings_value[:, :-1, :]
    # Index the full input: taking [-1] of the history slice would return the
    # second-to-last step instead of the "next" item.
    embeddings_next_value = embeddings_value[:, -1, :]
    embeddings_hist_mean_value = torch.mean(embeddings_hist_value, dim=1)
    embeddings_output = torch.cat(
        (embeddings_hist_mean_value, embeddings_next_value), 1)

    return embeddings_output

In [168]:
mean_pool_concat_embedding(platform_name_embeddings_value ).shape

torch.Size([18, 64])

In [159]:
platform_name_embeddings_target_value = platform_name_embeddings_value[:,-1,:]

In [160]:
platform_name_embeddings_target_value.shape

torch.Size([18, 32])

In [152]:
platform_name_embeddings_hist_value.shape

torch.Size([18, 5, 32])

In [154]:
platform_name_embeddings_hist_value = torch.mean(platform_name_embeddings_hist_value,dim = 1)

In [156]:
platform_name_embeddings_hist_value.shape

torch.Size([18, 32])

In [161]:
platform_name_embeddings_output = torch.concat(
    (platform_name_embeddings_hist_value,platform_name_embeddings_target_value),1) 

In [162]:
platform_name_embeddings_output.shape

torch.Size([18, 64])

In [None]:
class VedioRecommender(nn.Module):
    """Draft recommender with one hand-written embedding table per categorical feature.

    NOTE(review): this cell rebinds the name ``VedioRecommender`` that an
    earlier cell also defines, so whichever cell runs last wins.

    NOTE(review): ``forward`` is unfinished. It reads names and attributes that
    are not defined anywhere in the visible notebook (``output_logit``,
    ``labels``, ``ModelOutput``, ``self.embeddings``,
    ``self.take_negative_samples``, ``self.num_lables``,
    ``self.negative_sample_size``, ``self.sampling_prob_dist``), so it cannot
    run as written.
    """
    
    def __init__(self, model_config):
        """Create the text encoder, its projection and the embedding tables.

        Vocabulary sizes come from the notebook-level ``category_value_map_dict``.
        """
        super().__init__()
        
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        
        self.model_config = model_config
        self.bert = AutoModel.from_config(model_config,add_pooling_layer = model_config.add_pooling_layer)
        
        # NOTE(review): `model_embedding_size` is not among the attributes set in
        # the notebook's config cell -- confirm where it is expected to come from.
        self.linear = nn.Linear(model_config.hidden_size,model_config.model_embedding_size,bias = False)
        
        # NOTE(review): every uniform_(-0.5,-0.5) call below has identical lower
        # and upper bounds, which fills the table with the constant -0.5;
        # presumably (-0.5, 0.5) was intended.
        self.user_type_embeddings = nn.Embedding(len(category_value_map_dict['user_type']), model_config.embedding_size)
        self.user_type_embeddings.weight.data.uniform_(-0.5,-0.5)
        
        self.subscription_source_embeddings = nn.Embedding(len(category_value_map_dict['subscription_source']), model_config.embedding_size)
        self.subscription_source_embeddings.weight.data.uniform_(-0.5,-0.5)
        
        self.plan_platform_embeddings = nn.Embedding(len(category_value_map_dict['plan_platform']), model_config.embedding_size)
        self.plan_platform_embeddings.weight.data.uniform_(-0.5,-0.5)
        
        self.resolution_embeddings = nn.Embedding(len(category_value_map_dict['resolution']), model_config.embedding_size)
        self.resolution_embeddings.weight.data.uniform_(-0.5,-0.5)
        
        self.screen_mode_embeddings = nn.Embedding(len(category_value_map_dict['screen_mode']), model_config.embedding_size)
        self.screen_mode_embeddings.weight.data.uniform_(-0.5,-0.5)
        
        self.device_network_mode_embeddings = nn.Embedding(len(category_value_map_dict['device_network_mode']), model_config.embedding_size)
        self.device_network_mode_embeddings.weight.data.uniform_(-0.5,-0.5)
        
        self.video_streaming_mode_embeddings = nn.Embedding(len(category_value_map_dict['video_streaming_mode']), model_config.embedding_size)
        self.video_streaming_mode_embeddings.weight.data.uniform_(-0.5,-0.5)
        
        self.cp_name_embeddings = nn.Embedding(len(category_value_map_dict['cp_name']), model_config.embedding_size)
        self.cp_name_embeddings.weight.data.uniform_(-0.5,-0.5)
        
        self.product_cat_name_embeddings = nn.Embedding(len(category_value_map_dict['product_cat_name']), model_config.embedding_size)
        # NOTE(review): this line re-initialises user_type_embeddings; the
        # product_cat_name table created just above is not touched here
        # (looks like a copy-paste slip).
        self.user_type_embeddings.weight.data.uniform_(-0.5,-0.5)
        
        self.product_lang_name_embeddings = nn.Embedding(len(category_value_map_dict['product_lang_name']), model_config.embedding_size)
        self.product_lang_name_embeddings.weight.data.uniform_(-0.5,-0.5)
        
        self.product_series_cms_id_embeddings = nn.Embedding(len(category_value_map_dict['product_series_cms_id']), model_config.embedding_size)
        self.product_series_cms_id_embeddings.weight.data.uniform_(-0.5,-0.5)


    def mean_pool_concatembedding(self,embeddings_value):
        """Concatenate the mean over all-but-the-last step with a single step.

        NOTE(review): the single step is sliced from ``embeddings_hist_value``,
        i.e. it is the second-to-last step of the input; presumably the last
        step of ``embeddings_value`` was intended.
        """
        embeddings_hist_value = embeddings_value[:,:-1,:]
        embeddings_next_value = embeddings_hist_value[:,-1,:]
        embeddings_hist_mean_value = torch.mean(embeddings_hist_value,dim = 1)
        embeddings_output = torch.concat(
            (embeddings_hist_mean_value,embeddings_next_value),1) 

        return embeddings_output

    def forward(self,inputs):
        """Unfinished forward pass; see the class-level review notes.

        Args:
            inputs: mapping read with the keys ``episode_duration``,
                ``device_first_visit_age``, ``user_age``, ``video_start_hour``,
                ``video_end_hour``, ``platform_name`` and ``input_ids``; the
                whole mapping is also unpacked into the encoder call.
        """
        
        #numeric features
        episode_duration  = inputs['episode_duration'].view(-1,1)
        device_first_visit_age = inputs['device_first_visit_age'].view(-1,1)
        user_age = inputs['user_age'].view(-1,1)
        video_start_hour = inputs['video_start_hour'].view(-1,1)
        video_end_hour = inputs['video_end_hour'].view(-1,1)
        
        
        # NOTE(review): this local is built but never read again in this method.
        numeric_features = torch.concat(
            (episode_duration,device_first_visit_age,user_age,video_start_hour,video_end_hour),
            dim=1)
        

        # embedding feature
        # NOTE(review): `platform_name` ids are looked up in the `plan_platform`
        # table, and __init__ creates no platform_name table. The helper is also
        # called without `self.`, so it resolves to a module-level function of
        # that name if one exists. The result is not used afterwards.
        platform_name_inputs = inputs['platform_name']
        plan_platform_embeddings = self.plan_platform_embeddings(platform_name_inputs)
        plan_platform_embeddings = mean_pool_concatembedding(plan_platform_embeddings)
        
        
        
        
        
        
        
        
        
        
        
        
        # NOTE(review): **inputs forwards every key of the mapping (including the
        # numeric/categorical ones read above) to the encoder -- confirm the
        # encoder accepts them.
        encoder_outputs = self.bert(**inputs)

        sequence_output = encoder_outputs[0]

        # First-token vector of the encoder output.
        last_hidden_state = sequence_output[:, 0]
        
        bert_encode = self.linear(last_hidden_state)
        
        logits = None
        total_loss= None
        # NOTE(review): initialises `pos_loss`, but the return statement reads
        # `pos_bert_loss`, which is only assigned inside the `labels` branch.
        pos_loss, neg_bert_loss = None, None
        
        # NOTE(review): `output_logit` and `labels` are neither parameters nor
        # assigned in this method.
        if output_logit:
            logits = torch.matmul(bert_encode,self.embeddings.weight.transpose(0, 1))
        
        if labels is not None:
            batch_size = inputs['input_ids'].shape[0]
            
            negative_samples = self.take_negative_samples(self.num_lables,
                                  self.negative_sample_size,
                                  batch_size,
                                  self.sampling_prob_dist).to(self.device)
                
            
            neg_embed = self.embeddings(negative_samples) # torch.Size([18, 2048, 256])
            label_embed = self.embeddings(labels) # torch.Size([18, 256])
            
            # positive
            # torch.Size([18, 256]) * torch.Size([18, 256])
            # Diagonal of the pairwise product = each sample's score against its
            # own label; the loss is -log(sigmoid(score)).
            pos_bert_logits = torch.diag(torch.matmul(label_embed, bert_encode.transpose(0, 1)))
            pos_bert_loss = -torch.log(1 / (1 + torch.exp(-pos_bert_logits))).mean()
            
            # negative
            # torch.Size([18, 2048, 256]) * torch.Size([18, 256])
            # Loss is -log(sigmoid(-score)) averaged over all negatives.
            neg_bert_logits = torch.matmul(neg_embed, torch.unsqueeze(bert_encode, dim=-1)).squeeze()
            neg_bert_loss = -torch.log(1 / (1 + torch.exp(neg_bert_logits))).mean()

            # Positive term is weighted 5x relative to the negative term.
            total_loss = 5*pos_bert_loss + neg_bert_loss
                
        
        return ModelOutput(total_loss,pos_bert_loss,neg_bert_loss,bert_encode,logits)