In [1]:
import utils
import pandas as pd

In [2]:
import torch
from torch import nn
from transformers import AutoTokenizer
from transformers import AutoModel,AutoConfig

In [3]:
pd.set_option("display.max_columns",None)

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Checkpoint name; reused further down to load the matching model config.
pretrained_model_name = "albert-base-v2"
# NOTE(review): `add_prefix_space` is normally a byte-level-BPE tokenizer option;
# presumably it has no effect for this checkpoint — confirm or drop it.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name, add_prefix_space=False)

In [5]:
# Separator token string of the loaded tokenizer.
sep_token = tokenizer.sep_token

### 1) Read data

In [7]:
# Load the preprocessed artifacts produced by an earlier pipeline step.
# NOTE(review): these look like pickle files; unpickling executes arbitrary code,
# so only load artifacts from a trusted source.
df_agg_dataset = utils.open_object("./artifacts/df_agg_dataset.pkl")

# Numeric feature names are taken from the fitted scaler so they match its columns.
numeric_scaler = utils.open_object("./artifacts/numeric_scaler.pkl")
numeric_features = list(numeric_scaler.feature_names_in_)

# Categorical feature names are the keys of the value -> index mapping;
# the size of each per-feature mapping later sets the embedding vocabulary size.
category_value_map_dict = utils.open_object("./artifacts/col_value_to_index_dict.pkl")
catergory_features = list(category_value_map_dict)

text_features = ['sri_des']

# Columns of the series table; used later to decide which categorical
# features get the larger "series" embedding treatment.
df_series = utils.open_object("./artifacts/series_table.pkl")
series_features = set(df_series.columns)


### 2) Dataset Preparation

In [8]:
from torch.utils.data import Dataset, DataLoader

In [9]:
class ViewDataSet(Dataset):
    """Torch dataset that turns one row of the aggregated frame into model inputs.

    Each item is a dict of tensors containing:
      * every column listed in ``numeric_features`` and ``catergory_features``
        plus ``'label'`` (read from the notebook globals), and
      * the tokenizer outputs for the sentence pair
        (``next_sri_des``, ``hist_sri_des``), padded/truncated to ``max_length``.

    Args:
        df_agg_dataset: DataFrame holding the feature columns, ``'label'``,
            ``'next_sri_des'`` and ``'hist_sri_des'``.
        max_length: Token length every sample is padded/truncated to
            (default 512, the value that was previously hard-coded).

    Note:
        Relies on the module-level ``tokenizer``, ``numeric_features`` and
        ``catergory_features`` defined in earlier cells.
    """

    def __init__(self, df_agg_dataset, max_length=512):
        self.df_agg_dataset = df_agg_dataset
        self.len = len(df_agg_dataset)
        self.max_length = max_length
        self.non_text_features_label = numeric_features + catergory_features + ['label']

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        # A list indexer keeps the result a one-row DataFrame, so the column
        # accessors below return length-1 lists (the batch form the tokenizer accepts).
        data_item = self.df_agg_dataset.iloc[[index]]

        tokenized_inputs = tokenizer(
            text=data_item['next_sri_des'].tolist(),
            text_pair=data_item['hist_sri_des'].tolist(),
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )

        features_dict = data_item[self.non_text_features_label].to_dict("list")
        features_dict.update(tokenized_inputs)

        # Drop only the leading one-row axis. A bare squeeze() would also remove
        # any other size-1 axis, collapsing a length-1 sequence feature to a 0-d
        # tensor and breaking the per-sequence pooling done in the model.
        return {k: torch.squeeze(torch.tensor(v), 0) for k, v in features_dict.items()}

In [10]:
# Wrap the aggregated frame loaded above in the torch Dataset.
view_dataset = ViewDataSet(df_agg_dataset)

In [11]:
# Fetch a single item (row position 2) as a smoke test of __getitem__.
example = view_dataset[2]

In [12]:
# Batches of 18 samples in shuffled order. No RNG seed is set in the visible
# cells, so the batch composition differs from run to run.
view_dataset_loader = DataLoader(view_dataset, batch_size=18, shuffle=True)

In [13]:
# Grab the first batch for the smoke test of the model below.
# next(iter(...)) raises StopIteration on an empty loader instead of silently
# leaving `inputs` undefined, as the previous for/break form did.
inputs = next(iter(view_dataset_loader))

### 3) Model

In [14]:
# config
# Start from the pretrained checkpoint's config and attach the extra
# hyper-parameters that VideoRecommender reads.
model_config = AutoConfig.from_pretrained(pretrained_model_name)
# Was misspelled `num_lables`, which only created an unused attribute.
model_config.num_labels = 2
# NOTE(review): the pooling layer is normally selected by a model constructor
# argument, not a config field — this assignment presumably has no effect; confirm.
model_config.add_pooling_layer = False
# Width of the non-series categorical embeddings.
# NOTE(review): ALBERT's own config also uses `embedding_size` (token-embedding
# width), so this assignment shrinks the encoder's token embeddings to 4 as well —
# the printed model shows `word_embeddings: Embedding(30000, 4)`. Consider a
# distinct attribute name for the categorical width.
model_config.embedding_size = 4
# Width of the embeddings for categorical features that are series-table columns.
model_config.series_embedding_size = 16
model_config.target_feature = 'product_series_cms_id'
model_config.catergory_features = catergory_features
model_config.numeric_features = numeric_features
# Stored as a list: the config is serialized to JSON when it is saved, and a
# `set` is not JSON-serializable. Membership tests (`in`) behave the same.
model_config.series_features = list(series_features)
# Output width of the linear projection applied to the encoder's first-token state.
model_config.bert_output_size = 32
# MLP layer widths. hidden_sizes[0] must equal the width of the concatenated
# [text, categorical-embedding, numeric] feature vector fed to the MLP.
model_config.hidden_sizes = [201,128,64,32]
model_config.dropout = 0.1

In [15]:
class MLP(nn.Module):
    """Feed-forward stack of Linear -> LeakyReLU -> Dropout stages.

    Args:
        hidden_sizes: Layer widths; consecutive entries give the in/out
            features of each Linear layer, so ``len(hidden_sizes) - 1``
            stages are created (none when fewer than two widths are given).
        dropout: Dropout probability applied after every activation.
    """

    def __init__(self,hidden_sizes,dropout = 0.1) -> None:
        super().__init__()

        stages = []
        # Walk adjacent width pairs: (w0, w1), (w1, w2), ...
        for fan_in, fan_out in zip(hidden_sizes[:-1], hidden_sizes[1:]):
            stages.extend((
                nn.Linear(in_features=fan_in, out_features=fan_out, bias=True),
                nn.LeakyReLU(),
                nn.Dropout(p=dropout),
            ))

        self.mlp = nn.Sequential(*stages)

    def forward(self,x):
        """Run ``x`` through every stage in order and return the result."""
        return self.mlp(x)
        

In [16]:
class VideoRecommender(nn.Module):
    """Scores one (viewing history, candidate) sample with a value in (0, 1).

    Three feature groups are concatenated and passed through an MLP and a
    single-output linear "ranker" followed by a sigmoid:

      * text: the encoder's first-token hidden state, projected to
        ``model_config.bert_output_size``;
      * categorical: one embedding table per feature in
        ``model_config.catergory_features``; features that are also in
        ``model_config.series_features`` are pooled with
        ``mean_pool_concat_embedding``, the others are mean-pooled over dim 1;
      * numeric: every feature in ``model_config.numeric_features`` as one column.

    Args:
        model_config: Transformers config carrying the extra attributes read
            here (``bert_output_size``, ``embedding_size``,
            ``series_embedding_size``, ``catergory_features``,
            ``numeric_features``, ``series_features``, ``hidden_sizes``,
            ``dropout``).

    Note:
        Embedding vocabulary sizes come from the module-level
        ``category_value_map_dict``. The encoder is built with
        ``AutoModel.from_config``, i.e. from the config only.
    """

    def __init__(self,model_config):
        super().__init__()
        # Recorded for callers only; nothing in this class moves the module or
        # its inputs to this device.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_config  = model_config

        # bert
        self.bert = AutoModel.from_config(model_config)
        self.bert_linear = nn.Linear(self.model_config.hidden_size,self.model_config.bert_output_size, bias=False)

        # category embedding
        # Iterate the config's feature list (the same one forward() uses) so the
        # embedding tables and the forward pass can never disagree.
        self.feature_embedding_dict = nn.ModuleDict()
        for feature in self.model_config.catergory_features:
            if feature in self.model_config.series_features:
                embedding_dim = self.model_config.series_embedding_size
            else:
                embedding_dim = self.model_config.embedding_size

            category_embeddings = nn.Embedding(len(category_value_map_dict[feature]), embedding_dim)
            # Symmetric uniform init. The previous bounds were (-0.5, -0.5), which
            # set every weight to the constant -0.5 and made all rows identical.
            nn.init.uniform_(category_embeddings.weight, -0.5, 0.5)
            self.feature_embedding_dict[feature] = category_embeddings

        # mlp
        self.mlp = MLP(self.model_config.hidden_sizes,dropout=self.model_config.dropout)

        self.ranker = nn.Linear(in_features = self.model_config.hidden_sizes[-1],
                                out_features  = 1,bias=True)

    def mean_pool_concat_embedding(self,embeddings_value):
        """Pool a sequence of embeddings into ``[mean(history), next]``.

        Args:
            embeddings_value: Tensor indexed as (batch, position, dim). The last
                position is treated as the "next" item and all earlier
                positions as the history.

        Returns:
            Tensor of shape (batch, 2 * dim): the history mean concatenated
            with the embedding at the last position.

        Note:
            With a single position the history slice is empty, so its mean is NaN.
        """
        embeddings_hist_value = embeddings_value[:,:-1,:]
        # Take the last position of the FULL sequence. Indexing the history
        # slice here (as before) returned the last history item and silently
        # dropped the actual next item.
        embeddings_next_value = embeddings_value[:,-1,:]
        embeddings_hist_mean_value = torch.mean(embeddings_hist_value,dim = 1)
        embeddings_output = torch.cat(
            (embeddings_hist_mean_value,embeddings_next_value),dim=1)

        return embeddings_output

    def forward(self,inputs):
        """Compute the score for every sample in the batch.

        Args:
            inputs: Mapping with ``input_ids``, ``token_type_ids`` and
                ``attention_mask`` plus one entry per categorical and numeric
                feature named in the config. Categorical entries are pooled
                over dim 1, so they are assumed to be (batch, seq) index
                tensors — TODO confirm against the dataset.

        Returns:
            Tensor of shape (batch, 1) with sigmoid-activated scores.
        """

        # bert: keep only the hidden state of the first token, then project it.
        bert_encode = self.bert(input_ids=inputs['input_ids'],
                                token_type_ids=inputs['token_type_ids'],
                                attention_mask=inputs['attention_mask'])
        bert_encode = bert_encode.last_hidden_state[:, 0]
        bert_encode = self.bert_linear(bert_encode)

        # embedding
        embedding_tensors_list = []
        for feature in self.model_config.catergory_features:
            embedding_ids = inputs[feature]

            embedding_tensors = self.feature_embedding_dict[feature](embedding_ids)
            if feature in self.model_config.series_features:
                embedding_tensors = self.mean_pool_concat_embedding(embedding_tensors)
            else:
                embedding_tensors = torch.mean(embedding_tensors,dim = 1)

            embedding_tensors_list.append(embedding_tensors)

        embedding_encode = torch.cat(embedding_tensors_list,dim=1)

        #numeric: each feature becomes one (batch, 1) column.
        numeric_tensors_list = []
        for feature in self.model_config.numeric_features:
            tensors  = inputs[feature].view(-1,1)
            numeric_tensors_list.append(tensors)

        numeric_encode = torch.cat(numeric_tensors_list,dim=1)

        # The concatenated width must match model_config.hidden_sizes[0].
        all_features_encode = torch.cat(
            [bert_encode,embedding_encode,numeric_encode],
            dim=1)

        all_features_encode = self.mlp(all_features_encode)

        scores = self.ranker(all_features_encode)

        scores = torch.sigmoid(scores)

        return scores
        
        
        
        
        
        
        
        
        
                


In [17]:
# Build the model from the config assembled above.
video_recommender = VideoRecommender(model_config)

In [18]:
# Display the module tree of the instantiated model.
video_recommender

VideoRecommender(
  (bert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 4, padding_idx=0)
      (position_embeddings): Embedding(512, 4)
      (token_type_embeddings): Embedding(2, 4)
      (LayerNorm): LayerNorm((4,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=4, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
               

In [19]:
# Smoke test: one forward pass on the first batch taken from the loader.
# The module is left in its default mode and gradients are tracked
# (the displayed result carries a grad_fn).
outputs = video_recommender(inputs)

In [20]:
# Display the per-sample scores, one row per sample in the batch.
outputs

tensor([[0.5245],
        [0.5255],
        [0.5264],
        [0.5246],
        [0.5236],
        [0.5269],
        [0.5276],
        [0.5268],
        [0.5236],
        [0.5278],
        [0.5280],
        [0.5237],
        [0.5240],
        [0.5290],
        [0.5235],
        [0.5249],
        [0.5204],
        [0.5219]], grad_fn=<SigmoidBackward0>)