In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from datetime import date
import re
import torch
import matplotlib.pyplot as plt
from pacmap import PaCMAP
from transformers import AutoTokenizer, AutoModel
import arxiv
from sklearn.neighbors import KNeighborsTransformer
from IPython.display import clear_output
from datetime import date
from datetime import timedelta
from pymongo import MongoClient
from datetime import datetime
from torch import nn
from database_utils import *
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
device = 'cpu'
torch.device(device)

device(type='cpu')

In [11]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [2]:
class ClickedDataset(Dataset):
    def __init__(self, abstracts, item_vectors,abstract_map):
        self.item_vectors = item_vectors.astype(np.float32)
        self.abstracts = abstracts
        self.abstract_map = abstract_map

    def __len__(self):
        return len(self.item_vectors)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        abstract = self.abstracts[self.abstract_map[idx]]
        item_vecs = self.item_vectors[idx]
        
        sample = (abstract,item_vecs)

        return sample    
    
class paperBERT(nn.Module):
    def __init__(self, latent_dim):
        super(paperBERT, self).__init__()
        self.longformer = AutoModel.from_pretrained('allenai/longformer-base-4096')
        self.tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')
        self.linear1 = nn.Linear(768, 256)
        self.linear2 = nn.Linear(256, latent_dim)
        
    def forward(self,data):
        encoding = self.tokenizer.batch_encode_plus(data, return_tensors='pt', padding=True,
                                                       truncation=True, add_special_tokens = True).to(device)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        
        model_output = self.longformer(
               input_ids, 
               attention_mask=attention_mask)

        # sequence_output has the following shape: (batch_size, sequence_length, 768)
        sequence_output=  mean_pooling(model_output, attention_mask)
        print(sequence_output.shape)
        linear1_output = self.linear1(sequence_output.view(-1,768)) ## extract the 1st token's embeddings
        linear2_output = self.linear2(linear1_output)

        return linear2_output
    
    def train(self,data_loader,epochs):
        criterion = nn.MSELoss() ## If required define your own criterion
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))

        for epoch in range(epochs):
            for batch in data_loader: ## If you have a DataLoader()  object to get the data.
                
                data = list(batch[0])
                targets = batch[1] ## assuming that data loader returns a tuple of data and its targets
                
                optimizer.zero_grad()   
                outputs = model(data)
                outputs = torch.tanh(outputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
    
    @torch.no_grad()
    def predict(self, data):
        with torch.no_grad():
            encoding = self.tokenizer.batch_encode_plus(data, return_tensors='pt', padding=True,
                                                           truncation=True, add_special_tokens = True).to(device)
            input_ids = encoding['input_ids']
            attention_mask = encoding['attention_mask']

            model_output = self.longformer(
                   input_ids, 
                   attention_mask=attention_mask)

            # sequence_output has the following shape: (batch_size, sequence_length, 768)
            sequence_output=  mean_pooling(model_output, attention_mask)
            print(sequence_output.shape)
            linear1_output = self.linear1(sequence_output.view(-1,768)) ## extract the 1st token's embeddings
            linear2_output = self.linear2(linear1_output)
            outputs = torch.tanh(linear2_output)
        return outputs.numpy()

In [8]:
def loss(X, Y, Z, mask, lam):
    return np.sum( ((Y.T @ Z - X) ** 2)[mask] ) \
       + lam * np.linalg.norm(Y, ord="fro") \
       + lam * np.linalg.norm(Z, ord="fro")

def approximate(data_dense, rank, lam,max_iteration=1000,Y_start=None,Z_start=None):
    # initialize low-rank approximation matrix as a product of two 
    # Y @ Z = X_bar
    data_mask = np.ones(data_dense.shape, dtype=np.bool)
    if Y_start is not None and Z_start is not None:
        Y = Y_start
        Z = Z_start
    else:
        Y = np.random.randn(rank, data_dense.shape[0])
        Z = np.random.randn(rank, data_dense.shape[1])

    # calculation speedup
    lam_I = lam * np.eye(rank)

    # alternating least squares until convergence
    prev_obj = loss(data_dense, Y, Z, data_mask, lam)
    converged = False

    print("Start objective:", prev_obj)

    prev_Y = Y.copy()
    prev_Z = Z.copy()
    for iteration in range(1, max_iteration + 1):
        if iteration % 10 == 0:
            print("Iteration:", iteration)
            print("Current objective:", prev_obj)

        # optimize Y based on current value of Z
        for col in range(Y.shape[1]):
            has_rating = data_mask[col]
            ratings = data_dense[col, has_rating]
            Z_relevant_columns = Z[:, has_rating]
            regularized_cov = np.sum([c.reshape([-1, 1]) @ c.reshape([1, -1]) for
                                    c in Z_relevant_columns.T], axis=0) + lam_I
            weighted_sum = np.sum(Z_relevant_columns * ratings, axis=1)
            Y[:, col] = np.linalg.inv(regularized_cov) @ weighted_sum

        # optimize Z based on current values of Y
        for col in range(Z.shape[1]):
            has_rating = data_mask[:, col]
            ratings = data_dense[has_rating, col]
            Y_relevant_columns = Y[:, has_rating]
            regularized_cov = np.sum([c.reshape([-1, 1]) @ c.reshape([1, -1]) for
                                    c in Y_relevant_columns.T], axis=0) + lam_I
            weighted_sum = np.sum(Y_relevant_columns * ratings, axis=1)
            Z[:, col] = np.linalg.inv(regularized_cov) @ weighted_sum

        obj = loss(data_dense, Y, Z, data_mask, lam)

        # convergence criteria. prevents division by 0.
        if abs(obj - prev_obj) / ( abs(prev_obj) + 1e-8 ) < TOLERANCE:
            converged = True
            break

        prev_obj = obj

    print("Converged:", converged)
    print("# iterations:", iteration)
    print("Final objective:", obj)

    # report final low-rank matrix
    return Y.T, Z.T

In [9]:
# Hyperparameters
latent_dim = 10
MAX_ITERATION = 1000
TOLERANCE = 1e-4


# Get 
rat,user_index_map,paper_index_map = generate_interaction_matrix()


U,P = approximate(rat, latent_dim, 1)

abstracts,new_map = get_abstracts_by_ids(paper_index_map)
dataset = ClickedDataset(abstracts, P,new_map)
data_loader = DataLoader(dataset, batch_size=10,shuffle=True)

model = paperBERT(latent_dim) # You can pass the parameters if required to have more flexible model
model.to(device) ## can be gpu

Start objective: 6972.09637619775
Converged: True
# iterations: 7
Final objective: 3.0


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


paperBERT(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0): LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_feat

In [None]:
model.train(data_loader,5)

torch.Size([10, 768])
torch.Size([10, 768])
