In [None]:
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pandas as pd
import pickle


import torch

from sentence_transformers import SentenceTransformer
from summarizer import Summarizer

import warnings
# Silences every warning category for the rest of the session, including
# deprecation and numerical warnings raised by the libraries imported above.
warnings.filterwarnings(action='ignore')

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Fixed seed for torch's default random number generator.
torch.manual_seed(2891)

# The reported count never drops below 1, even when no GPU is visible.
num_gpu = max(1, torch.cuda.device_count())

print("Let's use", num_gpu, "GPUs!")
print('device', device)

In [None]:
# Summarizer from the `summarizer` package, built with its default arguments.
summary_model = Summarizer()
# Load the sentence encoder on the device selected earlier instead of a
# hard-coded 'cuda', so the notebook also runs on CPU-only machines.
emb_model = SentenceTransformer('all-MiniLM-L6-v2', device=str(device)).to(device)

In [None]:
def cosine_sim(A, B):
    """Return the cosine similarity of vectors ``A`` and ``B``.

    The value is the dot product of the two inputs divided by the product
    of their norms. There is no guard for zero-norm inputs: the division
    is performed as-is.
    """
    inner_product = dot(A, B)
    norm_product = norm(A) * norm(B)
    return inner_product / norm_product

In [None]:
def getItemReviewEmbDict_n(total_review,n,file_nm):
    """Build one review-based embedding per product and pickle the mapping.

    For every distinct ``product_id`` in ``total_review``:

    * if the product has at most ``n`` reviews, its embedding is the mean of
      the embeddings of all of its reviews;
    * otherwise the product's reviews are joined with spaces and passed to
      ``summary_model``; the ``n`` reviews whose embeddings have the highest
      cosine similarity to the embedding of that summary are averaged.

    The resulting ``{product_id: tensor}`` dict is written with ``pickle`` to
    ``file_nm + '.item_rv_dict_bertsum' + str(n) + '.pickle'``.

    Parameters
    ----------
    total_review : pandas.DataFrame
        Must provide ``'product_id'`` and ``'review'`` columns.
    n : int
        Maximum number of reviews averaged per product (expected >= 1).
    file_nm : str
        Prefix of the output pickle file name.

    Returns
    -------
    None
        The dictionary is only written to disk.

    Notes
    -----
    Relies on the notebook-level ``emb_model`` and ``summary_model`` objects.
    The summary text is built from the product's rows in ``total_review``
    itself, so the function no longer reads any other notebook-level frame.
    """
    pid_list = total_review['product_id'].unique().tolist()
    # Group once up front so each product's reviews are fetched directly,
    # instead of re-scanning the whole frame with a boolean mask per product.
    reviews_by_pid = total_review.groupby('product_id')['review']
    p_review_dict = {}
    for pid in pid_list:
        review_texts = reviews_by_pid.get_group(pid).tolist()
        # One batched encode call per product; row i of the result is the
        # embedding of review_texts[i].
        review_embs = emb_model.encode(review_texts, convert_to_tensor=True)
        if len(review_texts) > n:
            # Summarise this product's own reviews (the ones being ranked).
            summary_text = summary_model(' '.join(review_texts))
            summary_emb = emb_model.encode(summary_text, convert_to_tensor=True)
            # Similarity of every review to the summary, then keep the n best.
            sims = torch.nn.functional.cosine_similarity(
                review_embs, summary_emb.unsqueeze(0), dim=1
            )
            top_idx = torch.topk(sims, k=n).indices
            review_embs = review_embs[top_idx]
        p_review_dict[pid] = review_embs.mean(dim=0)
    file_name = file_nm+'.item_rv_dict_bertsum'+str(n)+'.pickle'
    # NOTE(review): the tensors are pickled on whatever device `encode`
    # returned them on; loading on a machine without that device may need a
    # remap -- confirm against the consumer of this file.
    with open(file_name,'wb') as fw:
        pickle.dump(p_review_dict, fw)

In [None]:
# Both review files are read as tab-separated text without a header row;
# the column names are supplied explicitly.
train_df = pd.read_csv(
    'data/patio.train_review.review',
    sep='\t',
    header=None,
    names=['user_id', 'product_id', 'review'],
)
total_df = pd.read_csv(
    'data/patio.total_review.review',
    sep='\t',
    header=None,
    names=['product_id', 'review'],
)

In [None]:
# Build the per-product embeddings (at most 80 reviews averaged per product)
# and write them to a pickle whose name starts with 'patio'.
getItemReviewEmbDict_n(total_review=total_df, n=80, file_nm='patio')