### Import packages

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import time
import os
from ast import literal_eval

import torch
from transformers import BertForMaskedLM, BertTokenizer

### Load Dataset

In [None]:
## Location of the already preprocessed (clean) review data
DATASET_PATH = '/content/drive/Shareddrives/AmorePacific2021/5.newcode/dataset/'
DATA_NAME = 'amore_data_above8_rating.csv'

## The 'review_split' column is parsed with literal_eval while reading,
## so each cell is turned from its text form into a Python object.
csv_path = DATASET_PATH + DATA_NAME
df = pd.read_csv(csv_path, converters=dict(review_split=literal_eval))

## Reviews split into sentences (one entry per review)
reviews = df['review_split'].tolist()

### BERT embedding

In [None]:
## BERT embedding module
'''
Reference : https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#32-understanding-the-output
'''
class Embed_custom:
    """Sentence embedder built on a BERT masked-LM checkpoint.

    A sentence is wrapped in ``[CLS]`` / ``[SEP]``, tokenized, converted to
    vocabulary ids, truncated to at most ``max_len`` ids and passed through
    the model. The sentence embedding is the mean over the token vectors
    taken from the second-to-last entry of the model's hidden states.

    Attributes are set in ``__init__``:
        ver: name or path of the pretrained checkpoint.
        max_len: maximum number of token ids fed to the model per sentence.
        tokenizer: ``BertTokenizer`` loaded from ``ver``.
        model: ``BertForMaskedLM`` loaded from ``ver`` with hidden states enabled.
    """

    def __init__(self, pretrain_ver='Kyoungmin/beauty-base-KLCP2', max_len=300):
        """Load the tokenizer and model for ``pretrain_ver``.

        Args:
            pretrain_ver: checkpoint to load. The default is the pretrained
                model uploaded to huggingface after domain adaptation.
            max_len: maximum number of token ids per sentence, counting the
                first and last (special) tokens. Defaults to 300.

        Raises:
            ValueError: if ``max_len`` is smaller than 2, because the first
                and last tokens are always kept.
        """
        if max_len < 2:
            raise ValueError('max_len must be at least 2')
        self.ver = pretrain_ver
        self.max_len = max_len
        self.tokenizer = BertTokenizer.from_pretrained(self.ver)
        self.model = BertForMaskedLM.from_pretrained(self.ver, output_hidden_states = True)
        ## The model is only used for inference here, so make evaluation mode explicit.
        self.model.eval()

    ## Tokenization
    def tokenization_custom(self, sent):
        """Return the vocabulary ids for ``sent``, truncated to ``max_len``.

        Args:
            sent: the sentence text.

        Returns:
            A list of token ids of length at most ``self.max_len``. When the
            sentence is too long, the first and last ids (the ``[CLS]`` and
            ``[SEP]`` markers added below) are kept and the surplus is
            removed evenly from both ends of the tokens in between.
        """
        marked_text = '[CLS]' + sent + '[SEP]'
        tokenized_text = self.tokenizer.tokenize(marked_text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        ## The cut point was chosen because almost no review (sentence) in the
        ## data exceeds 300 characters; note that the check below counts tokens.
        overflow = len(indexed_tokens) - self.max_len
        if overflow > 0:
            ## Trim only the body so long sentences keep the same leading and
            ## trailing special tokens as short ones.
            body = indexed_tokens[1:-1]
            drop_tail = overflow // 2
            drop_head = overflow - drop_tail
            kept_body = body[drop_head:len(body) - drop_tail]
            indexed_tokens = [indexed_tokens[0]] + kept_body + [indexed_tokens[-1]]

        return indexed_tokens

    ## Embedding
    def _transformer_custom(self, sent):
        """Embed one sentence as the mean of its token vectors.

        Args:
            sent: the sentence text.

        Returns:
            A 1-D tensor: the mean over tokens (dim 0) of the vectors from the
            second-to-last entry of the hidden states.
        """
        indexed_tokens = self.tokenization_custom(sent)
        ## Wrap in a list to add the batch dimension (batch of one sentence).
        tokens_tensor = torch.tensor([indexed_tokens])
        with torch.no_grad():
            outputs = self.model(tokens_tensor)

        ## NOTE(review): index 1 is expected to be the hidden states because the
        ## model is loaded with output_hidden_states=True and no labels are
        ## passed - confirm against the installed transformers version.
        hidden_states = outputs[1]
        ## Second-to-last layer, first (only) item of the batch, as in the
        ## referenced BERT word-embeddings tutorial.
        token_vecs = hidden_states[-2][0]
        sent_embed = torch.mean(token_vecs, dim=0)
        return sent_embed

In [None]:
## Run Embedding
module = Embed_custom()

## rev_embed keeps the nesting of `reviews`: one inner list per review,
## holding one embedding per sentence of that review.
t0 = time.time()
rev_embed = []
for rev in tqdm(reviews):
    sent_embed = [module._transformer_custom(sent) for sent in rev]
    rev_embed.append(sent_embed)

elapsed = time.time()-t0
print('Elapsed time (sec.) :', elapsed)