In [None]:
# CREATING WORD EMBEDDINGS FOR ONTOLOGY USE
# --------------------------------------------------
#
# Adapted by Jonathan IJbema
#
#
# Transform reviews into word embeddings and save them in a json-style text file.
# 1. Choose between models: (1) BERT, (2) RoBERTa
# 2. Load dataset in special format. Reviews must be separated by ,|,
# 3. Remove negations from dataset
# 4. Create word embeddings
# 5. Save word embeddings to json-style text file

In [None]:
#@title Model choice
#@markdown Choose which model to use for tokenizing and embedding.

# Model family for this run. The import cell below reads this value to derive
# the numeric code `mc`, which in turn selects the tokenizer and the model.
# The trailing `@param` annotation lists the values offered by the form.
modelChoice = 'RoBERTa' # @param ["BERT", "RoBERTa"]


In [None]:
# Install and import libraries
#!pip install transformers

import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, RobertaTokenizer, RobertaModel
import pandas as pd
import numpy as np
import time
import random
import pprint
import sys
import json


# Translate the form value into the numeric code used by the rest of the
# notebook (1 = BERT, 2 = RoBERTa). An unknown value is rejected here instead
# of silently falling back to RoBERTa, so a typo in `modelChoice` is noticed
# before any model is downloaded.
if modelChoice == 'BERT':
  mc = 1
elif modelChoice == 'RoBERTa':
  mc = 2
else:
  raise ValueError(
      "No tokenizer chosen. Choose between 'BERT' and 'RoBERTa', got: " + repr(modelChoice))

# Load pre-trained model tokenizer (vocabulary)
if mc == 1:
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  print("Model choice: BERT. Important libraries and models imported.")
else:
  # mc can only be 2 here; every other value was rejected above.
  tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
  print("Model choice: RoBERTa. Important libraries and models imported.")

Model choice: RoBERTa. Important libraries and models imported.


In [None]:
# Load data for creating word embeddings

#@title File path
#@markdown Choose file path for dataset.

file_path = 'data/restData5k.txt' #@param {type:"string"}

# Upper bound on the number of reviews that are embedded.
max_reviews = 2000

# Read the whole dataset at once. The context manager closes the file handle
# again, also when reading raises.
with open(file_path, 'r', encoding="utf-8") as file:
    review = file.read()
# Line breaks are not review separators in this format; flatten them to spaces.
review = review.replace("\n", ' ')

#Reviews are split on ,|,
wrong_review = review.split(",|,")
i = len(wrong_review)  # total number of reviews found in the file
reviews = wrong_review[0:max_reviews] # Choose subset from reviews
#reviews = random.sample(wrong_review, max_reviews) # Choose random subset from reviews
print("Number of reviews : ", len(reviews))

Number of reviews :  2000


In [None]:
# Code for removing negation words in same sentence
#
# Each review is split into sentences on '.' and then on '!'. A sentence that
# contains one of the negation markers below is dropped; the remaining
# sentences are joined back with the same separators and collected in
# `reviewsNew` (one entry per entry of `reviews`, in the same order).
#
# Matching is a plain, case-sensitive substring test. Note that "never " has
# no leading space, so it also matches inside longer words ending in "never".
negation_markers = (
    " not ",
    " nothing ",
    "never ",
    " didn't",
    " wouldn't",
    " don't",
    " can't",
    " doesn't",
    " couldn't",
)

removed_sentences = 0        # sentences dropped because of a negation marker
reviews_with_negation = 0    # reviews that lost at least one sentence
total_sentences = 0          # every sentence inspected
reviewsNew = []

for z in reviews:
    kept_chunks = []
    review_has_negation = False
    for dot_chunk in z.split('.'):
        # Build a new list of kept sentences instead of removing from the list
        # being iterated: removing during iteration skips the next sentence.
        kept_sentences = []
        for sentence in dot_chunk.split('!'):
            total_sentences += 1
            if any(marker in sentence for marker in negation_markers):
                removed_sentences += 1
                review_has_negation = True
            else:
                kept_sentences.append(sentence)
        kept_chunks.append("!".join(kept_sentences))
    if review_has_negation:
        reviews_with_negation += 1
    reviewsNew.append(".".join(kept_chunks))

print("Number of sentences with negation word:", removed_sentences)
print("Number of reviews with these negation word:" ,reviews_with_negation)
print("Number of sentences in text: ", total_sentences)
print("Number of sentences in text after removing sentences: ", total_sentences - removed_sentences )

Number of sentences with negation word: 885
Number of reviews with these negation word: 687
Number of sentences in text:  12194
Number of sentences in text after removing sentences:  11309


In [None]:
# Load pre-trained model (weights)

#@title Model configuration
#@markdown Choose model configuration.
finetuned = True #@param {type:"boolean"}
posttrained = False #@param {type:"boolean"}

# Pick the checkpoint first, then load it with a single call per model family.
# output_hidden_states=True makes the model return the hidden states of all
# layers, which the embedding cell reads via `outputs.hidden_states`.
if mc == 1:
  if posttrained:
    checkpoint = 'models/finetunedPT' if finetuned else 'models/posttrained'
  else:
    checkpoint = 'models/finetuned' if finetuned else 'bert-base-uncased'
  model = BertModel.from_pretrained(checkpoint, output_hidden_states=True)
  print("BERT model is downloaded.")
elif mc == 2:
  # `posttrained` is not consulted for RoBERTa; only `finetuned` matters here.
  checkpoint = 'models/Roberta_finetuned' if finetuned else 'roberta-base'
  model = RobertaModel.from_pretrained(checkpoint, output_hidden_states=True)
  print("RoBERTa model is downloaded.")

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()


Some weights of the model checkpoint at DCWEB/Roberta_finetuned were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at DCWEB/Roberta_finetuned and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa model is downloaded.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [None]:
#Create word embeddings
#
# For every review: tokenize, run the model once, and store one entry per kept
# token in the form [word, token id, segment id, vector, review index].
# The vector is the sum of the last four hidden layers, rounded to 4 decimals.
# `bert_vectors` ends up as a list with one such entry list per embedded review.
start = time.time()
intermediate_time = start
bert_vectors = []
review_counter = 0
update = 20  # print a progress/ETA line every `update` reviews

# Loop over the negation-filtered reviews. Iterating over `reviews` here would
# discard the sentence removal done in the negation cell.
for rev in reviewsNew:
    rev = rev.lower().replace('\n', '').replace('\r', '').strip()
    if review_counter % update == 0:
        start_time = intermediate_time
        intermediate_time = time.time()
        embedding_time = (intermediate_time - start_time)
        batches_done = review_counter / update
        # Running mean of the time per batch of `update` reviews. The first
        # measured batch simply seeds the average.
        if batches_done < 2:
            average = embedding_time
        else:
            average = (average*(batches_done - 1) + embedding_time) / batches_done
        ETA = average*(len(reviewsNew)/update - batches_done)
        print("Estimated remaining time: ", str(round(ETA/60, 1)), " minutes.")
        print("Embedding review: " + str(review_counter))

    result = []
    # Keep at most 512 tokens per review.
    tokenized_text = tokenizer.tokenize(rev)[:512]
    if len(tokenized_text) == 0 :
        # Nothing to embed; note that review_counter is not advanced here.
        print("Word with 0 length.")
        continue
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    with torch.no_grad():
        # The all-ones tensor was previously passed as the second positional
        # argument, i.e. in the attention-mask slot; pass it by keyword so the
        # call says what it does.
        outputs = model(tokens_tensor, attention_mask=segments_tensors)
        hidden_states = outputs.hidden_states

    # Stack the per-layer outputs, drop the batch axis and put the token axis
    # first, so that iterating yields one (layers, hidden) tensor per token.
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)
    # Per token: sum of the last four layers.
    token_vecs_sum = [torch.sum(token[-4:], dim=0) for token in token_embeddings]

    # Vector of an 'amb' piece that directly precedes the current token; empty
    # otherwise. It is cleared after one token so a stale vector can never be
    # merged into an unrelated, later 'ience' piece.
    veccieTemp = []
    for i in range(len(tokenized_text)):
        veccie = [round(vec,4) for vec in token_vecs_sum[i].tolist()]
        if mc == 1:
            # remove all non alphabetic characters
            if tokenized_text[i].isalpha():
                result.append([tokenized_text[i],indexed_tokens[i],segments_ids[i], veccie, review_counter])
            continue

        text = tokenizer.decode(indexed_tokens[i]).strip()
        if text == 'amb':
            # ambience is split up by RoBERTa. Remember the first half so it
            # can be merged with the following 'ience' piece.
            veccieTemp = veccie
            continue

        # Consume the remembered vector: it only applies to this very token.
        amb_vec, veccieTemp = veccieTemp, []
        if text == 'ience':
            if len(amb_vec) > 0:
                # Average both halves into a single vector for 'ambience'.
                merged = np.mean(np.vstack((np.array(veccie), np.array(amb_vec))), axis=0)
                result.append(['ambience',indexed_tokens[i],segments_ids[i], [round(num, 4) for num in merged.tolist()], review_counter])
            # A lone 'ience' piece is dropped.
            continue
        if text.isalpha():  # remove all non alphabetic characters
            result.append([text,indexed_tokens[i],segments_ids[i], veccie, review_counter])

    bert_vectors.append(result)
    review_counter+=1

end = time.time()
print("Time: ", end-start, "s")
print("Reviews are tokenized and put into vectors!")

Estimated remaining time:  0.0  minutes.
Embedding review: 0
Estimated remaining time:  4.8  minutes.
Embedding review: 20
Estimated remaining time:  4.3  minutes.
Embedding review: 40
Estimated remaining time:  4.5  minutes.
Embedding review: 60
Estimated remaining time:  4.3  minutes.
Embedding review: 80
Estimated remaining time:  4.3  minutes.
Embedding review: 100
Estimated remaining time:  4.3  minutes.
Embedding review: 120
Estimated remaining time:  4.2  minutes.
Embedding review: 140
Estimated remaining time:  4.2  minutes.
Embedding review: 160
Estimated remaining time:  4.2  minutes.
Embedding review: 180
Estimated remaining time:  4.2  minutes.
Embedding review: 200
Estimated remaining time:  4.2  minutes.
Embedding review: 220
Estimated remaining time:  4.2  minutes.
Embedding review: 240
Estimated remaining time:  4.1  minutes.
Embedding review: 260
Estimated remaining time:  4.1  minutes.
Embedding review: 280
Estimated remaining time:  4.0  minutes.
Embedding review: 30

In [None]:
#Create good format for ontology
#Change list1 to the location of bert vectors
#
# Flattens the per-review token lists into one dictionary keyed by a running
# integer that starts at 1. Each value holds the token text, its vector and
# the index of the review it came from.
list1 = bert_vectors
words = {}
j = 1  # key for the next word
counterRev = 0
for rev in list1:
    counterRev += 1
    for word in rev:
        words[j] = {'word': str(word[0]),
                    'vector': word[3],
                    'sentence id': word[4]}
        j = j+1

# Report len(words): j is the next free key, i.e. one more than the number of
# words actually stored.
print('Number of reviews done: ', counterRev, ', with ', len(words), ' words in total.')

Number of reviews done:  2000 , with  107045  words in total.


In [None]:
#Save file

#@title Output file
#@markdown Choose output file name.

file_name = "RobertaFT.txt" #@param {type:"string"}

import json

# Serialise the complete `words` dictionary as one JSON document and time how
# long the write takes.
start = time.time()
with open(file_name, mode='w') as outfile:
    json.dump(obj=words, fp=outfile)
end = time.time()

elapsed_seconds = end-start
print("Time: ", elapsed_seconds, "s")
print("Vectors are saved in " + file_name)

Time:  170.22080278396606 s
Vectors are saved in RobertaFT.txt


In [None]:
# Remove everything from memory
import gc

# Drop the notebook's references to the large embedding containers in one
# statement (same left-to-right order), then trigger a collection. The call
# stays the last expression so its return value is still shown as cell output.
del words, bert_vectors, list1
gc.collect()

32