In [None]:
# CREATING WORD EMBEDDINGS FOR ONTOLOGY USE
# --------------------------------------------------
#
# Adapted by Jonathan IJbema
#
#
# Transform reviews into T5 word embeddings and save them in a JSON-style text file.
# 1. Install and import libraries and load the tokenizer
# 2. Load dataset in special format. Reviews must be separated by ,|,
# 3. Remove sentences containing negation words from the dataset
# 4. Create word embeddings
# 5. Save word embeddings to a JSON-style text file

In [None]:
# Install and import libraries
#!pip install transformers

import torch
from transformers import T5Tokenizer, T5Model
import pandas as pd
import numpy as np
import time
import random
import pprint
import sys
import json
from string import digits

# Load pre-trained model tokenizer (vocabulary)
# The 't5-base' tokenizer is bound to the notebook-level name `tokenizer`,
# which the embedding cell uses to encode each review and to decode token ids.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
# Visible confirmation that the imports and the tokenizer load completed.
print("Model choice: T5. Important libraries and models imported.")

Model choice: T5. Important libraries and models imported.


In [None]:
# Load data for creating word embeddings
#
# The input file is read as one text blob in which individual reviews are
# separated by the literal marker ",|,". The result is the notebook-level
# list `reviews`, limited to the first N_REVIEWS entries.
REVIEWS_PATH = 'DCWEB/data/restData5k.txt'
REVIEW_SEPARATOR = ",|,"
N_REVIEWS = 1900  # size of the subset taken from the start of the file

# The context manager guarantees the file handle is closed, also on errors.
with open(REVIEWS_PATH, 'r', encoding="utf-8") as review_file:
    raw_text = review_file.read()

# Replace line breaks by spaces so a review never spans several lines.
raw_text = raw_text.replace("\n", ' ')

#Reviews are split on ,|,
reviews = raw_text.split(REVIEW_SEPARATOR)[:N_REVIEWS]  # Choose subset from reviews
#reviews = random.sample(reviews, 1600) # Choose random subset from reviews
print("Number of reviews : ", len(reviews))

Number of reviews :  1900


In [None]:
# Code for removing negation words in same sentence
#
# Every review is split into sentences on '.' and then on '!'. Each resulting
# fragment that contains one of the negation markers is dropped, and the
# remaining fragments are joined back with the same delimiters. The filtered
# reviews are stored in `reviewsNew` (same order and length as `reviews`).
NEGATION_MARKERS = (
    " not ",
    " nothing ",
    # NOTE(review): no leading space, so this also matches inside words such
    # as "whenever " - confirm whether that is intended.
    "never ",
    " didn't",
    " wouldn't",
    " don't",
    " can't",
    " doesn't",
    " couldn't",
    # Misspelled variant kept so text containing this typo is still filtered.
    " coudn't",
)


def _has_negation(fragment):
    """Return True when `fragment` contains at least one negation marker."""
    return any(marker in fragment for marker in NEGATION_MARKERS)


removed_sentences = 0       # fragments dropped because of a negation marker
reviews_with_negation = 0   # reviews that lost at least one fragment
total_sentences = 0         # all fragments seen, before filtering
reviewsNew = []

for z in reviews:
    kept_parts = []
    removed_in_review = 0
    for part in z.split('.'):
        fragments = part.split('!')
        total_sentences += len(fragments)
        # Build a new list instead of removing from the list being iterated:
        # removing in place shifts the remaining items and makes the loop
        # skip the fragment that follows each removed one.
        kept = [fragment for fragment in fragments if not _has_negation(fragment)]
        removed_in_review += len(fragments) - len(kept)
        kept_parts.append("!".join(kept))
    removed_sentences += removed_in_review
    if removed_in_review:
        reviews_with_negation += 1
    reviewsNew.append(".".join(kept_parts))

print("Number of sentences with negation word:", removed_sentences)
print("Number of reviews with these negation word:" ,reviews_with_negation)
print("Number of sentences in text: ", total_sentences)
print("Number of sentences in text after removing sentences: ", total_sentences - removed_sentences )

Number of sentences with negation word: 846
Number of reviews with these negation word: 655
Number of sentences in text:  11590
Number of sentences in text after removing sentences:  10744


In [None]:
# Load pre-trained model (weights)
# The fine-tuned checkpoint is used here; pass 't5-base' as the first argument
# instead to load the stock pre-trained weights.
model = T5Model.from_pretrained(
    'DCWEB/T5finetuned',         # finetuned
    output_hidden_states=True,   # the embedding cell reads outputs.hidden_states
)
print("T5 model is downloaded.")

# Put the model in "evaluation" mode, meaning feed-forward operation.
# Kept as the last expression so the notebook displays the model structure.
model.eval()


Some weights of the model checkpoint at DCWEB/T5finetuned were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


T5 model is downloaded.


T5Model(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dropout(p=0.1, inplac

In [None]:
#Create word embeddings
#
# For every review the text is tokenized and run through the T5 encoder. The
# embedding of a token is the sum of its last four hidden-state layers,
# rounded to 4 decimals. Only tokens whose decoded text is alphanumeric are
# kept. The result is `bert_vectors`: one list per review, each entry being
# [token_text, vector, review_index].
MAX_TOKENS = 512   # reviews with more tokens are truncated before encoding
update = 20        # print a progress / ETA line every `update` reviews

# NOTE(review): this loop embeds `reviews`, not the negation-filtered
# `reviewsNew` produced by the negation-removal cell - confirm which of the
# two is meant to be embedded.
start = time.time()
intermediate_time = start
bert_vectors = []
review_counter = 0
#loops over the reviews
for rev in reviews:
    rev = rev.lower().replace('\n', '').replace('\r', '').strip()

    if review_counter % update == 0:
        # Time spent on the last `update` reviews feeds a running average
        # that is extrapolated over the reviews still to be embedded.
        start_time = intermediate_time
        intermediate_time = time.time()
        embedding_time = intermediate_time - start_time
        batches_done = review_counter / update
        if batches_done < 2:
            # The very first measurement covers no work, so the average is
            # (re)started from the first real batch.
            average = embedding_time
        else:
            average = (average * (batches_done - 1) + embedding_time) / batches_done
        ETA = average * (len(reviews) / update - batches_done)
        print("Estimated remaining time: ", str(round(ETA/60, 1)), " minutes.")
        print("Embedding review: " + str(review_counter))

    # Truncate in the tokenizer call itself so the length limit applies to
    # the ids that are actually passed to the encoder.
    t5tok = tokenizer(rev, return_tensors="pt", truncation=True, max_length=MAX_TOKENS)

    with torch.no_grad():
        outputs = model.encoder(input_ids=t5tok["input_ids"], attention_mask=t5tok["attention_mask"], return_dict=True)
        hidden_states = outputs.hidden_states

    # Sum the last four layers for all tokens at once; [0] selects the single
    # review in the batch, leaving one row per token.
    token_vecs_sum = torch.stack(hidden_states[-4:], dim=0).sum(dim=0)[0]

    #get all results for one review
    result = []
    tok_text = t5tok["input_ids"].tolist()[0]
    # The final id is skipped: it is the end-of-sequence marker appended by
    # the tokenizer, not a word of the review.
    for token_id, token_vec in zip(tok_text[:-1], token_vecs_sum):
        text = tokenizer.decode(token_id).strip()
        if not text.isalnum():
            continue
        veccie = [round(vec, 4) for vec in token_vec.tolist()]
        result.append([text, veccie, review_counter])

    bert_vectors.append(result)
    review_counter += 1

end = time.time()
print("Time: ", end-start, "s")
print("Reviews are tokenized and put into vectors!")

Estimated remaining time:  0.0  minutes.
Embedding review: 0
Estimated remaining time:  5.0  minutes.
Embedding review: 20
Estimated remaining time:  4.5  minutes.
Embedding review: 40
Estimated remaining time:  4.8  minutes.
Embedding review: 60
Estimated remaining time:  4.7  minutes.
Embedding review: 80
Estimated remaining time:  4.6  minutes.
Embedding review: 100
Estimated remaining time:  4.6  minutes.
Embedding review: 120
Estimated remaining time:  4.5  minutes.
Embedding review: 140
Estimated remaining time:  4.5  minutes.
Embedding review: 160
Estimated remaining time:  4.6  minutes.
Embedding review: 180
Estimated remaining time:  4.6  minutes.
Embedding review: 200
Estimated remaining time:  4.5  minutes.
Embedding review: 220
Estimated remaining time:  4.6  minutes.
Embedding review: 240
Estimated remaining time:  4.4  minutes.
Embedding review: 260
Estimated remaining time:  4.4  minutes.
Embedding review: 280
Estimated remaining time:  4.3  minutes.
Embedding review: 30

In [None]:
#Create good format for ontology
#Change list1 to the location of bert vectors
#
# Flattens the per-review token lists into one dictionary `words`, keyed by a
# running 1-based word id. Each value holds the token text, its vector and
# the index of the review it came from.
list1 = bert_vectors
words = {}
for rev in list1:
    for entry in rev:
        # Ids continue across reviews: the next id is always one past the
        # number of words stored so far.
        words[len(words) + 1] = {
            'word': str(entry[0]),
            'vector': entry[1],
            'sentence id': entry[2],
        }

counterRev = len(list1)
# Report the number of stored words itself rather than the next free id,
# which is one higher.
print('Number of reviews done: ', counterRev, ', with ', len(words), ' words in total.')

Number of reviews done:  1900 , with  106957  words in total.


In [None]:
#Save file
# Serialises the `words` dictionary as JSON into `name_file` and reports how
# long the write took. `json` and `time` come from the imports cell.
name_file = "T5FT.txt"

start = time.time()
with open(name_file, mode='w') as handle:
    json.dump(words, handle)
end = time.time()

print("Time: ", end-start, "s")
print("Vectors are saved in " + name_file)

Time:  167.4676809310913 s
Vectors are saved in T5FT.txt


In [None]:
# Remove everything from memory
# Drop the three notebook-level references to the embedding data, then run a
# garbage-collection pass; its return value is displayed as the cell output.
import gc

del words, bert_vectors, list1
gc.collect()

32