In [1]:
# Move the working directory up one level, presumably so that the project-level
# `baseline` package imported in the next cell can be resolved — TODO confirm.
# NOTE: the path is relative, so re-running this cell moves up one more level
# each time; run it exactly once per kernel session.
import os 
os.chdir('../')


In [1]:
from baseline.data_helpers import Dataloader
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
import numpy as np



INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [3]:
# Exploratory: print the class documentation of BertTokenizer (constructor
# arguments and their defaults) while deciding how to tokenize.
help(BertTokenizer)

Help on class BertTokenizer in module transformers.models.bert.tokenization_bert:

class BertTokenizer(transformers.tokenization_utils.PreTrainedTokenizer)
 |  BertTokenizer(vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, unk_token='[UNK]', sep_token='[SEP]', pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', tokenize_chinese_chars=True, strip_accents=None, **kwargs)
 |  
 |  Construct a BERT tokenizer. Based on WordPiece.
 |  
 |  This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
 |  this superclass for more information regarding those methods.
 |  
 |  Args:
 |      vocab_file (`str`):
 |          File containing the vocabulary.
 |      do_lower_case (`bool`, *optional*, defaults to `True`):
 |          Whether or not to lowercase the input when tokenizing.
 |      do_basic_tokenize (`bool`, *optional*, defaults to `True`):
 |          Whether or not to do basic tokenization before Wor

In [13]:
# Load the MELD `test_sent_emo.csv` table (one utterance per row, with emotion
# and sentiment labels) into a DataFrame.
# NOTE(review): the path is relative to the current working directory, which an
# earlier cell changes via os.chdir — confirm it resolves from a fresh kernel.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the CSV was written with its index — confirm whether it should be dropped.
df_emo = pd.read_csv('../data/MELD/test_sent_emo.csv')
# Show the first rows as a quick sanity check of the columns.
df_emo.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,Why do all you’re coffee mugs have numbers on ...,Mark,surprise,positive,0,0,3,19,"00:14:38,127","00:14:40,378"
1,2,Oh. That’s so Monica can keep track. That way ...,Rachel,anger,negative,0,1,3,19,"00:14:40,629","00:14:47,385"
2,3,Y'know what?,Rachel,neutral,neutral,0,2,3,19,"00:14:56,353","00:14:57,520"
3,19,"Come on, Lydia, you can do it.",Joey,neutral,neutral,1,0,1,23,"0:10:44,769","0:10:46,146"
4,20,Push!,Joey,joy,positive,1,1,1,23,"0:10:46,146","0:10:46,833"


In [14]:
# Take the utterance stored under row label 0 as a small sample for the
# tokenizer experiments below, and echo it.
example = df_emo.loc[0, 'Utterance']
print(example)

Why do all you’re coffee mugs have numbers on the bottom?


In [15]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Display the sub-word tokens produced for the sample utterance.
tokenizer.tokenize(example)
# Reference tutorial: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#1-loading-pre-trained-bert

['why',
 'do',
 'all',
 'you',
 '’',
 're',
 'coffee',
 'mug',
 '##s',
 'have',
 'numbers',
 'on',
 'the',
 'bottom',
 '?']

In [16]:
# Encode every (emotion label, utterance) pair into fixed-length BERT inputs.
# Approach adapted from:
# https://www.kaggle.com/code/parulpandey/eda-and-preprocessing-for-bert/notebook
text = df_emo['Utterance']
emotion = df_emo['Emotion']

input_ids = []
attention_masks = []
token_type_ids = []

# NOTE(review): the emotion label is passed as the first segment of the pair,
# so the label text becomes part of the encoded input — confirm this is intended.
for emotion_label, utterance in zip(emotion, text):
    encoded = tokenizer.encode_plus(
      emotion_label,
      utterance,
      add_special_tokens=True,
      max_length=512,
      # `pad_to_max_length=True` is deprecated; this is its documented replacement.
      padding='max_length',
      # Request truncation explicitly instead of relying on the implicit
      # fallback (True selects the same 'longest_first' strategy).
      truncation=True,
      return_token_type_ids=True,
      return_attention_mask=True,
      return_tensors='pt'
    )

    # Collect the three tensors returned for this pair; they are concatenated
    # into batch tensors in a later cell.
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])
    token_type_ids.append(encoded['token_type_ids'])



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:

# Concatenate the per-utterance tensors collected above along dimension 0,
# rebinding each list name to the resulting single tensor.
# NOTE: not idempotent — after one run the names hold tensors, not lists.
input_ids, attention_masks, token_type_ids = (
    torch.cat(chunks, dim=0)
    for chunks in (input_ids, attention_masks, token_type_ids)
)


In [18]:
# Smoke test: embed two toy sentences with the sentence-transformers wrapper
# and print the length of the first embedding vector.
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

# Load the pretrained sentence-embedding checkpoint by name.
model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
embeddings = model.encode(sentences)
# The recorded output for this cell is 768.
print(len(embeddings[0]))

768


In [19]:
from transformers import AutoTokenizer, AutoModel


#https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens 

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, counting only positions enabled by the mask.

    Args:
        model_output: Indexable model result whose element 0 holds the
            per-token embeddings (assumed (batch, tokens, hidden) — TODO confirm).
        attention_mask: Tensor with one entry per token; it is broadcast over
            the last dimension of the token embeddings and used as a weight.

    Returns:
        Tensor of mask-weighted sums over dimension 1 divided by the mask
        totals. The divisor is clamped to at least 1e-9 so an all-zero mask
        does not divide by zero.
    """
    token_vectors = model_output[0]
    # Give the mask a trailing axis and stretch it to the embeddings' size.
    weights = attention_mask.unsqueeze(-1).expand(token_vectors.size()).float()
    weighted_sum = torch.sum(token_vectors * weights, 1)
    weight_total = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / weight_total


# Sentences we want sentence embeddings for: every utterance in df_emo
sentences = list(df_emo['Utterance'])

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

# Tokenize sentences. This tokenizer has no predefined maximum length, so
# truncation=True on its own is a no-op (the library warns and skips
# truncation). Cap the length explicitly at the model's positional-embedding
# limit so over-long inputs cannot reach the model.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    max_length=model.config.max_position_embeddings,
    return_tensors='pt',
)

# Compute token embeddings (no gradients needed for inference)
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling weighted by the attention mask.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [20]:
# Persist the pooled embeddings as text: one Python-list literal per line, in
# the iteration order of sentence_embeddings.
# The context manager closes the file even if a write raises, so the handle is
# never leaked and buffered data is flushed.
with open("embeddings/bert_embeddings_test", "w") as f:
    for embedding in sentence_embeddings:
        f.write(str(embedding.tolist()))
        f.write('\n')

In [12]:
# Inspect the pooled embedding of the first utterance as a sanity check.
sentence_embeddings[0]

tensor([ 4.2722e-01,  6.2539e-01,  2.3328e+00,  2.6122e-01,  1.6353e-01,
        -2.1494e-01,  1.4670e+00,  5.9662e-01,  4.5410e-01,  1.1599e-01,
        -1.7619e+00,  1.7392e-01,  3.6471e-01,  6.6027e-01,  1.2162e+00,
         1.9265e-01, -5.7760e-01, -7.4396e-01, -3.4649e-01, -2.3700e-02,
         4.3137e-02, -4.4839e-01,  3.7725e-01, -9.7831e-01, -2.8835e-01,
        -3.5404e-01, -1.4802e-01, -2.1277e+00, -1.2721e+00,  3.2837e-01,
         2.9705e-01,  1.4452e-01,  8.0181e-01,  1.5111e-01,  3.9112e-01,
         7.0437e-01, -3.7708e-02, -1.0741e+00, -3.1062e-01, -2.8050e-02,
         1.1241e+00,  5.1460e-01,  4.8044e-01,  1.1836e-01,  1.8654e-01,
        -2.1464e-01,  1.4880e+00, -1.6614e-01,  2.5664e-01, -1.3234e+00,
        -6.0333e-01, -1.5087e-01,  1.3694e+00,  2.6422e-01, -5.0637e-01,
        -1.4389e+00,  2.3871e-01, -6.5537e-01,  3.3788e-01,  5.4328e-01,
         2.1489e-01,  2.1842e-01, -5.5088e-01,  6.0476e-01, -3.8081e-01,
         3.5775e-01,  2.5422e-01,  5.6050e-01, -6.8