In [None]:
from glob import glob
import pickle
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Setup: tokenizer, stereotype word lists, input files, and the embeddings
# accumulated by previous runs of this notebook.

# Load pre-trained model tokenizer (vocabulary).
# NOTE(review): `padding` / `truncation` are normally arguments of the tokenizer
# *call* (`tokenizer(text, ...)`); the loop below uses `tokenizer.tokenize()`,
# which presumably neither pads nor truncates. They are kept here unchanged --
# confirm whether they are needed at all.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True,
                                          padding='max_length',
                                          truncation=True)

# Get the stereotypes we are interested in: a mapping whose keys are used as
# category labels and whose values are the collections of words to look for.
with open("stereotypes.json") as stereotypes_file:
    stereotypes = json.load(stereotypes_file)

# List all files in the BERT folder. `glob` returns paths in arbitrary order;
# sort them so the processing order is deterministic from run to run.
results = sorted(glob('../raw/BERT/*.csv'))

# Load the embeddings computed so far, or start from an empty dict on the very
# first run (when the pickle has not been written yet).
# SECURITY: pickle.load can execute arbitrary code -- only load a pickle file
# that this notebook itself produced.
try:
    with open('bert_embeddings.pickle', 'rb') as filename:
        embeddings_full = pickle.load(filename)
except FileNotFoundError:
    embeddings_full = {}
    
# embeddings_full.keys()


In [None]:
# Extract contextual BERT vectors for every group word / stereotype word found
# in each daily CSV and store them in `embeddings_full[day]` as four parallel
# lists: 'database', 'category', 'word', 'vectors'.

# Tokens that identify a group. The tokenizer above is created with
# do_lower_case=True, so presumably only the lowercase forms can ever match.
GROUP_WORDS = {'Asians', 'asians', 'Asian', 'asian', 'Chinese', 'chinese',
               'White', 'white', 'Black', 'black', 'Hispanic', 'hispanic',
               'Latino', 'latino', 'Latina', 'latina', 'Latinx', 'latinx',
               'Latine', 'latine'}


def _day_of(path):
    """Return the 10 characters that precede the 4-character extension of `path`."""
    return path[-14:-4]


# Load the pre-trained model (weights) ONCE. Loading it inside the sentence
# loop repeats an expensive, loop-invariant operation for every sentence.
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True)  # return all hidden states

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

# Longest sequence the model's position embeddings can handle.
max_tokens = model.config.max_position_embeddings

# Resume logic: process every file whose day is not stored yet, and redo the
# most recently stored day because an interrupted run may have left it partial.
# Unlike slicing `results` by the number of stored days, this also works when
# nothing is stored yet (the slice would start at -1, i.e. only the last file)
# and does not depend on the order of `results`.
stored_days = list(embeddings_full)
redo_day = stored_days[-1] if stored_days else None
pending = [path for path in results
           if _day_of(path) not in embeddings_full or _day_of(path) == redo_day]

for r in pending:
    day = _day_of(r)
    df = pd.read_csv(r, lineterminator='\n')
    df = df[df.lemma_length > 0]
    corpus = df.bert_lemma.tolist()

    # (Re)start this day's lists from scratch.
    day_store = embeddings_full.setdefault(day, {})
    for column in ('database', 'category', 'word', 'vectors'):
        day_store[column] = []

    for sentence in corpus:
        # Tokenize the sentence with the BERT tokenizer; drop anything beyond
        # the model's maximum length, which would otherwise raise inside BERT.
        tokenized_text = tokenizer.tokenize(sentence)[:max_tokens]

        # Decide which tokens to keep BEFORE running the model, as
        # (token index, category) pairs. A group word is recorded exactly once
        # as 'Group' (not once per stereotype category); any other token is
        # recorded once for every stereotype category that lists it.
        hits = []
        for i, token in enumerate(tokenized_text):
            if token in GROUP_WORDS:
                hits.append((i, 'Group'))
                continue
            for key, value in stereotypes.items():
                if token in value:
                    hits.append((i, key))

        # Nothing to store (this includes an empty tokenization): skip the
        # forward pass entirely.
        if not hits:
            continue

        # Map the token strings to their vocabulary indices.
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

        # Convert inputs to PyTorch tensors (a batch containing one sentence).
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.ones_like(tokens_tensor)

        # Run the text through BERT and collect the hidden states of all layers.
        with torch.no_grad():
            # NOTE(review): the second positional argument of
            # `BertModel.forward` appears to be `attention_mask`, not
            # `token_type_ids`; the call is kept positional so the resulting
            # vectors are unchanged -- confirm against the installed version.
            outputs = model(tokens_tensor, segments_tensors)

            # Because `output_hidden_states=True`, the third item holds the
            # hidden states from all layers.
            # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
            hidden_states = outputs[2]

            # For each token, concatenate the vectors of the last four layers
            # (last layer first). `[0]` drops the batch dimension, so the
            # result has one row per token.
            token_vecs_cat = torch.cat([hidden_states[-1][0],
                                        hidden_states[-2][0],
                                        hidden_states[-3][0],
                                        hidden_states[-4][0]], dim=1)

        for i, category in hits:
            day_store['database'].append('bert')
            day_store['category'].append(category)
            day_store['word'].append(tokenized_text[i])
            # clone() so the stored vector owns its memory instead of being a
            # view into the whole per-sentence matrix.
            day_store['vectors'].append(token_vecs_cat[i].clone())
        


In [None]:
# Persist everything computed so far; the setup cell reads this file back.
with open('bert_embeddings.pickle', 'wb') as pickle_out:
    pickle.dump(embeddings_full, pickle_out)

In [None]:
# Number of days currently stored in the embeddings dict.
len(embeddings_full)

In [None]:
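# Alternative kept for reference (not executed): represent each token by the
# SUM of its last four hidden layers instead of their concatenation.
#
# token_vecs_sum = []
#
# for token in token_embeddings:
#
#     # Sum the vectors from the last four layers
#     sum_vec = torch.sum(token[-4:], dim=0)
#
#     # Use `sum_vec` to represent `token`
#     token_vecs_sum.append(sum_vec)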
