In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from psycho_embeddings.fresh_embedder import ContextualizedEmbedder
import numpy as np
import pandas as pd
from glob import glob
import pickle
import gc
from tqdm import tqdm
from operator import itemgetter

comet_ml is installed but `COMET_API_KEY` is not set.


In [3]:
# Build the embedder used by every cell below from the "bert-base-cased" checkpoint, on CPU.
# NOTE(review): max_length=300 presumably caps the tokenized input length — confirm against
# ContextualizedEmbedder, whose implementation is not visible in this notebook.
model = ContextualizedEmbedder("bert-base-cased", max_length=300, device="cpu")

loading configuration file config.json from cache at /Users/attanasiog/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at /Users/attanasiog/.cache/huggingface/hub/models--bert-base-cased/snapshots/

In [4]:
def find_index_for_word(word, data):
    """
    Look up every row of `data` whose "words" entry equals `word`.

    Returns the index labels of the matching rows as a plain Python list
    (an empty list when the word never occurs).
    """
    is_match = data["words"] == word
    return data.index[is_match.to_numpy()].tolist()

def get_average_word_embeddings(word, data, embeds):
    """
    Build a single embedding for `word` by averaging all of its occurrences.

    Parameters
    ----------
    word : the word to look up in the "words" column of `data`.
    data : dataframe with a "words" column. Its index labels are used directly
        to index `embeds`, so row label i must correspond to `embeds[i]`.
    embeds : indexable collection of embeddings (one entry per row of `data`).

    Returns
    -------
    numpy array: the element-wise mean over all occurrences of `word`, or a copy
    of the single embedding when the word occurs exactly once.

    Raises
    ------
    IndexError
        If `word` does not occur in `data` (there is nothing to average).
    """
    idxs = find_index_for_word(word, data)

    # Fail with an explicit message instead of the bare "list index out of range"
    # that indexing an empty list would produce.
    if not idxs:
        raise IndexError(
            f"word {word!r} does not occur in data['words']; no embedding to average"
        )

    # idxs is a flat list of row labels; a single occurrence needs no averaging.
    if len(idxs) == 1:
        return np.array(embeds[idxs[0]])

    return np.average([embeds[i] for i in idxs], axis=0)

In [5]:
# Tiny toy dataset: one target word per row plus the sentence it appears in.
data = pd.DataFrame(
    {
        "words": ["cat", "dog", "cat"],
        "target_text": [
            "the cat is on the table",
            "the dog is on the table",
            "the cat is on the table",
        ],
    }
)

In [6]:
# Root directory under which every file produced by this notebook is written.
FOLDER_NAME = "bert_embeddings"
# Number of dataframe rows that are embedded (and pickled) together as one chunk.
SIZE_CHUNKS = 2

In [7]:
# Layer indices requested from the embedder: 0 through 12 inclusive.
# NOTE(review): presumably the embedding output plus the 12 transformer layers of
# bert-base — confirm against ContextualizedEmbedder.
layers_of_interest = list(range(13))

# Consecutive row-slices of the dataframe, each holding at most SIZE_CHUNKS rows.
list_df = [
    data[start:start + SIZE_CHUNKS]
    for start in range(0, len(data), SIZE_CHUNKS)
]

In [8]:
# Show the chunked frames to confirm how the rows were split.
list_df

[  words              target_text
 0   cat  the cat is on the table
 1   dog  the dog is on the table,
   words              target_text
 2   cat  the cat is on the table]

# Create Embeddings for the Entire Dataset

In [30]:
# Embed the dataset one chunk at a time and write each chunk's result to disk, one pickle
# per (layer, chunk), so the reconstruction cell can reassemble the layers afterwards.
# The context manager closes the progress bar even if a chunk fails.
with tqdm(total=len(list_df), position=0) as pbar:
    for index, chunk in enumerate(list_df):

        #############################
        # DUMPING EMBEDDING ON DISK #
        #############################

        # `embeddings` is intentionally left bound at notebook level: the inspection
        # cells right after this loop look at the result of the last chunk.
        embeddings = model.embed(
            chunk["target_text"].tolist(),
            chunk["words"].tolist(),
            layers_of_interest,
            batch_size=8,
            averaging=True,
            return_static=True,
            show_progress=True,
        )

        for layer in layers_of_interest:
            layer_dir = f"{FOLDER_NAME}/{layer}/temp/"
            os.makedirs(layer_dir, exist_ok=True)

            # The chunk number is the last "_"-separated token of the file name; the
            # reconstruction cell parses it back out to restore chunk order, so the
            # name must not gain an extension.
            with open(f"{layer_dir}bert_embeddings_{index}", "wb") as filino:
                pickle.dump(embeddings[layer], filino)

        # Periodically force a collection to keep memory in check on long runs.
        if index % 10 == 0:
            gc.collect()
        pbar.update(1)

100%|██████████| 2/2 [01:37<00:00, 48.66s/it]


Text tokenization:   0%|          | 0/2 [00:00<?, ?ex/s]

 50%|█████     | 1/2 [00:00<00:00,  1.82it/s]

Text tokenization:   0%|          | 0/1 [00:00<?, ?ex/s]

100%|██████████| 2/2 [00:00<00:00,  2.35it/s]

In [26]:
# Number of top-level entries in the result of the last model.embed call
# (`embeddings` is whatever the final iteration of the embedding loop left bound).
len(embeddings)

14

In [27]:
# Shape of the first vector stored under entry -1 of the last embedding result.
# NOTE(review): entry -1 is presumably the static embedding requested via
# return_static=True — confirm against ContextualizedEmbedder.
embeddings[-1][0].shape

(768,)

# Reconstruct and Save Contextualized Embeddings

In [12]:
# The (word, sentence) -> row-index lookup depends only on `data`, not on the layer,
# so it is built once and then dumped next to every layer's embeddings.
# NOTE(review): rows sharing the same (word, sentence) pair collapse onto one key and
# only the index of the last such row survives — confirm that is intended.
# NOTE: this is probably dataset specific; it is also unclear whether the map is
# needed at all, since it only stores the row index.
map_sentrepl2emb = {
    (row["words"], row["target_text"]): idx for idx, row in data.iterrows()
}

# Iterate over the same layers that were embedded, rather than a hard-coded range.
for LAYER in tqdm(layers_of_interest, desc="Layer"):
    # Load this layer's chunk pickles in chunk order (the chunk number is the last
    # "_"-separated token of the file name) and rebuild the embedding matrix for
    # the entire dataframe.
    emb_files = sorted(
        glob(f"{FOLDER_NAME}/{LAYER}/temp/*"),
        key=lambda path: int(os.path.basename(path).split("_")[-1]),
    )
    assert len(emb_files) == len(list_df)  # sanity check

    all_the_embeddings = []

    # No explicit position: tqdm places the nested bar below the outer one and
    # closes it when the iteration finishes.
    for ff in tqdm(emb_files, desc=f"Layer {LAYER} chunks", leave=False):
        # pickle.load is acceptable only because these files were written by the
        # embedding cell of this notebook; never point this at untrusted files.
        with open(ff, "rb") as filino:
            ldata = pickle.load(filino)

        for value in ldata:
            # Entries of length 1 are unwrapped to their single element; everything
            # else is stored as-is.
            if len(value) == 1:
                all_the_embeddings.append(np.array(value[0]))
            else:
                all_the_embeddings.append(np.array(value))

    all_the_embeddings = np.array(all_the_embeddings)

    with open(f'{FOLDER_NAME}/contextualized_embeddings_bert_{LAYER}_layer.npy', 'wb') as f:
        np.save(f, all_the_embeddings)

    # Release the matrix before moving on to the next layer.
    del all_the_embeddings

    ##################
    # MAP 2 Sentence #
    ##################

    with open(f"{FOLDER_NAME}/map_sentrepl2embbert_{LAYER}.pkl", "wb") as file_to_save:
        pickle.dump(map_sentrepl2emb, file_to_save)

    


100%|██████████| 2/2 [00:03<00:00,  1.55s/it][A
100%|██████████| 2/2 [00:00<00:00, 1092.27it/s]
100%|██████████| 2/2 [00:00<00:00, 3813.00it/s]
100%|██████████| 2/2 [00:00<00:00, 4306.27it/s]
100%|██████████| 2/2 [00:00<00:00, 5581.24it/s]
100%|██████████| 2/2 [00:00<00:00, 10459.61it/s]
100%|██████████| 2/2 [00:00<00:00, 6043.67it/s]
100%|██████████| 2/2 [00:00<00:00, 5645.09it/s]
100%|██████████| 2/2 [00:00<00:00, 1370.46it/s]
100%|██████████| 2/2 [00:00<00:00, 2080.51it/s]
100%|██████████| 2/2 [00:00<00:00, 3760.02it/s]
100%|██████████| 2/2 [00:00<00:00, 3629.86it/s]
100%|██████████| 2/2 [00:00<00:00, 6154.52it/s]
100%|██████████| 2/2 [00:00<00:00, 4440.77it/s]
Layer: 100%|██████████| 13/13 [00:00<00:00, 206.27it/s]


### Prototype Embeddings

In [17]:
# The vocabulary does not change between layers: compute it once.
unique_words = data["words"].unique()

# Iterate over the same layers that were embedded, rather than a hard-coded range.
for LAYER in tqdm(layers_of_interest, desc="Layer"):
    # Sanity check: the per-chunk pickles of this layer still match the chunking of
    # `data`. Only the count matters here, so the files are not sorted.
    emb_files = glob(f"{FOLDER_NAME}/{LAYER}/temp/*")
    assert len(emb_files) == len(list_df)  # sanity check

    ##############################
    # Build Prototype Embeddings #
    ##############################

    # Layer matrix written by the reconstruction cell; row i belongs to row i of `data`.
    embeds = np.load(f"{FOLDER_NAME}/contextualized_embeddings_bert_{LAYER}_layer.npy")

    # One prototype per distinct word: the average of all of its contextualized
    # embeddings in this layer. The nested bar gets no explicit position so it does
    # not overwrite the outer "Layer" bar.
    mega_embeddings = {
        word: get_average_word_embeddings(word, data, embeds)
        for word in tqdm(unique_words, desc=f"Layer {LAYER} words", leave=False)
    }

    with open(f"{FOLDER_NAME}/prototype_embeddings_bert_{LAYER}.pkl", "wb") as filino:
        pickle.dump(mega_embeddings, filino)



100%|██████████| 2/2 [00:00<00:00, 889.66it/s][A
100%|██████████| 2/2 [00:00<00:00, 1416.28it/s]
100%|██████████| 2/2 [00:00<00:00, 1435.67it/s]
100%|██████████| 2/2 [00:00<00:00, 1649.35it/s]
100%|██████████| 2/2 [00:00<00:00, 1368.67it/s]
100%|██████████| 2/2 [00:00<00:00, 1586.65it/s]
100%|██████████| 2/2 [00:00<00:00, 2298.88it/s]
100%|██████████| 2/2 [00:00<00:00, 1257.29it/s]
100%|██████████| 2/2 [00:00<00:00, 1247.19it/s]
100%|██████████| 2/2 [00:00<00:00, 1369.35it/s]
100%|██████████| 2/2 [00:00<00:00, 1108.43it/s]
100%|██████████| 2/2 [00:00<00:00, 1082.68it/s]
100%|██████████| 2/2 [00:00<00:00, 957.60it/s]
Layer: 100%|██████████| 13/13 [00:00<00:00, 168.43it/s]
