In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [15]:
import os
from psycho_embeddings.fresh_embedder import ContextualizedEmbedder
import numpy as np
import pandas as pd
from glob import glob
import pickle
import gc
from tqdm import tqdm
from operator import itemgetter
from numpy import zeros, dtype, float32 as REAL, ascontiguousarray, fromstring
from gensim import utils
import gensim

  return torch._C._cuda_getDeviceCount() > 0


In [16]:
# Embedder wrapping the "bert-base-cased" checkpoint, run on CPU.
# max_length is forwarded to the embedder (presumably the tokenizer length cap — TODO confirm).
model = ContextualizedEmbedder(
    "bert-base-cased",
    max_length=300,
    device="cpu",
)

loading configuration file config.json from cache at /home/vinid/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at /home/vinid/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f746

In [17]:
def find_index_for_word(word, data):
    """
    Return the index labels of the rows of ``data`` whose "words" column equals ``word``.

    The labels come back as a plain Python list, in dataframe order; the list is
    empty when the word does not occur.
    """
    is_match = (data["words"] == word).to_numpy()
    return data.index[is_match].tolist()

def get_average_word_embeddings(word, data, embeds):
    """
    Average the embeddings of every occurrence of ``word`` in ``data``.

    Parameters
    ----------
    word
        Value looked up in the "words" column of ``data``.
    data
        Dataframe with a "words" column.
    embeds
        Indexable collection of embedding vectors. It is indexed with the
        dataframe index labels returned by ``find_index_for_word``.
        NOTE(review): this assumes ``data`` has a default 0..n-1 index so that
        labels coincide with row positions in ``embeds`` — confirm for real data.

    Returns
    -------
    numpy.ndarray
        The element-wise mean over all occurrences, or a copy of the single
        embedding when the word occurs exactly once.

    Raises
    ------
    IndexError
        If ``word`` does not occur in ``data``.
    """
    idxs = find_index_for_word(word, data)  # flat list of index labels

    if not idxs:
        # Same exception type the bare ``idxs[0]`` lookup would raise, but with
        # a message that says which word is missing.
        raise IndexError(f"word {word!r} not found in data['words']")

    if len(idxs) == 1:
        return np.array(embeds[idxs[0]])

    # Collect one vector per occurrence and average across occurrences.
    return np.average(np.asarray([embeds[i] for i in idxs]), axis=0)

In [18]:
# Toy dataset: each row pairs a target word with a sentence that contains it.
data = pd.DataFrame(
    [
        ("cat", "the cat is on the table"),
        ("dog", "the dog is on the table"),
        ("cat", "the cat is on the table"),
    ],
    columns=["words", "target_text"],
)

In [19]:
# Number of dataframe rows per chunk handed to the embedder in one call
SIZE_CHUNKS = 2
# Root directory under which every file produced by this notebook is written
FOLDER_NAME = "bert_embeddings"

In [20]:
# Layer ids requested from the embedder: the 13 values 0 through 12.
layers_of_interest = list(range(13))

# Consecutive row slices of `data`, each holding at most SIZE_CHUNKS rows.
list_df = [
    data[start:start + SIZE_CHUNKS]
    for start in range(0, len(data), SIZE_CHUNKS)
]

# Create Embeddings for the Entire Dataset

In [21]:
# Embed the dataset one chunk at a time and pickle each layer's output to
# {FOLDER_NAME}/{layer}/temp/bert_embeddings_{index}.
# The progress bar is used as a context manager so it is always closed, even if
# embedding fails part-way; an unclosed bar gets re-rendered by later cells.
with tqdm(total=len(list_df), position=0) as pbar:
    for index, sub_portion_od_data in enumerate(list_df):

        #############################
        # DUMPING EMBEDDING ON DISK #
        #############################

        df_slice_embedded = embeddings = model.embed(
            words=sub_portion_od_data["words"].tolist(),
            layers_id=layers_of_interest,
            target_texts=sub_portion_od_data["target_text"].tolist(),
            batch_size=8,
            averaging=True,
            return_static=True,
            show_progress=True
        )

        # Key -1 is stored alongside the requested layers (presumably the static
        # embeddings enabled by return_static=True — TODO confirm).
        for layer in [-1] + layers_of_interest:
            os.makedirs(f"{FOLDER_NAME}/{layer}/temp/", exist_ok=True)

            # The chunk index is the filename suffix; the reconstruction cell
            # sorts on it to restore the original row order.
            with open(f"{FOLDER_NAME}/{layer}/temp/bert_embeddings_{index}", "wb") as filino:
                pickle.dump((df_slice_embedded[layer]), filino)

        # Force a garbage-collection pass every 10 chunks.
        if index%10==0:
            gc.collect()
        pbar.update(1)

  0%|                                                       | 0/2 [00:00<?, ?it/s]

Text tokenization:   0%|          | 0/2 [00:00<?, ?ex/s]

100%|███████████████████████████████████████████████| 1/1 [00:00<00:00,  1.41it/s]
 50%|███████████████████████▌                       | 1/2 [00:01<00:01,  1.52s/it]

Text tokenization:   0%|          | 0/1 [00:00<?, ?ex/s]

100%|███████████████████████████████████████████████| 1/1 [00:00<00:00,  2.60it/s]
100%|███████████████████████████████████████████████| 2/2 [00:02<00:00,  1.18s/it]

# Reconstruct and Save Contextualized Embeddings

In [22]:
##################
# MAP 2 Sentence #
##################

# NOTE:
# NOTE: This is probably dataset specific? but also, do we really need this? seems to be only the index dumped on disk?
# NOTE:

# The mapping depends only on `data`, never on the layer, so it is built once
# here instead of re-iterating the dataframe for every layer.
map_sentrepl2emb = {
    (row["words"], row["words"], row["target_text"], idx): idx for idx, row in data.iterrows()
}

# Walk the same layer keys that the embedding cell wrote to disk.
for LAYER in tqdm([-1] + layers_of_interest, desc="Layer"):
    # We load all the embeddings from disk, in order and reconstruct the actual embedding for a specific layer for the entire dataframe.

    # Sort on the numeric chunk suffix; a plain string sort would put 10 before 2.
    emb_files = sorted(glob(f"{FOLDER_NAME}/{LAYER}/temp/*"), key=lambda x: int(os.path.basename(x).split("_")[-1]))
    assert len(emb_files) == len(list_df), (
        f"layer {LAYER}: found {len(emb_files)} chunk files, expected {len(list_df)}"
    )  # sanity check

    all_the_embeddings = []

    for ff in tqdm(emb_files, leave=False):
        # pickle.load executes arbitrary code from the file; acceptable here only
        # because these files are the ones this notebook wrote itself.
        with open(ff, "rb") as filino:
            ldata = pickle.load(filino)

        for value in ldata:
            # Entries wrapped in a length-1 container are unwrapped
            # (presumably so all rows stack into one 2-D array — TODO confirm).
            if len(value) == 1:
                all_the_embeddings.append(np.array(value[0]))
            else:
                all_the_embeddings.append(np.array(value))

    all_the_embeddings = np.array(all_the_embeddings)

    with open(f'{FOLDER_NAME}/contextualized_embeddings_bert_{LAYER}_layer.npy', 'wb') as f:
        np.save(f, all_the_embeddings)

    # Drop the reference before the next layer is loaded.
    del all_the_embeddings

    # One copy of the (layer-independent) map is still written per layer.
    with open(f"{FOLDER_NAME}/map_sentrepl2embbert_{LAYER}.pkl", "wb") as file_to_save:
        pickle.dump(map_sentrepl2emb, file_to_save)




100%|███████████████████████████████████████████████| 2/2 [00:02<00:00,  1.24s/it][A
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 1636.16it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 5949.37it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 6100.81it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 5226.55it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 5979.05it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 4996.19it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 4657.75it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 5793.24it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 6168.09it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 7002.18it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 5136.93it/s]


### Prototype Embeddings

In [23]:
# Walk the same layer keys that the embedding cell wrote to disk.
for LAYER in tqdm([-1] + layers_of_interest, desc="Layer"):

    ##############################
    # Build Prototype Embeddings #
    ##############################

    embeds = np.load(f"{FOLDER_NAME}/contextualized_embeddings_bert_{LAYER}_layer.npy")

    # One prototype per distinct word: the mean of that word's contextualized
    # embeddings over all of its rows in `data`.
    unique_words = data["words"].unique()
    mega_embeddings = {}
    for word in tqdm(unique_words, leave=False):
        mega_embeddings[word] = get_average_word_embeddings(word, data, embeds)

    # Take the vector size from the loaded array instead of hard-coding 768,
    # so the cell keeps working with a model of a different hidden size.
    m = gensim.models.keyedvectors.Word2VecKeyedVectors(vector_size=embeds.shape[-1])
    m.add_vectors(list(mega_embeddings.keys()), list(mega_embeddings.values()))
    # NOTE(review): save_word2vec_format is called with its default format
    # argument although the file is named ".bin" — confirm which format the
    # readers of this file expect before changing it.
    m.save_word2vec_format(f"{FOLDER_NAME}/gensim_prototype_embeddings_bert_{LAYER}.bin")

    with open(f"{FOLDER_NAME}/prototype_embeddings_bert_{LAYER}.pkl", "wb") as filino:
        pickle.dump(mega_embeddings, filino)


100%|██████████████████████████████████████████████| 2/2 [00:00<00:00, 412.87it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 2077.93it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 2470.87it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 1503.87it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 2465.06it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 2109.28it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 1882.54it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 2607.59it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 1652.93it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 1065.36it/s]
100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 1188.69it/s]
100%|██████████████████████████████████████████████| 2/2 [00:00<00:00, 812.14it/s]
100%

### Non Contextualized Embeddings

#### Method 1

In [37]:
# Distinct words of the dataset, in order of first appearance.
words_to_embed = data["words"].drop_duplicates().tolist()

In [34]:
# Embed every distinct word using the word itself as its target text.
df_slice_embedded = model.embed(
    words=words_to_embed,
    target_texts=words_to_embed,
    layers_id=layers_of_interest,
    batch_size=8,
    averaging=True,
    return_static=True,
    show_progress=True,
)
# Keep the second name bound to the same result object.
embeddings = df_slice_embedded

Text tokenization:   0%|          | 0/2 [00:00<?, ?ex/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.62it/s]


#### Method 2

In [50]:
# Export, for every layer key, the per-word vectors held in `df_slice_embedded`
# as a gensim keyed-vectors file.
for layer in [-1] + layers_of_interest:
    layer_vectors = df_slice_embedded[layer]

    # Rows of the embedder output are paired with `words_to_embed` by position.
    mega_embeddings = {
        word: layer_vectors[position]
        for position, word in enumerate(words_to_embed)
    }

    m = gensim.models.keyedvectors.Word2VecKeyedVectors(vector_size=768)
    m.add_vectors(list(mega_embeddings.keys()), list(mega_embeddings.values()))
    # The filename must use this loop's `layer`. The previous `{LAYER}` referred
    # to the leftover variable of an earlier cell, so every layer was written to
    # one and the same file.
    m.save_word2vec_format(f"{FOLDER_NAME}/gensim_non_contextual_prototype_embeddings_bert_{layer}.bin")

In [59]:
# Embed the distinct words without target texts; the embedder logs that it then
# extracts non-contextualized embeddings.
# `words_to_embed` is reused (it is defined as data["words"].unique().tolist())
# because the export cell labels the rows of `embeddings` by position in that list.
embeddings = model.embed(
    words=words_to_embed,
    layers_id=layers_of_interest,
    batch_size=8,
    averaging=True,
    return_static=True,
    show_progress=True,
    add_special_tokens=True,
)

Target texts is None: extracting non contextualized embeddings.


Text tokenization:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|                                                                                                                                              | 0/1 [00:00<?, ?it/s]


In [None]:
# Export, for every layer key, the per-word vectors held in `embeddings`
# as a gensim keyed-vectors file.
# NOTE(review): the filename pattern is the same one the `df_slice_embedded`
# export cell uses, so running both cells overwrites the earlier files — confirm
# that this is intended.
for layer in [-1] + layers_of_interest:
    layer_vectors = embeddings[layer]

    # Rows of the embedder output are paired with `words_to_embed` by position.
    mega_embeddings = {
        word: layer_vectors[position]
        for position, word in enumerate(words_to_embed)
    }

    m = gensim.models.keyedvectors.Word2VecKeyedVectors(vector_size=768)
    m.add_vectors(list(mega_embeddings.keys()), list(mega_embeddings.values()))
    # The filename must use this loop's `layer`. The previous `{LAYER}` referred
    # to the leftover variable of an earlier cell, so every layer was written to
    # one and the same file.
    m.save_word2vec_format(f"{FOLDER_NAME}/gensim_non_contextual_prototype_embeddings_bert_{layer}.bin")