In [1]:
import logging
import pandas as pd

# Notebook-level logger; the embedding loops in later cells use it to report
# which model column is being processed.
logger = logging.getLogger(__name__)

In [2]:
# Load the stored model outputs. The one-row preview shows an `id`, the source
# `full_text`, and one generated-answer column per LLM.
OUTPUTS_PATH = 'outputs/0S_100T_all_models_202345.parquet'
outputs = pd.read_parquet(OUTPUTS_PATH)
outputs.head(1)

Unnamed: 0,id,full_text,bloom,alpaca_3b,alpaca_770m,llama_13b,gpt4all,llama_7b
0,1642025557511532545,"the white paws, the cute collar, the tongue, t...","Yes, the above tweet is a potential lead. The...","Yes, this tweet is a potential lead because it...","Yes, the tweet is a potential lead because it ...",1. What is your analysis of the tweet in terms...,"\n\nAnswer:\n\nYes, the above tweet is a poten...",Yes. The reason is that it has all the charac...


# Embeddings

In [3]:
# Every column other than the tweet metadata holds one LLM's generated output.
# Excluding the metadata columns by name (instead of slicing by position with
# `columns[2:]`) keeps the list correct even if the column order of the
# parquet file changes; on the current file the result is identical.
NON_MODEL_COLS = ('id', 'full_text')
model_cols = [name for name in outputs.columns if name not in NON_MODEL_COLS]
print(f'Models: {model_cols}')

Models: ['bloom', 'alpaca_3b', 'alpaca_770m', 'llama_13b', 'gpt4all', 'llama_7b']


## Sentence embeddings

Using sentence embeddings out of the box.

In [4]:
from models.embeddings import SentenceEmbeddings

# Separate copy of the loaded outputs for the sentence-embedding section, so
# `outputs` itself is not modified by anything done to this frame.
sentence_embeddings_df = outputs.copy()

2023-04-20 17:30:31,401 - INFO     | config     | Loading environment variables


### mpnet

In [5]:
# Project wrapper selected by name; per this cell's log output it loads the
# pretrained `all-mpnet-base-v2` SentenceTransformer (on CPU in this run).
mpnet = SentenceEmbeddings(name='mpnet')

2023-04-20 17:30:48,059 - INFO     | models.embeddings | Initializing MPNET for Sentence Embeddings
2023-04-20 17:30:48,060 - INFO     | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: all-mpnet-base-v2
2023-04-20 17:30:48,927 - INFO     | sentence_transformers.SentenceTransformer | Use pytorch device: cpu


In [6]:
# Maps each model column name to whatever `mpnet.generate_embeddings` returns
# for that column's texts.
mpnet_dict = {}
for col, texts in sentence_embeddings_df[model_cols].items():
    logger.info(f'Parsing model: {col}')
    mpnet_dict[col] = mpnet.generate_embeddings(input_texts=texts)

2023-04-20 17:30:48,946 - INFO     | __main__   | Parsing model: bloom
2023-04-20 17:30:48,949 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-20 17:30:49,930 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-20 17:30:49,930 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-20 17:30:52,294 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-20 17:30:52,295 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-20 17:30:54,533 - INFO     | __main__   | Parsing model: llama_13b
2023-04-20 17:30:54,534 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-20 17:31:03,362 - INFO     | __main__   | Parsing model: gpt4all
2023-04-20 17:31:03,362 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-20 17:31:07,358 - INFO     | __main__   | Parsing model: llama_7b
2023-04-20 17:31:07,359 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

### distil-roberta

In [7]:
# Project wrapper selected by name; per this cell's log output it loads the
# pretrained `all-distilroberta-v1` SentenceTransformer.
# NOTE(review): the variable name is misspelled ("distilrberta"); it is kept
# as-is because the next cell references it under this name.
distilrberta = SentenceEmbeddings(name='distil-roberta')

2023-04-20 17:31:15,395 - INFO     | models.embeddings | Initializing DISTIL-ROBERTA for Sentence Embeddings
2023-04-20 17:31:15,395 - INFO     | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: all-distilroberta-v1
2023-04-20 17:31:16,058 - INFO     | sentence_transformers.SentenceTransformer | Use pytorch device: cpu


In [8]:
# Maps each model column name to whatever `distilrberta.generate_embeddings`
# returns for that column's texts.
distil_dict = {}
for col, texts in sentence_embeddings_df[model_cols].items():
    logger.info(f'Parsing model: {col}')
    distil_dict[col] = distilrberta.generate_embeddings(input_texts=texts)

2023-04-20 17:31:16,078 - INFO     | __main__   | Parsing model: bloom
2023-04-20 17:31:16,079 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-20 17:31:16,473 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-20 17:31:16,474 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-20 17:31:17,636 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-20 17:31:17,636 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-20 17:31:18,687 - INFO     | __main__   | Parsing model: llama_13b
2023-04-20 17:31:18,688 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-20 17:31:23,464 - INFO     | __main__   | Parsing model: gpt4all
2023-04-20 17:31:23,465 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-20 17:31:25,342 - INFO     | __main__   | Parsing model: llama_7b
2023-04-20 17:31:25,342 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

## Word Embeddings

Each output is tokenized, every token is mapped to a word embedding, and the token embeddings are averaged to create one vector per output.

In [9]:
import random
import nltk
import numpy as np

from nltk import word_tokenize
from nltk.corpus import stopwords

from models.embeddings import WordEmbeddings
from helpers.embeddings_helpers import clean_and_tokenize_text

# Make sure the NLTK resources are available locally (the stopword corpus is
# read below; "punkt" is presumably what `word_tokenize` relies on).
for resource in ("stopwords", "punkt"):
    nltk.download(resource)

# Seed both the stdlib and NumPy global RNGs with the same value.
SEED = 42
for seed_rng in (random.seed, np.random.seed):
    seed_rng(SEED)

# English stopwords as a set for fast membership checks during tokenization.
STOPWORDS = {*stopwords.words("english")}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lorenzo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lorenzo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Build the working frame for the word-embedding section: a new frame derived
# from `outputs` in which every column after the first (the tweet text and all
# model outputs) is cast to string. `astype` returns a new object, so `outputs`
# is left unchanged.
text_cols = outputs.columns[1:]
word_embeddings_df = outputs.astype({name: 'str' for name in text_cols})

In [11]:
# Replace each model-output string with its word-level tokens.
# NOTE(review): the columns are overwritten in place, so running this cell a
# second time would feed the token lists (not the original strings) back into
# the tokenizer; re-run the previous cell first.
def _tokenize(text):
    """Clean and tokenize one output using NLTK's word tokenizer and the English stopword set."""
    return clean_and_tokenize_text(text, tokenizer=word_tokenize, stopwords=STOPWORDS)


for col in model_cols:
    word_embeddings_df[col] = word_embeddings_df[col].map(_tokenize)

display(word_embeddings_df.head(1))
print(f'Tokens: {word_embeddings_df.iloc[0]}')

Unnamed: 0,id,full_text,bloom,alpaca_3b,alpaca_770m,llama_13b,gpt4all,llama_7b
0,1642025557511532545,"the white paws, the cute collar, the tongue, t...","[yes, tweet, potential, lead, tweet, potential...","[yes, tweet, potential, lead, describing, pote...","[yes, tweet, potential, lead, contains, lot, p...","[analysis, tweet, terms, marketing, customer, ...","[answer, yes, tweet, potential, lead, tweet, c...","[yes, reason, characteristics, potential, lead..."


Tokens: id                                           1642025557511532545
full_text      the white paws, the cute collar, the tongue, t...
bloom          [yes, tweet, potential, lead, tweet, potential...
alpaca_3b      [yes, tweet, potential, lead, describing, pote...
alpaca_770m    [yes, tweet, potential, lead, contains, lot, p...
llama_13b      [analysis, tweet, terms, marketing, customer, ...
gpt4all        [answer, yes, tweet, potential, lead, tweet, c...
llama_7b       [yes, reason, characteristics, potential, lead...
Name: 0, dtype: object


### Glove Twitter

In [12]:
# Project wrapper selected by name; per this cell's log output it loads the
# gensim `glove-twitter-200` vectors (a 1193514 x 200 matrix), which took
# about 70 seconds in the recorded run.
glove = WordEmbeddings(name='glove-twitter')

2023-04-20 17:31:29,754 - INFO     | models.embeddings | Initializing GLOVE-TWITTER for Word Embeddings
2023-04-20 17:31:30,062 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/glove-twitter-200/glove-twitter-200.gz
2023-04-20 17:32:41,553 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (1193514, 200) matrix of type float32 from /Users/lorenzo/gensim-data/glove-twitter-200/glove-twitter-200.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-04-20T17:32:41.553818', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [13]:
# Maps each model column name to whatever `glove.generate_embeddings` returns
# for that column's token lists (presumably one averaged vector per output, as
# the section intro describes; confirm in WordEmbeddings).
glove_dict = {}
for col, tokens in word_embeddings_df[model_cols].items():
    logger.info(f'Parsing model: {col}')
    glove_dict[col] = glove.generate_embeddings(tokens)

2023-04-20 17:32:41,595 - INFO     | __main__   | Parsing model: bloom
2023-04-20 17:32:41,595 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-20 17:32:41,599 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-20 17:32:41,599 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-20 17:32:41,605 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-20 17:32:41,605 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-20 17:32:41,611 - INFO     | __main__   | Parsing model: llama_13b
2023-04-20 17:32:41,611 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-20 17:32:41,622 - INFO     | __main__   | Parsing model: gpt4all
2023-04-20 17:32:41,623 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-20 17:32:41,629 - INFO     | __main__   | Parsing model: llama_7b
2023-04-20 17:32:41,

### Word2Vec Google

In [14]:
# Project wrapper selected by name; per this cell's log output it loads the
# gensim `word2vec-google-news-300` vectors (a 3000000 x 300 matrix).
word2vec = WordEmbeddings(name='w2v-google')

2023-04-20 17:32:41,662 - INFO     | models.embeddings | Initializing W2V-GOOGLE for Word Embeddings
2023-04-20 17:32:42,049 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2023-04-20 17:33:06,486 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /Users/lorenzo/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-04-20T17:33:06.486119', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [15]:
# Maps each model column name to whatever `word2vec.generate_embeddings`
# returns for that column's token lists.
w2v_dict = {}
for col, tokens in word_embeddings_df[model_cols].items():
    logger.info(f'Parsing model: {col}')
    w2v_dict[col] = word2vec.generate_embeddings(tokens)

2023-04-20 17:33:06,526 - INFO     | __main__   | Parsing model: bloom
2023-04-20 17:33:06,708 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-20 17:33:06,711 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-20 17:33:06,712 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-20 17:33:06,718 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-20 17:33:06,719 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-20 17:33:06,725 - INFO     | __main__   | Parsing model: llama_13b
2023-04-20 17:33:06,726 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-20 17:33:06,737 - INFO     | __main__   | Parsing model: gpt4all
2023-04-20 17:33:06,738 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-20 17:33:06,745 - INFO     | __main__   | Parsing model: llama_7b
2023-04-20 17:33:06,745 - INFO     

### Glove Wikipedia

In [16]:
# Project wrapper selected by name; per this cell's log output it loads the
# gensim `glove-wiki-gigaword-300` vectors (a 400000 x 300 matrix).
wiki = WordEmbeddings(name='glove-wiki')

2023-04-20 17:33:06,787 - INFO     | models.embeddings | Initializing GLOVE-WIKI for Word Embeddings
2023-04-20 17:33:06,929 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
2023-04-20 17:33:41,278 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/lorenzo/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-04-20T17:33:41.278205', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [17]:
# Maps each model column name to whatever `wiki.generate_embeddings` returns
# for that column's token lists.
wiki_dict = {}
for col, tokens in word_embeddings_df[model_cols].items():
    logger.info(f'Parsing model: {col}')
    wiki_dict[col] = wiki.generate_embeddings(tokens)

2023-04-20 17:33:41,320 - INFO     | __main__   | Parsing model: bloom
2023-04-20 17:33:41,321 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-20 17:33:41,324 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-20 17:33:41,325 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-20 17:33:41,330 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-20 17:33:41,331 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-20 17:33:41,336 - INFO     | __main__   | Parsing model: llama_13b
2023-04-20 17:33:41,336 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-20 17:33:41,347 - INFO     | __main__   | Parsing model: gpt4all
2023-04-20 17:33:41,348 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-20 17:33:41,355 - INFO     | __main__   | Parsing model: llama_7b
2023-04-20 17:33:41,356 - INFO     

### Save Embeddings

In [None]:
# Switch for the next cell: set to True to pickle the embedding dictionaries
# to disk; left False so a plain run of the notebook writes nothing.
SAVE_EMBEDDINGS = False

In [38]:
if SAVE_EMBEDDINGS:
    import pickle
    from pathlib import Path

    # Target directory for the pickles. It is created on demand because
    # open(..., 'wb') does not create missing parent directories and would
    # otherwise raise FileNotFoundError on a fresh checkout.
    embeddings_dir = Path('embeddings')
    embeddings_dir.mkdir(parents=True, exist_ok=True)

    # File-name prefix -> per-model embedding dictionary built in the cells above.
    embeddings_data = {
        'mpnet': mpnet_dict,
        'distil': distil_dict,
        'glove': glove_dict,
        'wiki': wiki_dict,
        'w2v': w2v_dict,
    }

    # One pickle per embedding model, e.g. embeddings/mpnet_embeddings_test.pkl.
    # NOTE: pickle files must only ever be loaded back from trusted sources.
    for filename, data in embeddings_data.items():
        with open(embeddings_dir / f'{filename}_embeddings_test.pkl', 'wb') as f:
            pickle.dump(data, f)

# Clustering