In [1]:
import logging
import pandas as pd

# Notebook-level logger; in the recorded output its records are tagged `__main__`.
logger = logging.getLogger(__name__)

In [2]:
# Load the saved model outputs (an id, the tweet text, and one column per model)
# and preview the first row.
outputs = pd.read_parquet(path='outputs/0S_100T_all_models_202345.parquet')
outputs.head(n=1)

Unnamed: 0,id,full_text,bloom,alpaca_3b,alpaca_770m,llama_13b,gpt4all,llama_7b
0,1642025557511532545,"the white paws, the cute collar, the tongue, t...","Yes, the above tweet is a potential lead. The...","Yes, this tweet is a potential lead because it...","Yes, the tweet is a potential lead because it ...",1. What is your analysis of the tweet in terms...,"\n\nAnswer:\n\nYes, the above tweet is a poten...",Yes. The reason is that it has all the charac...


# Embeddings

In [6]:
# Everything after the first two columns (id, full_text) is one model's output column.
model_cols = list(outputs.columns[2:])
print('Models: ' + str(model_cols))

Models: ['bloom', 'alpaca_3b', 'alpaca_770m', 'llama_13b', 'gpt4all', 'llama_7b']


## Sentence embeddings

Each model's raw output text is passed straight to a pretrained sentence-embedding model, used out of the box.

In [50]:
from models.embeddings import SentenceEmbeddings

# Independent working copy for the sentence-embedding section, so `outputs` stays untouched.
sentence_embeddings_df = outputs.copy(deep=True)

### mpnet

In [51]:
# Sentence-embedding wrapper for the 'mpnet' configuration. In the recorded run the log
# shows it loading the pretrained `all-mpnet-base-v2` SentenceTransformer on the CPU.
mpnet = SentenceEmbeddings(name='mpnet')

2023-04-18 16:20:28,636 - INFO     | models.embeddings | Initializing MPNET for Sentence Embeddings
2023-04-18 16:20:28,637 - INFO     | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: all-mpnet-base-v2
2023-04-18 16:20:29,527 - INFO     | sentence_transformers.SentenceTransformer | Use pytorch device: cpu


In [56]:
# Embed each model's raw outputs with MPNet; results are keyed by model column name.
mpnet_dict = {}
for col in model_cols:
    logger.info(f'Parsing model: {col}')
    model_outputs = sentence_embeddings_df[col]
    mpnet_dict[col] = mpnet.generate_embeddings(input_texts=model_outputs)

2023-04-18 16:23:32,775 - INFO     | __main__   | Parsing model: bloom
2023-04-18 16:23:32,779 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:23:33,668 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-18 16:23:33,668 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:23:35,948 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-18 16:23:35,948 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:23:38,186 - INFO     | __main__   | Parsing model: llama_13b
2023-04-18 16:23:38,186 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:23:46,858 - INFO     | __main__   | Parsing model: gpt4all
2023-04-18 16:23:46,859 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:23:50,769 - INFO     | __main__   | Parsing model: llama_7b
2023-04-18 16:23:50,769 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

### distil-roberta

In [58]:
# Sentence-embedding wrapper for the 'distil-roberta' configuration; the recorded log
# shows it loading the pretrained `all-distilroberta-v1` SentenceTransformer on the CPU.
# NOTE(review): the variable name is misspelled ("distilrberta"); it is left unchanged
# here because the embedding loop in the next cell refers to it by this name.
distilrberta = SentenceEmbeddings(name='distil-roberta')

2023-04-18 16:25:28,213 - INFO     | models.embeddings | Initializing DISTILROBERTA for Sentence Embeddings
2023-04-18 16:25:28,213 - INFO     | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: all-distilroberta-v1
2023-04-18 16:25:28,880 - INFO     | sentence_transformers.SentenceTransformer | Use pytorch device: cpu


In [59]:
# Embed each model's raw outputs with DistilRoBERTa; results are keyed by model column name.
distil_dict = {}
for col in model_cols:
    logger.info(f'Parsing model: {col}')
    model_outputs = sentence_embeddings_df[col]
    distil_dict[col] = distilrberta.generate_embeddings(input_texts=model_outputs)

2023-04-18 16:25:28,928 - INFO     | __main__   | Parsing model: bloom
2023-04-18 16:25:28,928 - INFO     | models.embeddings | DISTILROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:25:29,329 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-18 16:25:29,329 - INFO     | models.embeddings | DISTILROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:25:30,460 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-18 16:25:30,461 - INFO     | models.embeddings | DISTILROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:25:31,497 - INFO     | __main__   | Parsing model: llama_13b
2023-04-18 16:25:31,498 - INFO     | models.embeddings | DISTILROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:25:36,035 - INFO     | __main__   | Parsing model: gpt4all
2023-04-18 16:25:36,035 - INFO     | models.embeddings | DISTILROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:25:37,949 - INFO     | __main__   | Parsing model: llama_7b
2023-04-18 16:25:37,950 - INFO     | models.embeddings | DISTILROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

## Word Embeddings

Tokenizing each model output, generating a word embedding for each token, and averaging those embeddings to obtain one vector per output.

In [7]:
import random
import nltk
import numpy as np

from nltk import word_tokenize
from nltk.corpus import stopwords

from models.embeddings import WordEmbeddings
from helpers.embeddings_helpers import clean_and_tokenize_text

# Fetch the NLTK resources used below (both were reported as already up-to-date
# in the recorded run).
for resource in ("stopwords", "punkt"):
    nltk.download(resource)

# Seed Python's and NumPy's global random number generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# English stop words, held as a set.
STOPWORDS = {*stopwords.words("english")}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lorenzo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lorenzo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Build the word-embedding working frame: every column after the first one (`id`) --
# the tweet text and each model's output -- is cast to str; `id` keeps its dtype.
text_cols = outputs.columns[1:]
word_embeddings_df = outputs.astype({name: 'str' for name in text_cols})

In [9]:
# Create word-level tokens for each model output.
# The tokens are always derived from the raw `outputs` frame (cast to str) instead of
# from the column being overwritten, so re-running this cell produces the same token
# lists rather than feeding already-tokenized lists back into the tokenizer.
for col in model_cols:
    word_embeddings_df[col] = outputs[col].astype('str').map(
        lambda x: clean_and_tokenize_text(x, tokenizer=word_tokenize, stopwords=STOPWORDS)
    )
display(word_embeddings_df.head(1))
# Note: this prints the whole first row (id and full_text included), not only the tokens.
print(f'Tokens: {word_embeddings_df.iloc[0]}')

Unnamed: 0,id,full_text,bloom,alpaca_3b,alpaca_770m,llama_13b,gpt4all,llama_7b
0,1642025557511532545,"the white paws, the cute collar, the tongue, t...","[yes, tweet, potential, lead, tweet, potential...","[yes, tweet, potential, lead, describing, pote...","[yes, tweet, potential, lead, contains, lot, p...","[analysis, tweet, terms, marketing, customer, ...","[answer, yes, tweet, potential, lead, tweet, c...","[yes, reason, characteristics, potential, lead..."


Tokens: id                                           1642025557511532545
full_text      the white paws, the cute collar, the tongue, t...
bloom          [yes, tweet, potential, lead, tweet, potential...
alpaca_3b      [yes, tweet, potential, lead, describing, pote...
alpaca_770m    [yes, tweet, potential, lead, contains, lot, p...
llama_13b      [analysis, tweet, terms, marketing, customer, ...
gpt4all        [answer, yes, tweet, potential, lead, tweet, c...
llama_7b       [yes, reason, characteristics, potential, lead...
Name: 0, dtype: object


### GloVe Twitter

In [10]:
# Word-embedding wrapper for the 'glove-twitter' configuration. Per the recorded log this
# loads the gensim `glove-twitter-200` vectors (a 1193514 x 200 float32 matrix) from the
# local gensim-data directory, which took roughly 70 seconds in that run.
glove = WordEmbeddings(name='glove-twitter')

2023-04-18 16:32:10,495 - INFO     | models.embeddings | Initializing GLOVE-TWITTER for Word Embeddings
2023-04-18 16:32:10,572 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/glove-twitter-200/glove-twitter-200.gz
2023-04-18 16:33:21,290 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (1193514, 200) matrix of type float32 from /Users/lorenzo/gensim-data/glove-twitter-200/glove-twitter-200.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-04-18T16:33:21.290020', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [11]:
# Embed each model's token lists with the GloVe Twitter vectors, keyed by model column name.
glove_dict = {}
for col in model_cols:
    logger.info(f'Parsing model: {col}')
    token_lists = word_embeddings_df[col]
    glove_dict[col] = glove.generate_embeddings(token_lists)

2023-04-18 16:33:21,329 - INFO     | __main__   | Parsing model: bloom
2023-04-18 16:33:21,332 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-18 16:33:21,340 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-18 16:33:21,342 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-18 16:33:21,348 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-18 16:33:21,349 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-18 16:33:21,355 - INFO     | __main__   | Parsing model: llama_13b
2023-04-18 16:33:21,357 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-18 16:33:21,371 - INFO     | __main__   | Parsing model: gpt4all
2023-04-18 16:33:21,372 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-18 16:33:21,379 - INFO     | __main__   | Parsing model: llama_7b
2023-04-18 16:33:21,

### Word2Vec Google

In [13]:
# Word-embedding wrapper for the 'w2v-google' configuration. Per the recorded log this
# loads the gensim `word2vec-google-news-300` vectors (a 3000000 x 300 float32 matrix)
# from the local gensim-data directory, which took roughly 24 seconds in that run.
word2vec = WordEmbeddings(name='w2v-google')

2023-04-18 16:33:21,877 - INFO     | models.embeddings | Initializing W2V-GOOGLE for Word Embeddings
2023-04-18 16:33:21,957 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2023-04-18 16:33:45,700 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /Users/lorenzo/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-04-18T16:33:45.700662', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [14]:
# Embed each model's token lists with the Google News word2vec vectors, keyed by model column name.
w2v_dict = {}
for col in model_cols:
    logger.info(f'Parsing model: {col}')
    token_lists = word_embeddings_df[col]
    w2v_dict[col] = word2vec.generate_embeddings(token_lists)

2023-04-18 16:34:06,653 - INFO     | __main__   | Parsing model: bloom
2023-04-18 16:34:06,654 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-18 16:34:06,657 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-18 16:34:06,657 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-18 16:34:06,662 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-18 16:34:06,662 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-18 16:34:06,667 - INFO     | __main__   | Parsing model: llama_13b
2023-04-18 16:34:06,668 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-18 16:34:06,679 - INFO     | __main__   | Parsing model: gpt4all
2023-04-18 16:34:06,679 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-18 16:34:06,690 - INFO     | __main__   | Parsing model: llama_7b
2023-04-18 16:34:06,691 - INFO     

### GloVe Wikipedia

In [17]:
# Word-embedding wrapper for the 'glove-wiki' configuration. Per the recorded log this
# loads the gensim `glove-wiki-gigaword-300` vectors (a 400000 x 300 float32 matrix)
# from the local gensim-data directory, which took roughly 34 seconds in that run.
wiki = WordEmbeddings(name='glove-wiki')

2023-04-18 16:35:05,089 - INFO     | models.embeddings | Initializing GLOVE-WIKI for Word Embeddings
2023-04-18 16:35:05,163 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
2023-04-18 16:35:39,291 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/lorenzo/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-04-18T16:35:39.291208', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [18]:
# Embed each model's token lists with the GloVe Wikipedia vectors, keyed by model column name.
wiki_dict = {}
for col in model_cols:
    logger.info(f'Parsing model: {col}')
    token_lists = word_embeddings_df[col]
    wiki_dict[col] = wiki.generate_embeddings(token_lists)

2023-04-18 16:35:39,311 - INFO     | __main__   | Parsing model: bloom
2023-04-18 16:35:39,312 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-18 16:35:39,316 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-18 16:35:39,316 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-18 16:35:39,321 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-18 16:35:39,322 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-18 16:35:39,327 - INFO     | __main__   | Parsing model: llama_13b
2023-04-18 16:35:39,327 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-18 16:35:39,338 - INFO     | __main__   | Parsing model: gpt4all
2023-04-18 16:35:39,338 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-18 16:35:39,346 - INFO     | __main__   | Parsing model: llama_7b
2023-04-18 16:35:39,346 - INFO     