In [1]:
import logging
import pandas as pd

# Logger for this notebook's own progress messages (shows up as `__main__` in the cell outputs).
# NOTE(review): no handler or level is configured in this notebook; the formatted INFO lines in
# later outputs presumably rely on logging setup done when the project modules are imported — confirm.
logger = logging.getLogger(__name__)

In [2]:
# Parquet file with one row per tweet and one column per model's generated answer.
OUTPUTS_PATH = 'outputs/0S_100T_all_models_202345.parquet'

outputs = pd.read_parquet(OUTPUTS_PATH)
outputs.head(n=1)

Unnamed: 0,id,full_text,bloom,alpaca_3b,alpaca_770m,llama_13b,gpt4all,llama_7b
0,1642025557511532545,"the white paws, the cute collar, the tongue, t...","Yes, the above tweet is a potential lead. The...","Yes, this tweet is a potential lead because it...","Yes, the tweet is a potential lead because it ...",1. What is your analysis of the tweet in terms...,"\n\nAnswer:\n\nYes, the above tweet is a poten...",Yes. The reason is that it has all the charac...


# Embeddings

In [3]:
# Every column other than the tweet id and the tweet text holds one model's generated answers.
# Selecting by name (instead of slicing by position) keeps the model list correct even if the
# parquet file is re-exported with the metadata columns in a different position.
NON_MODEL_COLS = ['id', 'full_text']
missing_cols = [name for name in NON_MODEL_COLS if name not in outputs.columns]
assert not missing_cols, f'Expected metadata columns missing from outputs: {missing_cols}'

model_cols = [name for name in outputs.columns if name not in NON_MODEL_COLS]
print(f'Models: {model_cols}')

Models: ['bloom', 'alpaca_3b', 'alpaca_770m', 'llama_13b', 'gpt4all', 'llama_7b']


## Sentence embeddings

Each model output is embedded as-is with a pretrained sentence-embedding model, used out of the box (no fine-tuning).

In [4]:
from models.embeddings import SentenceEmbeddings

# Separate copy of the loaded frame for the sentence-embedding section, so the original
# `outputs` frame stays untouched for the other sections.
sentence_embeddings_df = outputs.copy()

2023-04-18 16:40:59,127 - INFO     | config     | Loading environment variables


### mpnet

In [5]:
# Project-local sentence-embedding wrapper (`models.embeddings`). In the recorded run it
# loaded the pretrained SentenceTransformer `all-mpnet-base-v2` on the CPU device.
mpnet = SentenceEmbeddings(name='mpnet')

2023-04-18 16:41:00,703 - INFO     | models.embeddings | Initializing MPNET for Sentence Embeddings
2023-04-18 16:41:00,704 - INFO     | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: all-mpnet-base-v2
2023-04-18 16:41:01,468 - INFO     | sentence_transformers.SentenceTransformer | Use pytorch device: cpu


In [6]:
# MPNet sentence embeddings of each model's raw outputs, keyed by model column name.
mpnet_dict = {}
for col in model_cols:
    logger.info(f'Parsing model: {col}')
    model_texts = sentence_embeddings_df[col]
    mpnet_dict[col] = mpnet.generate_embeddings(input_texts=model_texts)

2023-04-18 16:41:01,486 - INFO     | __main__   | Parsing model: bloom
2023-04-18 16:41:01,486 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:41:02,396 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-18 16:41:02,396 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:41:04,678 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-18 16:41:04,679 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:41:06,852 - INFO     | __main__   | Parsing model: llama_13b
2023-04-18 16:41:06,852 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:41:15,511 - INFO     | __main__   | Parsing model: gpt4all
2023-04-18 16:41:15,511 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:41:19,386 - INFO     | __main__   | Parsing model: llama_7b
2023-04-18 16:41:19,386 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

### distil-roberta

In [7]:
# Project-local sentence-embedding wrapper (`models.embeddings`). In the recorded run it
# loaded the pretrained SentenceTransformer `all-distilroberta-v1` on the CPU device.
# NOTE(review): the variable name is missing an 'o' (`distilrberta`); it is left unchanged
# here because the embedding loop in the following cell refers to it by this name.
distilrberta = SentenceEmbeddings(name='distil-roberta')

2023-04-18 16:41:27,571 - INFO     | models.embeddings | Initializing DISTIL-ROBERTA for Sentence Embeddings
2023-04-18 16:41:27,571 - INFO     | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: all-distilroberta-v1
2023-04-18 16:41:28,236 - INFO     | sentence_transformers.SentenceTransformer | Use pytorch device: cpu


In [8]:
# DistilRoBERTa sentence embeddings of each model's raw outputs, keyed by model column name.
distil_dict = {}
for col in model_cols:
    logger.info(f'Parsing model: {col}')
    model_texts = sentence_embeddings_df[col]
    distil_dict[col] = distilrberta.generate_embeddings(input_texts=model_texts)

2023-04-18 16:41:28,252 - INFO     | __main__   | Parsing model: bloom
2023-04-18 16:41:28,253 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:41:28,657 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-18 16:41:28,657 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:41:29,790 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-18 16:41:29,790 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:41:30,789 - INFO     | __main__   | Parsing model: llama_13b
2023-04-18 16:41:30,790 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:41:35,263 - INFO     | __main__   | Parsing model: gpt4all
2023-04-18 16:41:35,263 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2023-04-18 16:41:37,146 - INFO     | __main__   | Parsing model: llama_7b
2023-04-18 16:41:37,146 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

## Word Embeddings

Tokenize each model output, look up a word embedding for every token, and average those vectors to obtain one vector per output.

In [9]:
import random
import nltk
import numpy as np

from nltk import word_tokenize
from nltk.corpus import stopwords

from models.embeddings import WordEmbeddings
from helpers.embeddings_helpers import clean_and_tokenize_text

# Fix both RNG seeds up front so any stochastic step further down is repeatable.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# NLTK resources used below: the stop-word corpus and the punkt tokenizer data.
for nltk_resource in ("stopwords", "punkt"):
    nltk.download(nltk_resource)

# English stop words as a set, for fast membership checks during token cleaning.
STOPWORDS = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lorenzo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lorenzo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Cast every column after `id` (the tweet text and all model outputs) to strings in a single
# pass; `astype` returns a new frame, so `outputs` itself is left unchanged.
text_cols = outputs.columns[1:]
word_embeddings_df = outputs.astype({name: 'str' for name in text_cols})

In [11]:
# creating word level tokens for each model output
# The text is read from the untouched `outputs` frame (cast to str) rather than from
# `word_embeddings_df` itself: the cell overwrites those columns with token lists, so reading
# them back on a re-run would hand lists instead of strings to the tokenizer.
for col in model_cols:
    word_embeddings_df[col] = outputs[col].astype('str').map(
        lambda text: clean_and_tokenize_text(text, tokenizer=word_tokenize, stopwords=STOPWORDS)
    )

# Show the first row twice: as a rendered table and as a plain-text Series.
display(word_embeddings_df.head(1))
print(f'Tokens: {word_embeddings_df.iloc[0]}')

Unnamed: 0,id,full_text,bloom,alpaca_3b,alpaca_770m,llama_13b,gpt4all,llama_7b
0,1642025557511532545,"the white paws, the cute collar, the tongue, t...","[yes, tweet, potential, lead, tweet, potential...","[yes, tweet, potential, lead, describing, pote...","[yes, tweet, potential, lead, contains, lot, p...","[analysis, tweet, terms, marketing, customer, ...","[answer, yes, tweet, potential, lead, tweet, c...","[yes, reason, characteristics, potential, lead..."


Tokens: id                                           1642025557511532545
full_text      the white paws, the cute collar, the tongue, t...
bloom          [yes, tweet, potential, lead, tweet, potential...
alpaca_3b      [yes, tweet, potential, lead, describing, pote...
alpaca_770m    [yes, tweet, potential, lead, contains, lot, p...
llama_13b      [analysis, tweet, terms, marketing, customer, ...
gpt4all        [answer, yes, tweet, potential, lead, tweet, c...
llama_7b       [yes, reason, characteristics, potential, lead...
Name: 0, dtype: object


### GloVe Twitter

In [12]:
# Project-local word-embedding wrapper (`models.embeddings`). In the recorded run it loaded
# gensim's `glove-twitter-200` vectors (1,193,514 words x 200 dims) from the local gensim-data
# cache, which took roughly 70 seconds.
glove = WordEmbeddings(name='glove-twitter')

2023-04-18 16:41:41,587 - INFO     | models.embeddings | Initializing GLOVE-TWITTER for Word Embeddings
2023-04-18 16:41:41,675 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/glove-twitter-200/glove-twitter-200.gz
2023-04-18 16:42:52,150 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (1193514, 200) matrix of type float32 from /Users/lorenzo/gensim-data/glove-twitter-200/glove-twitter-200.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-04-18T16:42:52.150463', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [13]:
# GloVe-Twitter embeddings of each model's token lists, keyed by model column name.
glove_dict = {}
for col in model_cols:
    logger.info(f'Parsing model: {col}')
    model_tokens = word_embeddings_df[col]
    glove_dict[col] = glove.generate_embeddings(model_tokens)

2023-04-18 16:42:52,171 - INFO     | __main__   | Parsing model: bloom
2023-04-18 16:42:52,172 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-18 16:42:52,176 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-18 16:42:52,176 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-18 16:42:52,182 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-18 16:42:52,183 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-18 16:42:52,189 - INFO     | __main__   | Parsing model: llama_13b
2023-04-18 16:42:52,189 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-18 16:42:52,200 - INFO     | __main__   | Parsing model: gpt4all
2023-04-18 16:42:52,201 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...
2023-04-18 16:42:52,208 - INFO     | __main__   | Parsing model: llama_7b
2023-04-18 16:42:52,

### Word2Vec Google

In [14]:
# Project-local word-embedding wrapper (`models.embeddings`). In the recorded run it loaded
# gensim's `word2vec-google-news-300` vectors (3,000,000 words x 300 dims) from the local
# gensim-data cache, which took roughly 23 seconds.
word2vec = WordEmbeddings(name='w2v-google')

2023-04-18 16:42:52,240 - INFO     | models.embeddings | Initializing W2V-GOOGLE for Word Embeddings
2023-04-18 16:42:52,423 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2023-04-18 16:43:15,707 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /Users/lorenzo/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-04-18T16:43:15.707571', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [15]:
# Word2Vec (Google News) embeddings of each model's token lists, keyed by model column name.
w2v_dict = {}
for col in model_cols:
    logger.info(f'Parsing model: {col}')
    model_tokens = word_embeddings_df[col]
    w2v_dict[col] = word2vec.generate_embeddings(model_tokens)

2023-04-18 16:43:15,730 - INFO     | __main__   | Parsing model: bloom
2023-04-18 16:43:15,794 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-18 16:43:15,797 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-18 16:43:15,798 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-18 16:43:15,804 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-18 16:43:15,805 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-18 16:43:15,810 - INFO     | __main__   | Parsing model: llama_13b
2023-04-18 16:43:15,810 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-18 16:43:15,822 - INFO     | __main__   | Parsing model: gpt4all
2023-04-18 16:43:15,822 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...
2023-04-18 16:43:15,832 - INFO     | __main__   | Parsing model: llama_7b
2023-04-18 16:43:15,835 - INFO     

### GloVe Wikipedia

In [16]:
# Project-local word-embedding wrapper (`models.embeddings`). In the recorded run it loaded
# gensim's `glove-wiki-gigaword-300` vectors (400,000 words x 300 dims) from the local
# gensim-data cache, which took roughly 34 seconds.
wiki = WordEmbeddings(name='glove-wiki')

2023-04-18 16:43:16,248 - INFO     | models.embeddings | Initializing GLOVE-WIKI for Word Embeddings
2023-04-18 16:43:16,370 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
2023-04-18 16:43:50,235 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/lorenzo/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-04-18T16:43:50.235515', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [17]:
# GloVe-Wikipedia embeddings of each model's token lists, keyed by model column name.
wiki_dict = {}
for col in model_cols:
    logger.info(f'Parsing model: {col}')
    model_tokens = word_embeddings_df[col]
    wiki_dict[col] = wiki.generate_embeddings(model_tokens)

2023-04-18 16:43:50,252 - INFO     | __main__   | Parsing model: bloom
2023-04-18 16:43:50,253 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-18 16:43:50,257 - INFO     | __main__   | Parsing model: alpaca_3b
2023-04-18 16:43:50,257 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-18 16:43:50,263 - INFO     | __main__   | Parsing model: alpaca_770m
2023-04-18 16:43:50,263 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-18 16:43:50,269 - INFO     | __main__   | Parsing model: llama_13b
2023-04-18 16:43:50,269 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-18 16:43:50,280 - INFO     | __main__   | Parsing model: gpt4all
2023-04-18 16:43:50,280 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...
2023-04-18 16:43:50,287 - INFO     | __main__   | Parsing model: llama_7b
2023-04-18 16:43:50,287 - INFO     

# Clustering