In [1]:
import logging
import pandas as pd

# Logger named after the current module (`__name__`).
# NOTE(review): no cell visible in this notebook calls it — confirm it is still needed.
logger = logging.getLogger(__name__)

In [2]:
# Load the table of model generations from parquet. The one-row preview shows
# `id`, `full_text` and one generated-text column per model
# (bloom, alpaca_3b, alpaca_770m, llama_13b, gpt4all, llama_7b).
outputs = pd.read_parquet('outputs/0S_100T_all_models_202345.parquet')
# Show a single row to check the column layout.
outputs.head(1)

Unnamed: 0,id,full_text,bloom,alpaca_3b,alpaca_770m,llama_13b,gpt4all,llama_7b
0,1642025557511532545,"the white paws, the cute collar, the tongue, t...","Yes, the above tweet is a potential lead. The...","Yes, this tweet is a potential lead because it...","Yes, the tweet is a potential lead because it ...",1. What is your analysis of the tweet in terms...,"\n\nAnswer:\n\nYes, the above tweet is a poten...",Yes. The reason is that it has all the charac...


## Embeddings

Testing on the `llama_7b` outputs.

In [3]:
# Column of `outputs` whose generations are embedded by every cell below.
model_col = 'llama_7b'

### Sentence embeddings

Using sentence embeddings out of the box.

In [4]:
from models.embeddings import SentenceEmbeddings

2023-04-18 16:07:09,958 - INFO     | config     | Loading environment variables


In [5]:
# Sentence-embedding wrapper from models.embeddings. Per its log output,
# name='mpnet' loads the `all-mpnet-base-v2` SentenceTransformer (on CPU in this run).
mpnet = SentenceEmbeddings(name='mpnet')

2023-04-18 16:07:11,578 - INFO     | models.embeddings | Initializing MPNET for Sentence Embeddings
2023-04-18 16:07:11,578 - INFO     | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: all-mpnet-base-v2
2023-04-18 16:07:12,785 - INFO     | sentence_transformers.SentenceTransformer | Use pytorch device: cpu


In [6]:
# Raw generated texts of the selected model, passed to the encoder without preprocessing.
sentence_generations = outputs[model_col]
# Encode every text; the shape printed further down is (100, 768).
mpnet_embeddings = mpnet.generate_embeddings(input_texts=sentence_generations)

2023-04-18 16:07:12,806 - INFO     | models.embeddings | MPNET - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
# Report the MPNet result's dimensions: axis 0 is the number of texts,
# axis 1 is the embedding width.
print(f'Vector of size {mpnet_embeddings.shape}')
print(f'Examples: {mpnet_embeddings.shape[0]}')
print(f'Embeddings: {mpnet_embeddings.shape[1]}')

Vector of size (100, 768)
Examples: 100
Embeddings: 768


In [8]:
# Peek at the first 25 components of the first text's MPNet embedding.
mpnet_embeddings[0][:25]

array([ 0.04991404,  0.01245025, -0.03698337,  0.0051044 , -0.00996805,
        0.04320958, -0.00838856, -0.01815155,  0.00822093, -0.04142584,
        0.01140831, -0.04896618, -0.00486582,  0.09946115,  0.01967317,
        0.01221775,  0.00650522, -0.0036213 ,  0.06752104,  0.03149004,
        0.01869595,  0.04843361, -0.00307424,  0.02203452, -0.03022938],
      dtype=float32)

In [9]:
# Second sentence encoder. Per its log output, name='distilroberta' loads the
# `all-distilroberta-v1` SentenceTransformer.
# NOTE(review): the variable is spelled `distilrberta` (missing "o"); renaming it
# requires updating the cell that calls `distilrberta.generate_embeddings` as well.
distilrberta = SentenceEmbeddings(name='distilroberta')

2023-04-18 16:07:20,882 - INFO     | models.embeddings | Initializing DISTILROBERTA for Sentence Embeddings
2023-04-18 16:07:20,883 - INFO     | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: all-distilroberta-v1
2023-04-18 16:07:21,616 - INFO     | sentence_transformers.SentenceTransformer | Use pytorch device: cpu


In [10]:
# Same raw texts as used for MPNet; re-selected here so the cell is self-contained.
sentence_generations = outputs[model_col]
# Encode every text with DistilRoBERTa; the shape printed further down is (100, 768).
distil_embeddings = distilrberta.generate_embeddings(input_texts=sentence_generations)

2023-04-18 16:07:21,633 - INFO     | models.embeddings | DISTILROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
# Report the DistilRoBERTa result's dimensions: axis 0 is the number of texts,
# axis 1 is the embedding width.
print(f'Vector of size {distil_embeddings.shape}')
print(f'# Examples: {distil_embeddings.shape[0]}')
print(f'Embeddings size: {distil_embeddings.shape[1]}')

Vector of size (100, 768)
# Examples: 100
Embeddings size: 768


In [12]:
# Peek at the first 25 components of the first text's DistilRoBERTa embedding.
distil_embeddings[0][:25]

array([-0.03276022, -0.05949922, -0.01047634,  0.04310662,  0.02558028,
        0.11160419, -0.04925827,  0.03003629, -0.00140462,  0.00830909,
        0.01378632, -0.06636973, -0.01438876, -0.02891294, -0.04767688,
       -0.03492078, -0.01642984,  0.01148561,  0.01943938, -0.02421685,
       -0.01196974, -0.02823601, -0.02021303,  0.01467953, -0.01395709],
      dtype=float32)

### Word Embeddings

Tokenizing each output, looking up a word embedding for every token, and averaging those embeddings to create one vector per output.

In [13]:
import random
import nltk
import numpy as np

from nltk import word_tokenize
from nltk.corpus import stopwords

from models.embeddings import WordEmbeddings
from helpers.embeddings_helpers import clean_and_tokenize_text

# Fetch the NLTK resources used below: the stopword corpus backs
# `stopwords.words(...)`; "punkt" is presumably the tokenizer data needed by
# `word_tokenize` — TODO confirm.
nltk.download("stopwords")
nltk.download("punkt")

# Seed Python's and NumPy's global RNGs.
# NOTE(review): no visible cell draws random numbers — confirm what depends on the seed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# English stopwords as a set; handed to `clean_and_tokenize_text` in the tokenization cell.
STOPWORDS = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lorenzo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lorenzo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# Work on a separate frame so `outputs` keeps the raw generations; only the
# selected model column is cast to string.
word_embeddings_df = outputs.assign(**{model_col: outputs[model_col].astype('str')})

In [15]:
# Create word-level tokens for each model output.
# The input is read from the untouched `outputs` frame (cast to str) instead of
# from the column being overwritten, so re-running this cell never feeds token
# lists back into `clean_and_tokenize_text`.
word_embeddings_df[model_col] = (
    outputs[model_col]
    .astype('str')
    .map(lambda text: clean_and_tokenize_text(text, tokenizer=word_tokenize, stopwords=STOPWORDS))
)
# Show the tokenized column, then the full token list of the first output.
display(word_embeddings_df[model_col])
print(f'Tokens: {word_embeddings_df[model_col].iloc[0]}')

0     [yes, reason, characteristics, potential, lead...
1     [yes, potential, lead, shows, person, posted, ...
2     [yes, potential, lead, related, marketing, cus...
3     [▪️, yes, nike, made, several, acquisitions, y...
4     [potential, lead, contain, information, used, ...
                            ...                        
95    [tweet, potential, lead, information, person, ...
96    [yes, potential, lead, news, article, decline,...
97    [think, potential, lead, information, product,...
98    [tweet, potential, lead, posted, user, interes...
99    [would, say, related, topic, marketing, custom...
Name: llama_7b, Length: 100, dtype: object

Tokens: ['yes', 'reason', 'characteristics', 'potential', 'lead', 'right', 'combination', 'factors', 'cuteness', 'adorability', 'key', 'factors', 'marketing', 'customer', 'relationship', 'management', 'answer', 'question', 'based', 'context', 'context', 'marketing', 'customer', 'relationship', 'management', 'assistant', 'task', 'classify', 'given', 'tweet', 'either', 'potential', 'lead', 'provide', 'detailed', 'analysis', 'following', 'tweet', 'potential', 'lead', 'context', 'marketing', 'customer', 'relationship', 'management', 'tweet', 'white', 'paws', 'cute', 'collar', 'tongue', 'cute', 'red', 'eyeliner', 'adorable', 'hat', 'small', 'mullet', 'sweet', 'stare', 'pink', 'beans', 'roundness', 'body', 'question', 'tweet', 'potential', 'lead', 'yes', 'answer', 'yes', 'reason', 'characteristics', 'potential', 'lead', 'right', 'combination', 'factors', 'cuteness', 'adorability', 'key', 'factors', 'marketing', 'customer', 'relationship', 'management']


In [16]:
# Word-embedding wrapper from models.embeddings. Per its log output,
# name='glove-twitter' loads gensim's `glove-twitter-200` vectors
# (a 1193514 x 200 float32 matrix; loading took about 70 s in this run).
glove = WordEmbeddings(name='glove-twitter')

2023-04-18 16:07:25,963 - INFO     | models.embeddings | Initializing GLOVE-TWITTER for Word Embeddings
2023-04-18 16:07:26,210 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/glove-twitter-200/glove-twitter-200.gz
2023-04-18 16:08:36,184 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (1193514, 200) matrix of type float32 from /Users/lorenzo/gensim-data/glove-twitter-200/glove-twitter-200.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-04-18T16:08:36.184486', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [17]:
# Embed the token lists built above; the sizes printed further down are 100 results of length 200.
glove_embeddings = glove.generate_embeddings(word_embeddings_df[model_col])

2023-04-18 16:08:36,201 - INFO     | models.embeddings | GLOVE-TWITTER - Generating sentence embeddings...


In [18]:
# Report the GloVe-Twitter result's dimensions. Every figure is taken from
# `glove_embeddings` itself, so the example count cannot silently reflect a
# different model's result.
print(f'Vector of size ({len(glove_embeddings)}, {len(glove_embeddings[0])})')
print(f'# Examples: {len(glove_embeddings)}')
print(f'Embeddings size: {len(glove_embeddings[0])}')

Vector of size (100, 200)
# Examples: 100
Embeddings size: 200


In [19]:
# Peek at the first 25 components of the first output's GloVe-Twitter vector.
glove_embeddings[0][:25]

array([ 0.19305408, -0.09725341,  0.02407458,  0.02520164, -0.01836262,
        0.06644535,  0.56134343, -0.09498324, -0.14953393, -0.17344882,
       -0.06203576, -0.01623033, -0.5874654 , -0.03406139,  0.01704982,
        0.23174308, -0.054517  ,  0.11802665,  0.01858332, -0.14692163,
       -0.13663086,  0.13083495, -0.07296988, -0.07434546, -0.12882087],
      dtype=float32)

In [20]:
# Per its log output, name='w2v-google' loads gensim's `word2vec-google-news-300`
# vectors (a 3000000 x 300 float32 matrix). In this run the model was downloaded
# first, which took roughly ten minutes according to the log timestamps.
word2vec = WordEmbeddings(name='w2v-google')

2023-04-18 16:08:36,290 - INFO     | models.embeddings | Initializing W2V-GOOGLE for Word Embeddings




2023-04-18 16:18:39,512 - INFO     | gensim.downloader | word2vec-google-news-300 downloaded
2023-04-18 16:18:39,515 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2023-04-18 16:19:03,607 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /Users/lorenzo/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-04-18T16:19:03.607136', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [21]:
# Embed the same token lists with word2vec; the sizes printed further down are 100 results of length 300.
word2vec_embeddings = word2vec.generate_embeddings(word_embeddings_df[model_col])

2023-04-18 16:19:03,631 - INFO     | models.embeddings | W2V-GOOGLE - Generating sentence embeddings...


In [22]:
# Report the word2vec result's dimensions: number of outputs and the length of one vector.
print(f'Vector of size ({len(word2vec_embeddings)}, {len(word2vec_embeddings[0])})')
print(f'# Examples: {len(word2vec_embeddings)}')
print(f'Embeddings size: {len(word2vec_embeddings[0])}')

Vector of size (100, 300)
# Examples: 100
Embeddings size: 300


In [23]:
# Peek at the first 25 components of the first output's word2vec vector.
word2vec_embeddings[0][:25]

array([ 0.0322404 ,  0.03132523, -0.02603504,  0.05654375, -0.09165423,
        0.0203185 ,  0.1093035 , -0.11701238,  0.10635021,  0.04933522,
       -0.08406031, -0.07944631, -0.00459405,  0.03160654, -0.10748859,
        0.08056179, -0.01320772,  0.0772627 , -0.06542898, -0.08310859,
       -0.00585938, -0.01770516,  0.02913577,  0.04466567, -0.00073136],
      dtype=float32)

In [24]:
# Per its log output, name='glove-wiki' loads gensim's `glove-wiki-gigaword-300`
# vectors (a 400000 x 300 float32 matrix). In this run the model was downloaded
# first, which took roughly two minutes according to the log timestamps.
wiki = WordEmbeddings(name='glove-wiki')

2023-04-18 16:19:03,741 - INFO     | models.embeddings | Initializing GLOVE-WIKI for Word Embeddings




2023-04-18 16:21:16,410 - INFO     | gensim.downloader | glove-wiki-gigaword-300 downloaded
2023-04-18 16:21:16,413 - INFO     | gensim.models.keyedvectors | loading projection weights from /Users/lorenzo/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
2023-04-18 16:21:50,583 - INFO     | gensim.utils | KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/lorenzo/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-04-18T16:21:50.583536', 'gensim': '4.3.1', 'python': '3.9.13 (main, Mar  3 2023, 13:16:29) \n[Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-13.3.1-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [25]:
# Embed the same token lists with GloVe-Wiki; the sizes printed further down are 100 results of length 300.
wiki_embeddings = wiki.generate_embeddings(word_embeddings_df[model_col])

2023-04-18 16:21:50,601 - INFO     | models.embeddings | GLOVE-WIKI - Generating sentence embeddings...


In [26]:
# Report the GloVe-Wiki result's dimensions: number of outputs and the length of one vector.
print(f'Vector of size ({len(wiki_embeddings)}, {len(wiki_embeddings[0])})')
print(f'# Examples: {len(wiki_embeddings)}')
print(f'Embeddings size: {len(wiki_embeddings[0])}')

Vector of size (100, 300)
# Examples: 100
Embeddings size: 300


In [28]:
# Peek at the first 25 components of the first output's GloVe-Wiki vector.
wiki_embeddings[0][:25]

array([-0.12441022,  0.11495119, -0.02298809, -0.14069755,  0.05530136,
       -0.08360783, -0.08374323,  0.06676266,  0.01220673, -1.1837194 ,
        0.05716083,  0.06649845, -0.17764433,  0.03277824,  0.03594254,
       -0.01795638, -0.0316156 , -0.08364933, -0.06366695,  0.05146493,
       -0.07871727,  0.13870214,  0.10245264,  0.0136757 , -0.31698236],
      dtype=float32)