In [2]:
import logging
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from models.generation import Model

from config import HUGGINGFACE_TOKEN, OPENAI_API_KEY, LLAMA_7B_PATH, GPT4ALL_PATH

from helpers.data_helpers import save_to_parquet
from helpers.generation_helpers import generation_loop

logger = logging.getLogger(__name__)

2023-06-26 08:16:26,883 - INFO     | config     | Loading environment variables


In [3]:
# Global Flags
SLOW_COOLDOWN = 120
FAST_COOLDOWN = 10

N_TWEETS = 1000

# DATA Import

In [4]:
tweets = pd.read_parquet('data/eval_tweets_202342.parquet')
logger.info(f'{len(tweets)} tweets in generation set.')
logger.info(f'Generating from {N_TWEETS} tweets.')

2023-06-25 20:08:16,012 - INFO     | __main__   | 21711 tweets in generation set.
2023-06-25 20:08:16,013 - INFO     | __main__   | Generating from 1000 tweets.


In [5]:
# Select tweets
output_tweets = tweets.iloc[:N_TWEETS].copy()
output_tweets.drop(columns=['created_at', 'entities'], inplace=True)

In [6]:
output_tweets

Unnamed: 0,id,full_text
0,1642025557511532545,"the white paws, the cute collar, the tongue, t..."
1,1642522139130486787,city boy. see those sneakers abeg. the way he ...
2,1642165846842966016,put it on a flame/heat safe surface and burn/...
3,1642505628181569538,7| acquisitions: nike has made several acquisi...
4,1642367629020266496,before: bilas air hangat aftercare: moisturizer
...,...,...
995,1641256201404186624,flowers themselves are $15-$25 and above.
996,1642436855739154436,i don't know much about shea butter but i kno...
997,1642545614427914244,thank you maybelline #maybellinexbini #bini_f...
998,1642342056637915141,super excited to finally get this out. the mor...


# Prompt Engineering

In [5]:
PROMPT_TEMPLATE = """Answer the question based on the context below. \
    Context: You are a marketing and customer relationship management assistant, \
    your task is to classify a given tweet as either a \
    potential lead or not. Provide your detailed analysis of the following tweet \
    as a potential lead in the context of marketing and customer relationship management. 
    Answer with less than 100 words. \
    Tweet: "{tweet}" \
    Question: Is the above tweet a potential lead? Yes or No? Why?.
    Answer: """

# Models

In [6]:
# OpenAI Model
openai = Model(model_name='openai',
               openai_api=OPENAI_API_KEY,
               openai_model='gpt-3.5-turbo',
               max_tokens=500)

# injecting prompts
openai.init_prompt(template=PROMPT_TEMPLATE,
                   input_vars=['tweet'])

# generation
output_tweets = generation_loop(model=openai,
                           model_col='gpt-3.5-turbo',
                           n=N_TWEETS,
                           tweets=output_tweets,
                           fast_cool=FAST_COOLDOWN,
                           slow_cool=SLOW_COOLDOWN,
                           out_dir='outputs',
                           out_name='200T')

2023-06-17 14:26:19,836 - INFO     | models.generation | 
Initializing OPENAI model  - Temp: 1e-10 - Context window: 2048 - Max tokens: 500
2023-06-17 14:26:19,847 - INFO     | helpers.generation_helpers | Starting OPENAI generation...


  0%|          | 0/1000 [00:00<?, ?it/s]

2023-06-17 14:29:23,784 - INFO     | openai     | error_code=None error_message='The server had an error while processing your request. Sorry about that!' error_param=None error_type=server_error message='OpenAI API error received' stream_error=False
2023-06-17 14:38:15,013 - INFO     | helpers.generation_helpers | Step: 50 - Saving checkpoint and cooldown for 2.0m...
2023-06-17 14:38:15,018 - INFO     | helpers.data_helpers | 200T.parquet saved.
2023-06-17 14:44:35,263 - INFO     | openai     | error_code=None error_message='That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 4fba65ae2c365291e58dfeb7d454a372 in your message.)' error_param=None error_type=server_error message='OpenAI API error received' stream_error=False
2023-06-17 14:49:56,441 - INFO     | helpers.generation_helpers | Step: 100 - Saving checkpoint and cooldown for 2.0m...
2023

In [7]:
# Alpaca Model from Stanford, based on LLAMA
alpaca = Model(model_name='alpaca-770M',
               hf_api=HUGGINGFACE_TOKEN,
               hf_repo='declare-lab/flan-alpaca-large',
               max_tokens=500)

# injecting prompt
alpaca.init_prompt(template=PROMPT_TEMPLATE,
                   input_vars=['tweet'])

# generation
output_tweets = generation_loop(model=alpaca,
                                model_col='alpaca',
                                n=N_TWEETS,
                                tweets=output_tweets,
                                fast_cool=FAST_COOLDOWN,
                                slow_cool=SLOW_COOLDOWN,
                                out_dir='outputs',
                                out_name='200T')

2023-06-17 18:30:16,907 - INFO     | models.generation | 
Initializing ALPACA-770M model  - Temp: 1e-10 - Context window: 2048 - Max tokens: 500
2023-06-17 18:30:18,304 - INFO     | helpers.generation_helpers | Starting ALPACA-770M generation...


  0%|          | 0/1000 [00:00<?, ?it/s]

2023-06-17 18:39:06,487 - INFO     | helpers.generation_helpers | Step: 50 - Saving checkpoint and cooldown for 2.0m...
2023-06-17 18:39:06,507 - INFO     | helpers.data_helpers | 200T.parquet saved.
2023-06-17 18:47:53,303 - INFO     | helpers.generation_helpers | Step: 100 - Saving checkpoint and cooldown for 2.0m...
2023-06-17 18:47:53,328 - INFO     | helpers.data_helpers | 200T.parquet saved.
2023-06-17 18:58:54,265 - INFO     | helpers.generation_helpers | Step: 150 - Saving checkpoint and cooldown for 2.0m...
2023-06-17 18:58:54,289 - INFO     | helpers.data_helpers | 200T.parquet saved.
2023-06-17 19:13:12,072 - INFO     | helpers.generation_helpers | Step: 200 - Saving checkpoint and cooldown for 2.0m...
2023-06-17 19:13:12,089 - INFO     | helpers.data_helpers | 200T.parquet saved.
2023-06-17 19:27:57,822 - INFO     | helpers.generation_helpers | Step: 250 - Saving checkpoint and cooldown for 2.0m...
2023-06-17 19:27:57,846 - INFO     | helpers.data_helpers | 200T.parquet sav

KeyboardInterrupt: 

In [None]:
# Base LLAMA Model
llama = Model(model_name='llama',
              n_threads=6,
              local_model_path=LLAMA_7B_PATH,
              max_tokens=500)

# injecting prompt
llama.init_prompt(template=PROMPT_TEMPLATE,
                  input_vars=['tweet'])

# generation
output_tweets = generation_loop(model=llama,
                                model_col='llama',
                                n=N_TWEETS,
                                tweets=output_tweets,
                                fast_cool=FAST_COOLDOWN,
                                slow_cool=SLOW_COOLDOWN,
                                out_dir='outputs',
                                out_name='200T')

In [None]:
# GPT4ALL, Fine-Tuned with OpenAI's model Q&As
gpt4all = Model(model_name='gpt4all',
                n_threads=6,
                local_model_path=GPT4ALL_PATH,
                max_tokens=500)

# injecting prompt
gpt4all.init_prompt(template=PROMPT_TEMPLATE,
                    input_vars=['tweet'])

# generation
output_tweets = generation_loop(model=gpt4all,
                                model_col='gpt4all',
                                n=N_TWEETS,
                                tweets=output_tweets,
                                fast_cool=FAST_COOLDOWN,
                                slow_cool=SLOW_COOLDOWN,
                                out_dir='outputs',
                                out_name='200T')

# Embeddings

In [5]:
# Load tweets and generations
df = pd.read_parquet('outputs/1000T_2023617.parquet')

# Select model columns
model_cols = df.columns[2:]
model_cols

Index(['gpt-3.5-turbo', 'alpaca'], dtype='object')

In [6]:
from models.embeddings import SentenceEmbeddings

# Using distil-roberta for the sentence embeddings
distilrberta = SentenceEmbeddings(name='distil-roberta')

2023-06-26 08:18:22,990 - INFO     | models.embeddings | Initializing DISTIL-ROBERTA for Sentence Embeddings
2023-06-26 08:18:22,991 - INFO     | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: all-distilroberta-v1
2023-06-26 08:18:23,864 - INFO     | sentence_transformers.SentenceTransformer | Use pytorch device: cpu


In [7]:
# generating embeddings
distil_dict = {}
for col in model_cols:
    logger.info(f'Parsing model: {col}')
    distil_dict[col] = distilrberta.generate_embeddings(input_texts=df[col])

2023-06-26 08:18:27,090 - INFO     | __main__   | Parsing model: gpt-3.5-turbo
2023-06-26 08:18:27,090 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

2023-06-26 08:18:39,467 - INFO     | __main__   | Parsing model: alpaca
2023-06-26 08:18:39,468 - INFO     | models.embeddings | DISTIL-ROBERTA - Generating sentence embeddings...


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [23]:
# insert embeddings in dataframe
for col in model_cols:
    embeddings_col = col + '_embeddings'
    temp = pd.DataFrame({embeddings_col: distil_dict[col].tolist()})
    df[embeddings_col] = temp[embeddings_col].copy()

df.head()

Unnamed: 0,id,full_text,gpt-3.5-turbo,alpaca,gpt-3.5-turbo_embeddings,alpaca_embeddings
0,1642025557511532545,"the white paws, the cute collar, the tongue, t...","No, the above tweet is not a potential lead. T...","Yes, the tweet is a potential lead because it ...","[-0.060498107224702835, -0.06838816404342651, ...","[-0.02545035257935524, -0.11641273647546768, -..."
1,1642522139130486787,city boy. see those sneakers abeg. the way he ...,"No, the above tweet is not a potential lead. T...","Yes, the tweet is a potential lead because it ...","[-0.0260239876806736, -0.08790598064661026, -0...","[-0.02217881567776203, -0.10774783790111542, -..."
2,1642165846842966016,put it on a flame/heat safe surface and burn/...,"No, the above tweet is not a potential lead. I...","Yes, this tweet is a potential lead because it...","[-0.028097709640860558, -0.09122901409864426, ...","[-0.032287366688251495, -0.08707711845636368, ..."
3,1642505628181569538,7| acquisitions: nike has made several acquisi...,"No, the above tweet is not a potential lead. W...","Yes, the tweet is a potential lead because it ...","[-0.012716451659798622, -0.10650305449962616, ...","[-0.007112530060112476, -0.09131787717342377, ..."
4,1642367629020266496,before: bilas air hangat aftercare: moisturizer,"No, the above tweet is not a potential lead in...","Yes, this tweet is a potential lead because it...","[-0.029857849702239037, -0.1002630740404129, -...","[-0.02748551405966282, -0.11469537019729614, -..."


In [25]:
# save full dataframe with embeddings and embeddings also separately
save_to_parquet(data_dir='.', df=df, name='full_data')

import pickle

with open(f'embeddings/1000T_embeddings_2023617.pkl', 'wb') as f:
    pickle.dump(distil_dict, f)

2023-06-26 08:33:14,478 - INFO     | helpers.data_helpers | full_data.parquet saved.


# Clustering