In [1]:
import logging
import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from models.generation import Model

from config import HUGGINGFACE_TOKEN, GPT4ALL_PATH

logger = logging.getLogger(__name__)

2023-04-04 17:19:08,933 - INFO     | config     | Loading environment variables


# Data

In [2]:
tweets = pd.read_parquet('data/eval_tweets_202342.parquet')
logger.info(f'{len(tweets)} tweets in generation set.')

2023-04-04 17:19:09,015 - INFO     | __main__   | 21711 tweets in generation set.


In [3]:
N_TWEETS = 100
logger.info(f'Generating from {N_TWEETS} tweets.')

2023-04-04 17:19:09,034 - INFO     | __main__   | Generating from 100 tweets.


In [4]:
output_tweets = tweets.iloc[:N_TWEETS].copy()
output_tweets.drop(columns=['created_at', 'entities'], inplace=True)

# Prompt Engineering

In [5]:
PROMPT_TEMPLATE = """Answer the question based on the context below. \
    Context: You are a marketing and customer relationship management assistant, \
    your task is to classify a given tweet as either a \
    potential lead or not. Provide your detailed analysis of the following tweet \
    as a potential lead in the context of marketing and customer relationship management. \
    Tweet: {tweet} \
    Question: Is the above tweet a potential lead? Yes or No? Why?. \
    Answer: """

# Models

todo

### BLOOM

In [6]:
bloom = Model(
    model_name='bloom',
    hf_api=HUGGINGFACE_TOKEN,
    gpt4all_path=GPT4ALL_PATH
)

2023-04-04 17:19:09,112 - INFO     | models.generation | 
Initializing BLOOM model  - Temp: 1e-10 - Context window: 2048 - Max tokens: 256


In [7]:
bloom.init_prompt(template=PROMPT_TEMPLATE, input_vars=['tweet'])

2023-04-04 17:19:09,859 - INFO     | models.generation | Injecting Variables: ['tweet']


'Answer the question based on the context below.     Context: You are a marketing and customer relationship management assistant,     your task is to classify a given tweet as either a     potential lead or not. Provide your analysis of the following social media post (tweet)     as a potential lead in the context of marketing and customer relationship management.     Tweet: {tweet}     Question: Is the above tweet a potential lead? Yes or No? Why?.     Answer: '

In [8]:
logger.info('Starting BLOOM generation...')
bloom_outs = []
for i, tweet in enumerate(tqdm(output_tweets['full_text'])):
    bloom_llm, bloom_out = bloom.generate(inject_obj=tweet)
    bloom_outs.append(bloom_out)
    time.sleep(2)
    if i % 50:
        time.sleep(120)

2023-04-04 17:19:09,888 - INFO     | __main__   | Starting BLOOM generation...


  0%|          | 0/100 [00:00<?, ?it/s]

In [9]:
output_tweets['bloom_out'] = np.array(bloom_outs)

### GPT4All

In [10]:
gpt4all = Model(
    model_name='gpt4all',
    n_threads=6,
    hf_api=HUGGINGFACE_TOKEN,
    gpt4all_path=GPT4ALL_PATH
)

2023-04-04 17:25:10,396 - INFO     | models.generation | 
Initializing GPT4ALL model  - Temp: 1e-10 - Context window: 2048 - Max tokens: 256
llama_model_load: loading model from '/Users/lorenzo/Documents/repos/msc-thesis-llm-clustering/.models/gpt4all-7B/gpt4all-converted.bin' - please wait ...
llama_model_load: n_vocab = 32001
llama_model_load: n_ctx   = 2048
llama_model_load: n_embd  = 4096
llama_model_load: n_mult  = 256
llama_model_load: n_head  = 32
llama_model_load: n_layer = 32
llama_model_load: n_rot   = 128
llama_model_load: f16     = 2
llama_model_load: n_ff    = 11008
llama_model_load: n_parts = 1
llama_model_load: type    = 1
llama_model_load: ggml map size = 4017.70 MB
llama_model_load: ggml ctx size =  81.25 KB
llama_model_load: mem required  = 5809.78 MB (+ 2052.00 MB per state)
llama_model_load: loading tensors from '/Users/lorenzo/Documents/repos/msc-thesis-llm-clustering/.models/gpt4all-7B/gpt4all-converted.bin'
llama_model_load: model size =  4017.27 MB / num tensors

In [11]:
gpt4all.init_prompt(template=PROMPT_TEMPLATE, input_vars=['tweet'])

2023-04-04 17:25:10,958 - INFO     | models.generation | Injecting Variables: ['tweet']


'Answer the question based on the context below.     Context: You are a marketing and customer relationship management assistant,     your task is to classify a given tweet as either a     potential lead or not. Provide your analysis of the following social media post (tweet)     as a potential lead in the context of marketing and customer relationship management.     Tweet: {tweet}     Question: Is the above tweet a potential lead? Yes or No? Why?.     Answer: '

In [12]:
logger.info('Starting GPT4All generation...')
gpt4all_outs = []
for i, tweet in enumerate(tqdm(output_tweets['full_text'])):
    gpt4all_llm, gpt4all_out = gpt4all.generate(tweet)
    gpt4all_outs.append(gpt4all_out)
    time.sleep(2)
    if i % 50:
        time.sleep(120)

2023-04-04 17:25:10,981 - INFO     | __main__   | Starting GPT4All generation...


  0%|          | 0/100 [00:00<?, ?it/s]

In [13]:
output_tweets['gpt4all_out'] = np.array(gpt4all_outs)

In [25]:
from helpers.data_helpers import save_to_parquet
save_to_parquet(data_dir='outputs', df=output_tweets, name='output_tweets')

2023-04-04 17:54:35,683 - INFO     | helpers.data_helpers | output_tweets.parquet saved.
