In [2]:
import logging
import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from models.bloom import Bloom
#from models.gpt4all import GPT4ALL

from config import HUGGINGFACE_TOKEN

logger = logging.getLogger(__name__)

2023-04-04 15:28:06,850 - INFO     | config     | Loading environment variables


In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Data

In [4]:
tweets = pd.read_parquet('data/eval_tweets_202342.parquet')
logger.info(f'{len(tweets)} tweets in generation set.')

2023-04-04 15:28:12,100 - INFO     | __main__   | 21711 tweets in generation set.


In [5]:
N_TWEETS = 5
logger.info(f'Generating from {N_TWEETS} tweets.')

2023-04-04 15:28:17,746 - INFO     | __main__   | Generating from 5 tweets.


In [6]:
output_tweets = tweets.iloc[:N_TWEETS].copy()
output_tweets.drop(columns=['created_at', 'entities'], inplace=True)

# Prompt Engineering

In [7]:
PROMPT_TEMPLATE = """Answer the question based on the context below. \
    Context: You are a marketing and customer relationship management assistant, \
    your task is to classify a given tweet as either a \
    potential lead or not. Provide your analysis of the following social media post (tweet) \
    as a potential lead in the context of marketing and customer relationship management. \
    Consider the following factors in your analysis, but feel free to use additional \
    factors as well: \
    - Mentions of product or service offerings, calls to action, inquiries about pricing \
    or availability \
    - Keywords or phrases commonly associated with potential leads in the context of \
    marketing and CRM \
    - The tone and sentiment of the tweet \
    - The author's profile and engagement history on social media \
    - Any relevant contextual factors, such as recent product launches, industry events or \
    trends, or competitor activity. \
    Tweet: {tweet} \
    Question: Is the above tweet a potential lead? Yes or No? Why?. \
    Answer: """

# Models

todo

### BLOOM

In [8]:
bloom = Bloom(hf_api=HUGGINGFACE_TOKEN, temp=1e-10, max_length=256)
bloom.init_prompt(template=PROMPT_TEMPLATE, input_vars=['tweet'])
bloom.count_prompt_tokens()

2023-04-04 15:28:25,351 - INFO     | models.bloom | 
Initializing BLOOM model - Temp: 1e-10 - Context window: 2048 - Max Length: 256
2023-04-04 15:28:26,269 - INFO     | models.bloom | Injecting Variables: ['tweet']
2023-04-04 15:28:26,270 - INFO     | models.bloom | Initializing tokenizer
2023-04-04 15:28:28,700 - INFO     | models.bloom | Prompt tokens len: 252


252

In [10]:
logger.info('Starting BLOOM generation...')
bloom_outs = []
for i, tweet in enumerate(tqdm(output_tweets['full_text'])):
    llm, bloom_out = bloom.generate(inject_obj=tweet)
    bloom_outs.append(bloom_out)
    time.sleep(2)

2023-04-04 15:31:57,977 - INFO     | __main__   | Starting BLOOM generation...


  0%|          | 0/5 [00:00<?, ?it/s]

In [11]:
output_tweets['bloom_out'] = np.array(bloom_outs)

### GPT4All

In [9]:
gpt4all = GPT4ALL(ctx_size=512, n_predict=256)
gpt4all.load_model()

2023-04-03 19:14:50,926 - INFO     | models.gpt4all | 
Initializing GPT4All model - Temp: 1e-10 - Context window: 512 - Threads: 4
2023-04-03 19:14:50,931 - INFO     | models.gpt4all | Loading model...
llama_model_load: ggml ctx size = 4529.35 MB
llama_model_load: memory_size =   512.00 MB, n_mem = 16384
llama_model_load: loading model part 1/1 from '/Users/lorenzo/.nomic/gpt4all-lora-quantized.bin'
llama_model_load: .................................... done
llama_model_load: model size =  4017.27 MB / num tensors = 291


In [10]:
logger.info('Starting GPT4All generation...')
gpt4all_outs = []
for i, tweet in enumerate(tqdm(output_tweets['full_text'])):
    gpt4all.init_prompt(PROMPT_TEMPLATE.format(tweet=tweet))
    gpt4all_out = gpt4all.generate()
    gpt4all_outs.append(gpt4all_out)
    if i % 5 == 0:
        gpt4all.load_model()
    time.sleep(2)

2023-04-03 19:14:55,134 - INFO     | __main__   | Starting GPT4All generation...


  0%|          | 0/100 [00:00<?, ?it/s]

2023-04-03 19:15:06,590 - INFO     | models.gpt4all | Loading model...
llama_model_load: ggml ctx size = 4529.35 MB
llama_model_load: memory_size =   512.00 MB, n_mem = 16384
llama_model_load: loading model part 1/1 from '/Users/lorenzo/.nomic/gpt4all-lora-quantized.bin'
llama_model_load: .................................... done
llama_model_load: model size =  4017.27 MB / num tensors = 291


KeyboardInterrupt: 

In [11]:
gpt4all_outs

['1. The tweet was posted by an individual who has never interacted with any brand before but seems interested in their products/services based on what they have said about them online, and is also active on social media platforms such as Instagram & Twitter. Based upon this information provided, would you classify the given tweeit as a potential lead?']