In [1]:
import logging
import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from models.generation import Model

from config import HUGGINGFACE_TOKEN, GPT4ALL_PATH, LLAMA_7B_PATH, LLAMA_13B_PATH

from helpers.data_helpers import save_to_parquet
from helpers.generation_helpers import generation_loop

# Notebook-level logger; records emitted from these cells are tagged with the
# current module name (the recorded outputs show it as `__main__`).
logger = logging.getLogger(__name__)

2023-04-05 09:59:04,385 - INFO     | config     | Loading environment variables


In [22]:
# Cooldown pauses, in seconds, handed to generation_loop:
#   FAST_COOLDOWN - short pause taken on every iteration
#   SLOW_COOLDOWN - long pause taken every 50 iterations
FAST_COOLDOWN = 5
SLOW_COOLDOWN = 2 * 60

# Data

In [25]:
# Load the evaluation tweets from a parquet file addressed relative to the
# notebook's working directory, then log how many rows were read.
tweets = pd.read_parquet('data/eval_tweets_202342.parquet')
logger.info(f'{len(tweets)} tweets in generation set.')

2023-04-04 23:11:17,747 - INFO     | __main__   | 21711 tweets in generation set.


In [26]:
# Number of tweets, taken from the top of `tweets`, that every model is run on.
N_TWEETS = 100
logger.info(f'Generating from {N_TWEETS} tweets.')

2023-04-04 23:11:18,222 - INFO     | __main__   | Generating from 100 tweets.


In [27]:
# Working frame for the generation runs: the first N_TWEETS rows of `tweets`
# with the 'created_at' and 'entities' columns removed. Built as one chained
# expression ending in .copy(), so `tweets` itself is never modified and
# re-running the cell always rebuilds the frame from scratch.
output_tweets = (
    tweets
    .iloc[:N_TWEETS]
    .drop(columns=['created_at', 'entities'])
    .copy()
)

# Prompt Engineering

In [6]:
# Prompt shared by every model; `{tweet}` is the only placeholder.
# The segments are separated by exactly five spaces. That spacing is what the
# previous backslash-continued triple-quoted literal produced (one trailing
# space plus four spaces of indentation), and it is kept as-is so prompts stay
# identical to the ones used for the already-generated outputs.
# NOTE(review): the five-space gaps and the "Why?." punctuation look
# accidental - confirm before normalising, since that changes the prompt.
PROMPT_TEMPLATE = (' ' * 5).join([
    'Answer the question based on the context below.',
    'Context: You are a marketing and customer relationship management assistant,',
    'your task is to classify a given tweet as either a',
    'potential lead or not. Provide your detailed analysis of the following tweet',
    'as a potential lead in the context of marketing and customer relationship management.',
    'Tweet: {tweet}',
    'Question: Is the above tweet a potential lead? Yes or No? Why?.',
    'Answer: ',
])

# Models

**TODO:** describe the models compared in the sections below.

### BLOOM

In [29]:
# BLOOM wrapper, given the Hugging Face token imported from config.
# No sampling settings are passed here, so whatever Model defaults to applies.
bloom = Model(model_name='bloom', hf_api=HUGGINGFACE_TOKEN)

2023-04-04 23:11:20,687 - INFO     | models.generation | 
Initializing BLOOM model  - Temp: 1e-10 - Context window: 2048 - Max tokens: 256


In [30]:
# Register the shared prompt with BLOOM, declaring `tweet` as its only input
# variable. Kept as the cell's last expression so the notebook displays the
# value the call returns.
bloom.init_prompt(template=PROMPT_TEMPLATE, input_vars=['tweet'])

2023-04-04 23:11:21,405 - INFO     | models.generation | Injecting Variables: ['tweet']


'Answer the question based on the context below.     Context: You are a marketing and customer relationship management assistant,     your task is to classify a given tweet as either a     potential lead or not. Provide your detailed analysis of the following tweet     as a potential lead in the context of marketing and customer relationship management.     Tweet: {tweet}     Question: Is the above tweet a potential lead? Yes or No? Why?.     Answer: '

In [31]:
# Run BLOOM over the working frame and rebind `output_tweets` to the result.
# The recorded log shows a checkpoint parquet written under `out_name` at
# steps 50 and 100, each followed by the slow cooldown.
# NOTE(review): `out_name` is shared by every model run in this notebook and
# says "200T" while N_TWEETS is 100 - confirm this is intended.
output_tweets = generation_loop(
    tweets=output_tweets,
    n=N_TWEETS,
    model=bloom,
    model_col='bloom',
    out_dir='outputs',
    out_name='200T_bloom_alpaca3b_alpaca_11b',
    slow_cool=SLOW_COOLDOWN,
    fast_cool=FAST_COOLDOWN,
)

2023-04-04 23:11:22,066 - INFO     | helpers.generation_helpers | Starting BLOOM generation...


  0%|          | 0/100 [00:00<?, ?it/s]

2023-04-04 23:15:58,932 - INFO     | helpers.generation_helpers | Step: 50 - Saving checkpoint and cooldown for 2.0m...
2023-04-04 23:15:58,945 - INFO     | helpers.data_helpers | 200T_bloom_alpaca3b_alpaca_11b.parquet saved.
2023-04-04 23:20:42,093 - INFO     | helpers.generation_helpers | Step: 100 - Saving checkpoint and cooldown for 2.0m...
2023-04-04 23:20:42,115 - INFO     | helpers.data_helpers | 200T_bloom_alpaca3b_alpaca_11b.parquet saved.
2023-04-04 23:20:42,123 - INFO     | helpers.data_helpers | 200T_bloom_alpaca3b_alpaca_11b.parquet saved.


### Alpaca Models

#### Alpaca 770M

In [None]:
# Alpaca 770M wrapper: points Model at the 'declare-lab/flan-alpaca-large'
# Hugging Face repo and passes the token imported from config.
alpaca_770m = Model(
    hf_api=HUGGINGFACE_TOKEN,
    hf_repo='declare-lab/flan-alpaca-large',
    model_name='alpaca-770m',
)

2023-04-05 10:06:37,274 - INFO     | models.generation | 
Initializing ALPACA-770M model  - Temp: 1e-10 - Context window: 2048 - Max tokens: 256


In [None]:
# Register the shared prompt with Alpaca 770M, declaring `tweet` as its only
# input variable. Kept as the cell's last expression so the notebook displays
# the value the call returns.
alpaca_770m.init_prompt(template=PROMPT_TEMPLATE, input_vars=['tweet'])

2023-04-05 10:06:37,934 - INFO     | models.generation | Injecting Variables: ['tweet']


'Answer the question based on the context below.     Context: You are a marketing and customer relationship management assistant,     your task is to classify a given tweet as either a     potential lead or not. Provide your detailed analysis of the following tweet     as a potential lead in the context of marketing and customer relationship management.     Tweet: {tweet}     Question: Is the above tweet a potential lead? Yes or No? Why?.     Answer: '

In [None]:
# Run Alpaca 770M over the working frame and rebind `output_tweets` to the
# result. The recorded log shows a checkpoint parquet written under
# `out_name` at steps 50 and 100.
# NOTE(review): same `out_name` as the other model runs - confirm the shared
# file is meant to be overwritten by each run.
output_tweets = generation_loop(
    tweets=output_tweets,
    n=N_TWEETS,
    model=alpaca_770m,
    model_col='alpaca_770m',
    out_dir='outputs',
    out_name='200T_bloom_alpaca3b_alpaca_11b',
    slow_cool=SLOW_COOLDOWN,
    fast_cool=FAST_COOLDOWN,
)

2023-04-05 10:06:37,993 - INFO     | helpers.generation_helpers | Starting ALPACA-770M generation...


  0%|          | 0/100 [00:00<?, ?it/s]

2023-04-05 10:21:11,091 - INFO     | helpers.generation_helpers | Step: 50 - Saving checkpoint and cooldown for 2.0m...
2023-04-05 10:21:11,106 - INFO     | helpers.data_helpers | 200T_bloom_alpaca3b_alpaca_11b.parquet saved.
2023-04-05 10:35:31,468 - INFO     | helpers.generation_helpers | Step: 100 - Saving checkpoint and cooldown for 2.0m...
2023-04-05 10:35:31,491 - INFO     | helpers.data_helpers | 200T_bloom_alpaca3b_alpaca_11b.parquet saved.


#### Alpaca 3B

In [32]:
# Alpaca 3B wrapper: points Model at the 'declare-lab/flan-alpaca-xl'
# Hugging Face repo and passes the token imported from config.
alpaca_3b = Model(
    hf_api=HUGGINGFACE_TOKEN,
    hf_repo='declare-lab/flan-alpaca-xl',
    model_name='alpaca-3b',
)

2023-04-04 23:20:42,221 - INFO     | models.generation | 
Initializing ALPACA-3B model  - Temp: 1e-10 - Context window: 2048 - Max tokens: 256


In [33]:
# Register the shared prompt with Alpaca 3B, declaring `tweet` as its only
# input variable. Kept as the cell's last expression so the notebook displays
# the value the call returns.
alpaca_3b.init_prompt(template=PROMPT_TEMPLATE, input_vars=['tweet'])

2023-04-04 23:20:42,938 - INFO     | models.generation | Injecting Variables: ['tweet']


'Answer the question based on the context below.     Context: You are a marketing and customer relationship management assistant,     your task is to classify a given tweet as either a     potential lead or not. Provide your detailed analysis of the following tweet     as a potential lead in the context of marketing and customer relationship management.     Tweet: {tweet}     Question: Is the above tweet a potential lead? Yes or No? Why?.     Answer: '

In [34]:
# Run Alpaca 3B over the working frame and rebind `output_tweets` to the
# result. The recorded log shows a checkpoint parquet written under
# `out_name` at steps 50 and 100.
# NOTE(review): same `out_name` as the other model runs - confirm the shared
# file is meant to be overwritten by each run.
output_tweets = generation_loop(
    tweets=output_tweets,
    n=N_TWEETS,
    model=alpaca_3b,
    model_col='alpaca_3b',
    out_dir='outputs',
    out_name='200T_bloom_alpaca3b_alpaca_11b',
    slow_cool=SLOW_COOLDOWN,
    fast_cool=FAST_COOLDOWN,
)

2023-04-04 23:20:42,972 - INFO     | helpers.generation_helpers | Starting ALPACA-3B generation...


  0%|          | 0/100 [00:00<?, ?it/s]

2023-04-04 23:53:32,946 - INFO     | helpers.generation_helpers | Step: 50 - Saving checkpoint and cooldown for 2.0m...
2023-04-04 23:53:32,963 - INFO     | helpers.data_helpers | 200T_bloom_alpaca3b_alpaca_11b.parquet saved.
2023-04-05 00:32:41,856 - INFO     | helpers.generation_helpers | Step: 100 - Saving checkpoint and cooldown for 2.0m...
2023-04-05 00:32:41,876 - INFO     | helpers.data_helpers | 200T_bloom_alpaca3b_alpaca_11b.parquet saved.
2023-04-05 00:32:41,882 - INFO     | helpers.data_helpers | 200T_bloom_alpaca3b_alpaca_11b.parquet saved.


### GPT4All

In [15]:
# GPT4All wrapper: loads local weights from GPT4ALL_PATH (imported from
# config) and asks for 6 threads.
gpt4all = Model(
    model_name='gpt4all',
    local_model_path=GPT4ALL_PATH,
    n_threads=6,
)

2023-04-04 19:26:22,097 - INFO     | models.generation | 
Initializing GPT4ALL model  - Temp: 1e-10 - Context window: 2048 - Max tokens: 256
llama_model_load: loading model from '.models/gpt4all-7B/gpt4all-converted.bin' - please wait ...
llama_model_load: n_vocab = 32001
llama_model_load: n_ctx   = 2048
llama_model_load: n_embd  = 4096
llama_model_load: n_mult  = 256
llama_model_load: n_head  = 32
llama_model_load: n_layer = 32
llama_model_load: n_rot   = 128
llama_model_load: f16     = 2
llama_model_load: n_ff    = 11008
llama_model_load: n_parts = 1
llama_model_load: type    = 1
llama_model_load: ggml map size = 4017.70 MB
llama_model_load: ggml ctx size =  81.25 KB
llama_model_load: mem required  = 5809.78 MB (+ 2052.00 MB per state)
llama_model_load: loading tensors from '.models/gpt4all-7B/gpt4all-converted.bin'
llama_model_load: model size =  4017.27 MB / num tensors = 291
llama_init_from_file: kv self size  = 2048.00 MB


In [16]:
# Register the shared prompt with GPT4All, declaring `tweet` as its only
# input variable. Kept as the cell's last expression so the notebook displays
# the value the call returns.
gpt4all.init_prompt(template=PROMPT_TEMPLATE, input_vars=['tweet'])

2023-04-04 19:26:26,005 - INFO     | models.generation | Injecting Variables: ['tweet']


'Answer the question based on the context below.     Context: You are a marketing and customer relationship management assistant,     your task is to classify a given tweet as either a     potential lead or not. Provide your detailed analysis of the following tweet     as a potential lead in the context of marketing and customer relationship management.     Tweet: {tweet}     Question: Is the above tweet a potential lead? Yes or No? Why?.     Answer: '

In [17]:
# Run GPT4All over the working frame and rebind `output_tweets` to the result.
# NOTE(review): the output stored under this cell reports saves to
# 'output_tweets.parquet' from `__main__`, which does not match this call -
# it appears to come from an earlier version of the loop; re-run to refresh.
# NOTE(review): same `out_name` as the other model runs - confirm the shared
# file is meant to be overwritten by each run.
output_tweets = generation_loop(
    tweets=output_tweets,
    n=N_TWEETS,
    model=gpt4all,
    model_col='gpt4all',
    out_dir='outputs',
    out_name='200T_bloom_alpaca3b_alpaca_11b',
    slow_cool=SLOW_COOLDOWN,
    fast_cool=FAST_COOLDOWN,
)

2023-04-04 19:27:00,097 - INFO     | __main__   | Starting GPT4All generation...


  0%|          | 0/100 [00:00<?, ?it/s]

2023-04-04 19:41:47,947 - INFO     | helpers.data_helpers | output_tweets.parquet saved.
2023-04-04 19:57:23,399 - INFO     | helpers.data_helpers | output_tweets.parquet saved.


### Llama 7B

In [19]:
# LLaMA 7B wrapper: loads local weights from LLAMA_7B_PATH (imported from
# config) and asks for 6 threads.
# NOTE(review): model_name is 'llama' here but 'llama_13b' for the 13B model -
# confirm which names Model expects.
llama_7b = Model(
    model_name='llama',
    local_model_path=LLAMA_7B_PATH,
    n_threads=6,
)

2023-04-04 20:15:28,928 - INFO     | models.generation | 
Initializing LLAMA model  - Temp: 1e-10 - Context window: 2048 - Max tokens: 256
llama_model_load: loading model from '.models/llama-7B/ggml-model-q4_0.bin' - please wait ...
llama_model_load: n_vocab = 32000
llama_model_load: n_ctx   = 2048
llama_model_load: n_embd  = 4096
llama_model_load: n_mult  = 256
llama_model_load: n_head  = 32
llama_model_load: n_layer = 32
llama_model_load: n_rot   = 128
llama_model_load: f16     = 2
llama_model_load: n_ff    = 11008
llama_model_load: n_parts = 1
llama_model_load: type    = 1
llama_model_load: ggml map size = 4017.70 MB
llama_model_load: ggml ctx size =  81.25 KB
llama_model_load: mem required  = 5809.78 MB (+ 2052.00 MB per state)
llama_model_load: loading tensors from '.models/llama-7B/ggml-model-q4_0.bin'
llama_model_load: model size =  4017.27 MB / num tensors = 291
llama_init_from_file: kv self size  = 2048.00 MB


In [20]:
# Register the shared prompt with LLaMA 7B, declaring `tweet` as its only
# input variable. Kept as the cell's last expression so the notebook displays
# the value the call returns.
llama_7b.init_prompt(template=PROMPT_TEMPLATE, input_vars=['tweet'])

2023-04-04 20:15:33,957 - INFO     | models.generation | Injecting Variables: ['tweet']


'Answer the question based on the context below.     Context: You are a marketing and customer relationship management assistant,     your task is to classify a given tweet as either a     potential lead or not. Provide your detailed analysis of the following tweet     as a potential lead in the context of marketing and customer relationship management.     Tweet: {tweet}     Question: Is the above tweet a potential lead? Yes or No? Why?.     Answer: '

In [21]:
# Run LLaMA 7B over the working frame and rebind `output_tweets` to the result.
# NOTE(review): the output stored under this cell reports saves to
# 'output_tweets.parquet' from `__main__`, which does not match this call -
# it appears to come from an earlier version of the loop; re-run to refresh.
# NOTE(review): same `out_name` as the other model runs - confirm the shared
# file is meant to be overwritten by each run.
output_tweets = generation_loop(
    tweets=output_tweets,
    n=N_TWEETS,
    model=llama_7b,
    model_col='llama_7b',
    out_dir='outputs',
    out_name='200T_bloom_alpaca3b_alpaca_11b',
    slow_cool=SLOW_COOLDOWN,
    fast_cool=FAST_COOLDOWN,
)

2023-04-04 20:15:42,038 - INFO     | __main__   | Starting Llama 7B generation...


  0%|          | 0/100 [00:00<?, ?it/s]

2023-04-04 20:43:13,640 - INFO     | __main__   | 50 - Saving checkpoint...
2023-04-04 20:43:13,653 - INFO     | helpers.data_helpers | output_tweets.parquet saved.
2023-04-04 21:12:21,825 - INFO     | __main__   | 100 - Saving checkpoint...
2023-04-04 21:12:21,832 - INFO     | helpers.data_helpers | output_tweets.parquet saved.


### Llama 13B

In [19]:
# LLaMA 13B wrapper: loads local weights from LLAMA_13B_PATH (imported from
# config) and asks for 6 threads.
llama_13b = Model(
    model_name='llama_13b',
    local_model_path=LLAMA_13B_PATH,
    n_threads=6,
)

2023-04-05 10:53:09,501 - INFO     | models.generation | 
Initializing LLAMA_13B model  - Temp: 1e-10 - Context window: 2048 - Max tokens: 256
llama_model_load: loading model from '.models/llama-13B/ggml-model-q4_0.bin' - please wait ...
llama_model_load: n_vocab = 32000
llama_model_load: n_ctx   = 2048
llama_model_load: n_embd  = 5120
llama_model_load: n_mult  = 256
llama_model_load: n_head  = 40
llama_model_load: n_layer = 40
llama_model_load: n_rot   = 128
llama_model_load: f16     = 2
llama_model_load: n_ff    = 13824
llama_model_load: n_parts = 2
llama_model_load: type    = 2
llama_model_load: ggml map size = 7759.83 MB
llama_model_load: ggml ctx size = 101.25 KB
llama_model_load: mem required  = 9807.93 MB (+ 3216.00 MB per state)
llama_model_load: loading tensors from '.models/llama-13B/ggml-model-q4_0.bin'
llama_model_load: model size =  7759.39 MB / num tensors = 363
llama_init_from_file: kv self size  = 3200.00 MB


In [20]:
# Register the shared prompt with LLaMA 13B, declaring `tweet` as its only
# input variable. Kept as the cell's last expression so the notebook displays
# the value the call returns.
llama_13b.init_prompt(template=PROMPT_TEMPLATE, input_vars=['tweet'])

2023-04-05 10:53:11,087 - INFO     | models.generation | Injecting Variables: ['tweet']


'Answer the question based on the context below.     Context: You are a marketing and customer relationship management assistant,     your task is to classify a given tweet as either a     potential lead or not. Provide your detailed analysis of the following tweet     as a potential lead in the context of marketing and customer relationship management.     Tweet: {tweet}     Question: Is the above tweet a potential lead? Yes or No? Why?.     Answer: '

In [23]:
# Run LLaMA 13B over the working frame and rebind `output_tweets` to the
# result. The recorded log shows a checkpoint parquet written under
# `out_name` at steps 50 and 100.
# NOTE(review): same `out_name` as the other model runs - confirm the shared
# file is meant to be overwritten by each run.
output_tweets = generation_loop(
    tweets=output_tweets,
    n=N_TWEETS,
    model=llama_13b,
    model_col='llama_13b',
    out_dir='outputs',
    out_name='200T_bloom_alpaca3b_alpaca_11b',
    slow_cool=SLOW_COOLDOWN,
    fast_cool=FAST_COOLDOWN,
)

2023-04-05 10:53:24,667 - INFO     | helpers.generation_helpers | Starting LLAMA_13B generation...


  0%|          | 0/100 [00:00<?, ?it/s]

2023-04-05 11:39:09,689 - INFO     | helpers.generation_helpers | Step: 50 - Saving checkpoint and cooldown for 2.0m...
2023-04-05 11:39:09,705 - INFO     | helpers.data_helpers | 200T_bloom_alpaca3b_alpaca_11b.parquet saved.
2023-04-05 12:24:19,503 - INFO     | helpers.generation_helpers | Step: 100 - Saving checkpoint and cooldown for 2.0m...
2023-04-05 12:24:19,533 - INFO     | helpers.data_helpers | 200T_bloom_alpaca3b_alpaca_11b.parquet saved.
