In [1]:
import os
import random
import re
import time
from collections import Counter
from random import choice, sample
from typing import Any, Dict, List, Tuple

import evaluate
import kscope
import numpy as np
import torch
import torch.nn as nn
from evaluate import EvaluationModule
from tqdm import tqdm
from transformers import AutoTokenizer
from utils import (
    copa_preprocessor,
    create_first_prompt,
    create_mc_prompt,
    create_second_prompt,
    split_prompts_into_batches,
)

# Seed Python's built-in RNG so the `sample` / `choice` calls used later in the notebook draw the same values
# on every fresh top-to-bottom run.
random.seed(2024)

# Getting Started

There is a bit of documentation on how to interact with the large models [here](https://kaleidoscope-sdk.readthedocs.io/en/latest/). The relevant github links to the SDK are [here](https://github.com/VectorInstitute/kaleidoscope-sdk) and underlying code [here](https://github.com/VectorInstitute/kaleidoscope).

First we connect to the service through which we'll interact with the LLMs and see which models are available to us.

## Establish a client connection to the Kaleidoscope service


In [2]:
# Open a client against the Kaleidoscope gateway.
# NOTE(review): host and port are hard-coded here — adjust them if your deployment differs.
client = kscope.Client(gateway_host="llm.cluster.local", gateway_port=3001)

In [3]:
# Display the names of the models the service offers.
client.models

['gpt2',
 'llama2-7b',
 'llama2-7b_chat',
 'llama2-13b',
 'llama2-13b_chat',
 'llama2-70b',
 'llama2-70b_chat',
 'falcon-7b',
 'falcon-40b',
 'sdxl-turbo']

In [4]:
# Display the model instances the service currently reports, each with its id, name and state.
client.model_instances

[{'id': 'efd66405-d0ef-48d0-8ca0-2eb0c7c8127f',
  'name': 'falcon-7b',
  'state': 'ACTIVE'},
 {'id': 'e64b3690-96c3-4a50-b95f-adcb85e2a42c',
  'name': 'llama2-7b',
  'state': 'ACTIVE'}]

In [5]:
# Ask the service for the LLaMA-2 7B model. If it is not actively running, it will get launched in the
# background, so poll its state once per second and only continue once it reports "ACTIVE".
# NOTE(review): there is no timeout — this cell blocks for as long as the model stays in another state.
llama_model = client.load_model("llama2-7b")
while True:
    if llama_model.state == "ACTIVE":
        break
    time.sleep(1)

In [6]:
# Ask the service for the Falcon 7B model. If it is not actively running, it will get launched in the
# background, so poll its state once per second and only continue once it reports "ACTIVE".
# NOTE(review): there is no timeout — this cell blocks for as long as the model stays in another state.
falcon_model = client.load_model("falcon-7b")
while True:
    if falcon_model.state == "ACTIVE":
        break
    time.sleep(1)

###  Vote Ensembling

In the [CoPA Prompt Examples](copa_prompting_examples.ipynb) notebook, we looked at a few different ways to get the LLMs to perform the CoPA task. The best way was to evaluate the log probabilities of the candidates as completions, but we also considered a purely generative approach and a multiple choice formulation. In this notebook, we'll see if we can improve our accuracy on the CoPA task by combining each of our approaches through a voting mechanism. For voting, we'll use:
1) LLaMA Generation
2) Falcon Generation
3) LLaMA MC with Bootstrapping (see [Bootstrap Ensembling Notebook](bootstrap_ensembling.ipynb))
4) Falcon MC with Bootstrapping (see [Bootstrap Ensembling Notebook](bootstrap_ensembling.ipynb))
5) LLaMA Log Probability Estimation

Each of these methods will constitute a vote for a particular label and then we'll measure accuracy.

__Note__ We're doing a lot of generations here. This notebook takes a fair bit of time to run.

In [7]:
# How many of the initial data points should be reserved for demonstrations
demonstration_candidates = 50

# Load the CoPA sample and keep everything after the reserved demonstration slice as the evaluation pool.
copa_data_set = copa_preprocessor("resources/copa_sample.tsv")
test_pool = copa_data_set[demonstration_candidates:]

# Each voting method appends one list of predicted labels (one entry per test example) to this list.
all_predictions = []

### Generation Formulation

In [8]:
# Number of demonstrations to be used per prompt
n_demonstrations = 10
# Demonstrations are drawn at random from the examples reserved at the front of the data set.
demonstration_pool = copa_data_set[:demonstration_candidates]
demonstrations = sample(demonstration_pool, n_demonstrations)

# Build the gold labels, the candidate completions and the generation prompts for every test example.
# The three lists are index-aligned with `test_pool`.
int_labels: List[int] = [label for _, label, _, _, _ in test_pool]
choices: List[Tuple[str, str]] = [
    (first_choice, second_choice) for _, _, _, first_choice, second_choice in test_pool
]
prompts: List[str] = [
    create_first_prompt(demonstrations, premise, phrase, first_choice, second_choice)
    for premise, _, phrase, first_choice, second_choice in test_pool
]

In [9]:
def process_generation_text(original_texts: List[str]) -> List[str]:
    """Trim every generated text down to its first sentence.

    A "sentence" is the shortest run of characters on a single line that ends in ".", "!" or "?". Because "."
    in the pattern does not match a newline, lines without such punctuation are skipped and the first
    punctuated line supplies the sentence.

    Args:
        original_texts: Raw generations, one string per prompt.

    Returns:
        One string per input, in the same order: the first sentence if one was found, otherwise the
        unmodified generation.
    """
    first_sentence = re.compile(r".*?[.!\?]")
    trimmed_texts = []
    for raw_text in original_texts:
        hit = first_sentence.search(raw_text)
        trimmed_texts.append(raw_text if hit is None else hit.group(0))
    return trimmed_texts

In [10]:
# Note that both of these are GREEDY decoding strategies for the different models
llama_generation_config = {"max_tokens": 20, "top_p": 1.0, "temperature": 0.0}
falcon_generation_config = {"max_tokens": 20, "top_k": 1, "temperature": 1.0, "do_sample": False}

In [11]:
# LLaMA pass: send the prompts in batches of 10 and keep the first sentence of every generation.
prompt_batches = split_prompts_into_batches(prompts, 10)
llama_responses = []
for prompt_batch in tqdm(prompt_batches):
    generations = llama_model.generate(prompt_batch, llama_generation_config)
    llama_responses += process_generation_text(generations.generation["sequences"])

# Falcon pass over the same prompts. Falcon requires a batch size of 8 or less.
prompt_batches = split_prompts_into_batches(prompts, 8)
falcon_responses = []
for prompt_batch in tqdm(prompt_batches):
    generations = falcon_model.generate(prompt_batch, falcon_generation_config)
    falcon_responses += process_generation_text(generations.generation["sequences"])

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:33<00:00,  3.35s/it]
100%|██████████| 13/13 [02:27<00:00, 11.33s/it]


We can perform scoring based on the generated text by considering the ROUGE score of each response against the two candidate completions. We choose between the two available choices for the logical completion of the reference phrase. The model has provided a response and we treat each choice, in turn, as the reference for the ROUGE metric. We take as the model's prediction the choice with the highest ROUGE score when compared to the response text.

In [12]:
# Load the ROUGE metric once; it is passed to `score_response_via_rouge` for every response scored below.
rouge_metric = evaluate.load("rouge")

In [13]:
def score_response_via_rouge(
    response: str, first_choice: str, second_choice: str, rouge_metric: EvaluationModule
) -> int:
    """Pick the candidate completion that the generated response overlaps with most.

    The response and both candidates are lower-cased. The response is then scored against each candidate
    (used as the reference) and the score for a candidate is the mean of its "rouge1" and "rouge2" values.

    Args:
        response: Text generated by the model.
        first_choice: First candidate completion (label 0).
        second_choice: Second candidate completion (label 1).
        rouge_metric: Metric object whose `compute(predictions=..., references=...)` result exposes
            "rouge1" and "rouge2" entries.

    Returns:
        0 when the first choice scores strictly higher, otherwise 1. Ties therefore resolve to 1.
    """
    prediction = response.lower()

    def mean_rouge(reference: str) -> float:
        # Average of the unigram and bi-gram scores of the response against a single reference.
        scores = rouge_metric.compute(predictions=[prediction], references=[reference.lower()])
        return (scores["rouge1"] + scores["rouge2"]) / 2.0

    first_score = mean_rouge(first_choice)
    second_score = mean_rouge(second_choice)
    if first_score > second_score:
        return 0
    return 1

In [14]:
# Turn each LLaMA generation into a 0/1 label by comparing it with its two candidate completions.
llama_predictions = [
    score_response_via_rouge(response, first_choice, second_choice, rouge_metric)
    for response, (first_choice, second_choice) in zip(llama_responses, choices)
]
all_predictions.append(llama_predictions)

# Same scoring for the Falcon generations.
falcon_predictions = [
    score_response_via_rouge(response, first_choice, second_choice, rouge_metric)
    for response, (first_choice, second_choice) in zip(falcon_responses, choices)
]
all_predictions.append(falcon_predictions)

### Multiple Choice Formulation

In [15]:
def process_mc_generation_text(original_texts: List[str]) -> List[str]:
    """Reduce each multiple-choice generation to a single answer letter.

    The first capital "A" or "B" found anywhere in a generation is taken as the answer. When a generation
    contains neither letter, a message is printed and the answer is picked at random between "A" and "B".

    NOTE(review): the pattern is not anchored to word boundaries, so a capital letter inside a longer word
    (for example the "A" in "Answer") also counts as a selection.

    Args:
        original_texts: Raw generations, one string per prompt.

    Returns:
        One "A" or "B" per input, in the same order.
    """
    selections = []
    for single_generation in original_texts:
        first_letter = re.search(r"(A|B)", single_generation)
        if first_letter is None:
            print(f"Selecting Randomly. No selection match was found in: {single_generation}")
            selections.append(choice(["A", "B"]))
        else:
            selections.append(first_letter.group(1))
    return selections

In [16]:
# Number of demonstrations to be used per prompt
n_demonstrations = 8

# Note that we use a GREEDY decoding strategies for the model.
llama_generation_config = {"max_tokens": 4, "top_p": 1.0, "temperature": 0.0}

all_llama_responses = []

number_of_voters = 7
for voter_number in range(number_of_voters):
    print(f"Starting MC Choice Response Number: {voter_number + 1}")
    demonstration_pool = copa_data_set[0:demonstration_candidates]
    demonstrations = sample(demonstration_pool, n_demonstrations)
    prompts = []
    choices = []
    for premise, label, phrase, first_choice, second_choice in test_pool:
        choices.append((first_choice, second_choice))
        prompts.append(create_mc_prompt(demonstrations, premise, phrase, first_choice, second_choice))

    llama_responses = []
    prompt_batches = split_prompts_into_batches(prompts, 10)
    for prompt_batch in tqdm(prompt_batches):
        generations = llama_model.generate(prompt_batch, llama_generation_config)
        llama_responses.extend(process_mc_generation_text(generations.generation["sequences"]))

    all_llama_responses.append(llama_responses)

Starting MC Choice Response Number: 1


100%|██████████| 10/10 [00:22<00:00,  2.29s/it]


Starting MC Choice Response Number: 2


100%|██████████| 10/10 [00:23<00:00,  2.30s/it]


Starting MC Choice Response Number: 3


100%|██████████| 10/10 [00:24<00:00,  2.46s/it]


Starting MC Choice Response Number: 4


100%|██████████| 10/10 [00:23<00:00,  2.31s/it]


Starting MC Choice Response Number: 5


100%|██████████| 10/10 [00:24<00:00,  2.41s/it]


Starting MC Choice Response Number: 6


100%|██████████| 10/10 [00:24<00:00,  2.41s/it]


Starting MC Choice Response Number: 7


100%|██████████| 10/10 [00:23<00:00,  2.33s/it]


In [17]:
# Transpose the per-round answers into per-example tuples, take the most common letter for each example
# and map it to a label: "A" -> 0, anything else -> 1.
llama_predictions = []
for example_votes in zip(*all_llama_responses):
    winning_letter, _ = Counter(example_votes).most_common(1)[0]
    llama_predictions.append(0 if winning_letter == "A" else 1)
all_predictions.append(llama_predictions)

In [18]:
# Number of demonstrations to be used per prompt
n_demonstrations = 8

# Note that we use a GREEDY decoding strategies for the model.
falcon_generation_config = {"max_tokens": 4, "top_k": 1, "temperature": 1.0, "do_sample": False}

all_falcon_responses = []

number_of_voters = 7
for voter_number in range(number_of_voters):
    print(f"Starting MC Choice Response Number: {voter_number + 1}")
    demonstration_pool = copa_data_set[0:demonstration_candidates]
    demonstrations = sample(demonstration_pool, n_demonstrations)
    prompts = []
    choices = []
    for premise, label, phrase, first_choice, second_choice in test_pool:
        choices.append((first_choice, second_choice))
        prompts.append(create_mc_prompt(demonstrations, premise, phrase, first_choice, second_choice))

    falcon_responses = []
    prompt_batches = split_prompts_into_batches(prompts, 1)
    for prompt_batch in tqdm(prompt_batches):
        generations = falcon_model.generate(prompt_batch, falcon_generation_config)
        falcon_responses.extend(process_mc_generation_text(generations.generation["sequences"]))

    all_falcon_responses.append(falcon_responses)

Starting MC Choice Response Number: 1


100%|██████████| 100/100 [01:12<00:00,  1.37it/s]


Starting MC Choice Response Number: 2


100%|██████████| 100/100 [01:08<00:00,  1.45it/s]


Starting MC Choice Response Number: 3


100%|██████████| 100/100 [01:12<00:00,  1.39it/s]


Starting MC Choice Response Number: 4


100%|██████████| 100/100 [01:17<00:00,  1.30it/s]


Starting MC Choice Response Number: 5


100%|██████████| 100/100 [01:14<00:00,  1.34it/s]


Starting MC Choice Response Number: 6


100%|██████████| 100/100 [01:11<00:00,  1.40it/s]


Starting MC Choice Response Number: 7


100%|██████████| 100/100 [01:14<00:00,  1.35it/s]


In [19]:
# Transpose the per-round answers into per-example tuples, take the most common letter for each example
# and map it to a label: "A" -> 0, anything else -> 1.
falcon_predictions = []
for example_votes in zip(*all_falcon_responses):
    winning_letter, _ = Counter(example_votes).most_common(1)[0]
    falcon_predictions.append(0 if winning_letter == "A" else 1)
all_predictions.append(falcon_predictions)

### Log-Likelihood Formulation

In [20]:
# Tokenizer prepares the input of the model. LLaMA models of all sizes use the same underlying tokenizer
tokenizer = AutoTokenizer.from_pretrained("/Users/david/Desktop/LLaMA2_Tokenizer")
# Let's test out how the tokenizer works on an example sentence. Note that the token with ID = 1 is the
# Beginning of sentence token ("BOS")
encoded_tokens = tokenizer.encode("Hello this is a test")
print(f"Encoded Tokens: {encoded_tokens}")
# If you ever need to move back from token ids, you can use tokenizer.decode or tokenizer.batch_decode
decoded_tokens = tokenizer.decode(encoded_tokens)
print(f"Decoded Tokens: {decoded_tokens}")

# We're interested in the activations from the last layer of the model, because this will allow us to calculate the
# likelihoods
last_layer_name = llama_model.module_names[-1]
print(f"Last Layer Name: {last_layer_name}")
# Get a log softmax function to compute log probabilities from the output layer.
log_softmax = nn.LogSoftmax(dim=1)

endline_token_id = tokenizer.encode("Hello\n")[-1]
print(f"Endline Token Id: {endline_token_id}")

Encoded Tokens: [1, 15043, 445, 338, 263, 1243]
Decoded Tokens: <s> Hello this is a test
Last Layer Name: output
Endline Token Id: 13


In [21]:
def compute_log_probability_from_activations(logits: torch.Tensor, token_ids: List[int]) -> float:
    """Return the length-normalised log probability of the last line of a tokenised prompt.

    Relies on the notebook-level `log_softmax` (applied along dim 1) and `endline_token_id`.

    Args:
        logits: Output-layer activations, one row per input token and one column per vocabulary entry.
        token_ids: Token ids of the full prompt, including the leading <s> token. The list is NOT modified.

    Returns:
        The mean log probability of the tokens that follow the last newline token. If no token follows the
        last newline the mean is taken over zero elements and the result is NaN.

    Raises:
        ValueError: If the newline token does not occur in the shifted token ids.
    """
    # First we get the logprobs associated with each token, logits is n_tokens x vocabulary size
    log_probs = log_softmax(logits.type(torch.float32))
    # Row i of the logits scores the token at position i + 1, so drop the first id (the <s> token) and add a
    # placeholder at the end to keep one id per row. This is done on a copy: mutating the caller's list would
    # make a second call with the same list silently score a different sequence.
    shifted_token_ids = list(token_ids[1:]) + [1]
    # We only really care about the logprobs associated with the sentence to be completed
    # (i.e. not the demonstrations or the question). So search for the last endline in the tokens and only
    # sum the logprobs thereafter.
    endline_index = len(shifted_token_ids) - shifted_token_ids[::-1].index(endline_token_id)
    # Turn token ids into the appropriate column indices. The tensor is built as int64 directly rather than
    # going through a float tensor first.
    token_id_slicer = torch.tensor(shifted_token_ids, dtype=torch.int64).reshape(-1, 1)
    log_probs_per_token = log_probs.gather(1, token_id_slicer)
    # Keep everything after the last endline except the final entry, which belongs to the placeholder (the as
    # yet unpredicted token), then normalize by the number of tokens kept.
    selected_log_probs_per_token = log_probs_per_token[endline_index:-1]
    normalized_log_prob = torch.sum(selected_log_probs_per_token) / len(selected_log_probs_per_token)
    return normalized_log_prob.item()

In [22]:
# We're running a lot of activation retrievals. Once in a while there is a json decoding or triton error. If that
# happens, we retry the activations request.
def get_activations_with_retries(prompt: str, layers: List[str], config: Dict[str, Any], retries: int = 5) -> Any:
    """Request activations from the notebook-level `llama_model`, retrying when the request fails.

    Args:
        prompt: Prompt to send to the model.
        layers: Names of the layers whose activations are requested.
        config: Generation configuration forwarded to the model.
        retries: Maximum number of attempts.

    Returns:
        Whatever `llama_model.get_activations` returns on the first successful attempt.

    Raises:
        ValueError: If every attempt failed. The error from the last attempt is attached as `__cause__` so
            the underlying failure is not lost.
    """
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            return llama_model.get_activations(prompt, layers, config)
        except Exception as error:
            # Keep the failure and show it, so a persistent problem can be told apart from a transient one.
            last_error = error
            print(f"Something went wrong in activation retrieval (attempt {attempt} of {retries}): {error!r}")
    raise ValueError("Exceeded retry limit. Exiting Process") from last_error

In [23]:
def pair_prompts_with_choices(prompt_batch: List[Tuple[str, Tuple[str, str]]]) -> List[str]:
    """Append each candidate completion to its prompt so both completions can be scored by the LM.

    Args:
        prompt_batch: Pairs of (prompt, (first_choice, second_choice)).

    Returns:
        A flat list twice as long as the input. For every prompt it holds the prompt followed by the
        lower-cased first choice, then the prompt followed by the lower-cased second choice.
    """
    return [
        f"{prompt}{candidate.lower()}"
        for prompt, (first_choice, second_choice) in prompt_batch
        for candidate in (first_choice, second_choice)
    ]

In [24]:
def post_process_logprobs_to_labels(logprobs: List[float]) -> Tuple[List[int], List[List[float]]]:
    """Convert a flat list of completion log probabilities into per-example labels.

    The input holds two consecutive entries per example: the score of the first completion followed by the
    score of the second.

    Args:
        logprobs: Flat list of scores; its length must be even.

    Returns:
        A tuple `(predicted_labels, predicted_logprobs)`. `predicted_labels` has one plain Python int per
        example, 0 or 1, naming the higher-scoring completion (0 on a tie). `predicted_logprobs` has the
        corresponding `[first_score, second_score]` pair per example.

    Raises:
        AssertionError: If `logprobs` has odd length.
    """
    # Need to group logprobs in twos because they represent likelihoods of the two completions
    assert len(logprobs) % 2 == 0, "logprobs must contain two entries per example"
    paired_logprobs = [logprobs[x : x + 2] for x in range(0, len(logprobs), 2)]
    # np.argmax returns a NumPy integer; cast it so the labels really are the `int`s the signature promises.
    predicted_labels: List[int] = [int(np.argmax(logprob_pair, axis=0)) for logprob_pair in paired_logprobs]
    return predicted_labels, paired_logprobs

In [25]:
# Rebuild the candidate pairs and the prompts for the log-likelihood formulation. Here the prompt stops before
# the completion; the candidates are appended separately when the completions are scored.
# NOTE(review): `demonstrations` is not sampled in this cell. It holds whatever the most recently executed cell
# assigned — when the notebook is run top to bottom, the sample drawn in the final round of the Falcon MC loop.
choices = [(first_choice, second_choice) for _, _, _, first_choice, second_choice in test_pool]
prompts = [create_second_prompt(demonstrations, premise, phrase) for premise, _, phrase, _, _ in test_pool]

In [26]:
# Complete every prompt with each of its two candidates. The resulting list is twice as long as `prompts`,
# with the two completions of an example sitting next to each other.
prompts_and_choices = pair_prompts_with_choices(list(zip(prompts, choices)))
# We split the prompts into batches of 1 for memory management since activation retrieval is a bit heavy.
prompt_batches = split_prompts_into_batches(prompts_and_choices, 1)
llama_generation_config = {"max_tokens": 1, "top_p": 1.0, "temperature": 0.0}

all_logprobs = []
for prompt_batch in tqdm(prompt_batches):
    # Process below only works for batches of size 1
    assert len(prompt_batch) == 1
    single_prompt = prompt_batch[0]
    # Fetch the last-layer activations for the completed prompt and tokenize the same text, so the
    # activations can be read off at the ids of the tokens that actually occur.
    prompt_activations = get_activations_with_retries(single_prompt, [last_layer_name], llama_generation_config)  # type: ignore # noqa: E501
    token_ids = tokenizer.encode(single_prompt)
    last_layer_matrix = prompt_activations.activations[0][last_layer_name]
    # The score of a completion is the length-normalized log probability of the prompt's final line.
    all_logprobs.append(compute_log_probability_from_activations(last_layer_matrix, token_ids))

100%|██████████| 200/200 [11:22<00:00,  3.41s/it]


In [27]:
# Collapse the flat list of completion scores into one label per example. The paired scores that are
# returned alongside the labels are not needed here. The labels are then added as another voter.
predicted_labels, _ = post_process_logprobs_to_labels(all_logprobs)
all_predictions.append(predicted_labels)

### Perform the Voting!

At long last, we have all of our predictions from all of our prompts. Now we're going to use them to vote for each example.

In [29]:
# Regroup the predictions so there is one tuple of votes per test example (one vote from every method) and
# reduce each tuple to its most common label.
majority_predictions = [Counter(votes).most_common(1)[0][0] for votes in zip(*all_predictions)]

# Pair each majority vote with the gold label and count how many agree.
graded_pairs = list(zip(majority_predictions, int_labels))
total = len(graded_pairs)
correct = sum(1 for majority_prediction, label in graded_pairs if majority_prediction == label)

print(f"Accuracy: {correct/total}")

Accuracy: 0.81


Comparing to any of the results we saw in the other notebooks ([Prompting Examples](copa_prompting_examples.ipynb) and [Bootstrap Examples](bootstrap_ensembling.ipynb)), this is the best accuracy we've gotten and the first time over 80%. This is a nice result, but it should be noted that it was quite "expensive" to generate so many prompts. So the increase in accuracy has to be weighed against that cost.