In [1]:
import re
import time
import random
from random import sample, choice
from typing import List, Tuple, Dict, Any

import evaluate
import kscope
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from evaluate import EvaluationModule
from utils import (
    copa_preprocessor,
    create_first_prompt,
    create_first_prompt_label,
    create_second_prompt,
    create_second_prompt_label,
    split_prompts_into_batches,
    create_mc_prompt,
    create_mc_prompt_answer
)

from transformers import AutoTokenizer

from collections import Counter

# Setting random seed for reproducibility.
# This seeds only Python's built-in `random` module, which drives the `sample` and
# `choice` calls used later in the notebook.
# NOTE(review): the Falcon generations later use do_sample=True through the kscope
# client; this seed presumably does not control that sampling — confirm.
random.seed(2024)

# Getting Started

There is a bit of documentation on how to interact with the large models [here](https://kaleidoscope-sdk.readthedocs.io/en/latest/). The relevant github links to the SDK are [here](https://github.com/VectorInstitute/kaleidoscope-sdk) and underlying code [here](https://github.com/VectorInstitute/kaleidoscope).

First we connect to the service through which we'll interact with the LLMs and see which models are available to us.

## Establish a client connection to the Kaleidoscope service


In [2]:
# Open a client connection to the Kaleidoscope service through which the LLMs are reached.
client = kscope.Client(gateway_host="llm.cluster.local", gateway_port=3001)

In [3]:
# Display the names of the models the service can provide.
client.models


['gpt2',
 'llama2-7b',
 'llama2-7b_chat',
 'llama2-13b',
 'llama2-13b_chat',
 'llama2-70b',
 'llama2-70b_chat',
 'falcon-7b',
 'falcon-40b',
 'sdxl-turbo']

In [4]:
# Display the model instances the service currently reports (id, name and state for each).
client.model_instances

[{'id': 'efd66405-d0ef-48d0-8ca0-2eb0c7c8127f',
  'name': 'falcon-7b',
  'state': 'ACTIVE'},
 {'id': 'e64b3690-96c3-4a50-b95f-adcb85e2a42c',
  'name': 'llama2-7b',
  'state': 'ACTIVE'}]

In this notebook, we'll actually use two models. The first will be LLaMA-2 7B and the second will be Falcon 7B.

In [5]:
# Request a handle to the LLaMA-2 7B model from the service.
llama_model = client.load_model("llama2-7b")
# If this model is not actively running, it will get launched in the background.
# In this case, wait until it moves into an "ACTIVE" state before proceeding.
# The loop polls once per second and has no timeout: it blocks for as long as the
# model reports any state other than "ACTIVE".
while llama_model.state != "ACTIVE":
    time.sleep(1)

In [6]:
# Request a handle to the Falcon 7B model from the service.
falcon_model = client.load_model("falcon-7b")
# If this model is not actively running, it will get launched in the background.
# In this case, wait until it moves into an "ACTIVE" state before proceeding.
# The loop polls once per second and has no timeout: it blocks for as long as the
# model reports any state other than "ACTIVE".
while falcon_model.state != "ACTIVE":
    time.sleep(1)

### Bootstrap Ensembling for CoPA

In the [CoPA Prompt Examples](copa_prompting_examples.ipynb) notebook, we looked at a few different ways to get the LLMs to perform the CoPA task. The best way was to evaluate the log probabilities of the candidates as completions, but we also considered a purely generative approach and a multiple choice formulation. In this notebook, we'll see if we might be able to improve the results of the latter two methods with "bootstrap" ensembling.

For a discussion of the CoPA task and the different prompting approach results, see the aforementioned notebook.

We'll term the ensembling technique here "bootstrap" ensembling since we're not going to be using different prompt structures, just different demonstrations or generations.

### Multiple Choice Formulation

We'll use the LLaMA-2 model here, as it seemed to perform better for the MC formulation. We're going to use the same prompt structure, but we're going to generate 7 distinct sets of demonstrations and use the model's answers to vote on the final answer.

In [7]:
def process_mc_generation_text(original_texts: List[str]) -> List[str]:
    """Extract the multiple-choice answer letter ("A" or "B") from each generation.

    For every generated string, the first standalone capital "A" or "B" is taken as
    the model's answer. The letter must sit on word boundaries, so capitals that are
    merely part of a longer word (e.g. the "A" in "Answer: B") are not mistaken for a
    selection.

    Args:
        original_texts: Raw generated strings, one per prompt.

    Returns:
        A list with one "A" or "B" per input string, in the same order. When a
        generation contains no standalone "A" or "B", a message is printed and the
        answer is drawn uniformly at random via ``random.choice``.
    """
    responses = []
    for single_generation in original_texts:
        # Use the first occurrence of a standalone A or B. The \b anchors stop letters
        # embedded in words such as "Answer" or "Both" from being counted as a choice.
        selection = re.search(r"\b(A|B)\b", single_generation)
        if selection is None:
            # Nothing usable in the generation: fall back to a random guess.
            print(f"Selecting Randomly. No selection match was found in: {single_generation}")
            response_text = choice(["A", "B"])
        else:
            response_text = selection.group(1)
        responses.append(response_text)
    return responses

In [8]:
# How many of the initial data points should be reserved for demonstrations
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 8

# Load the CoPA sample. Each record is unpacked below as
# (premise, label, phrase, first_choice, second_choice).
copa_data_set = copa_preprocessor("resources/copa_sample.tsv")
# Everything after the first `demonstration_candidates` records is evaluated, so the
# demonstration pool (the first `demonstration_candidates` records) and the test pool
# never overlap.
test_pool = copa_data_set[demonstration_candidates:]

# Note that we use a GREEDY decoding strategy for the model (temperature 0.0), so the
# variation between voters comes from the sampled demonstrations in the prompt.
llama_generation_config = {"max_tokens": 4, "top_p": 1.0, "temperature": 0.0}

# One list of parsed answers per voter: all_llama_responses[v][i] is voter v's
# "A"/"B" answer for the i-th test example.
all_llama_responses = []

# An odd number of voters means a two-way ("A" vs "B") majority vote cannot tie.
number_of_voters = 7
for voter_number in range(number_of_voters):
    print(f"Starting MC Choice Response Number: {voter_number + 1}")
    # Each voter draws its own set of demonstrations (without replacement) from the
    # shared pool; this advances the global `random` state once per voter.
    demonstration_pool = copa_data_set[0:demonstration_candidates]
    demonstrations = sample(demonstration_pool, n_demonstrations)
    # These lists are rebuilt for every voter. `labels`, `int_labels` and `choices`
    # depend only on `test_pool`, so they are presumably identical across voters
    # (assumes create_mc_prompt_answer is deterministic — TODO confirm). The values left
    # after the final iteration are what later cells read.
    prompts: List[str] = []
    labels: List[str] = []
    int_labels: List[int] = []
    choices: List[Tuple[str, str]] = []
    for premise, label, phrase, first_choice, second_choice in test_pool:
        int_labels.append(label)
        choices.append((first_choice, second_choice))
        labels.append(create_mc_prompt_answer(label))
        prompts.append(create_mc_prompt(demonstrations, premise, phrase, first_choice, second_choice))

    llama_responses = []
    # The second argument is presumably the batch size (the recorded progress bars show
    # 10 batches per voter) — TODO confirm against utils.split_prompts_into_batches.
    prompt_batches = split_prompts_into_batches(prompts, 10)
    for prompt_batch in tqdm(prompt_batches):
        generations = llama_model.generate(prompt_batch, llama_generation_config)
        # generation["sequences"] is passed on as the list of generated texts for the
        # batch; each one is reduced to a single "A"/"B" answer.
        llama_responses.extend(process_mc_generation_text(generations.generation["sequences"]))

    all_llama_responses.append(llama_responses)

Starting MC Choice Response Number: 1


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:23<00:00,  2.32s/it]


Starting MC Choice Response Number: 2


100%|██████████| 10/10 [00:23<00:00,  2.34s/it]


Starting MC Choice Response Number: 3


100%|██████████| 10/10 [00:23<00:00,  2.32s/it]


Starting MC Choice Response Number: 4


100%|██████████| 10/10 [00:25<00:00,  2.56s/it]


Starting MC Choice Response Number: 5


100%|██████████| 10/10 [00:22<00:00,  2.28s/it]


Starting MC Choice Response Number: 6


100%|██████████| 10/10 [00:23<00:00,  2.37s/it]


Starting MC Choice Response Number: 7


100%|██████████| 10/10 [00:24<00:00,  2.48s/it]


In [9]:
# Majority vote over the voters' answers.
# zip(*all_llama_responses) regroups the per-voter answer lists into one tuple of
# answers per test example; the most common answer in each tuple is the ensemble's
# prediction, which is then compared with the gold answer in `labels`.
vote_outcomes = [
    Counter(ballot).most_common(1)[0][0] == gold_answer
    for ballot, gold_answer in zip(zip(*all_llama_responses), labels)
]
total = len(vote_outcomes)
correct = sum(vote_outcomes)

print(f"Accuracy: {correct/total}")

Accuracy: 0.75


We actually get a pretty nice increase in the accuracy here (it was only 0.68 in the other notebook). The quality and distribution of the demonstrations has an impact on how well a model performs a task. By sampling several sets of demonstrations, we were able to improve our results!

### Generation Formulation

We'll use Falcon for this example as an alternative to using LLaMA-2 above. It performed a little better in the previous notebook example as well. In this setting, we're going to use the same set of demonstrations, but we'll allow the model to sample different generation trajectories and see what happens when we use a simple voting strategy to tabulate the prediction. Generation is a bit more "expensive" than the multiple choice setting. So we'll perform 3 distinct generations and see if we can get any improvement.

In [10]:
def process_generation_text(original_texts: List[str]) -> List[str]:
    """Trim each generation down to its first sentence-like span.

    A span is the shortest run of non-newline characters that ends in ".", "!" or "?".
    The first such span found in the text is kept; a generation that contains none of
    those terminators is returned unchanged.

    Args:
        original_texts: Raw generated strings, one per prompt.

    Returns:
        The trimmed strings, in the same order as the input.
    """
    sentence_pattern = re.compile(r".*?[.!\?]")

    def _first_sentence(text: str) -> str:
        # Fall back to the full text when no terminator is present.
        found = sentence_pattern.search(text)
        return text if found is None else found.group(0)

    return [_first_sentence(text) for text in original_texts]

In [11]:
# How many of the initial data points should be reserved for demonstrations
demonstration_candidates = 50
# Number of demonstrations to be used per prompt
n_demonstrations = 10
# `copa_data_set` (and `test_pool` below) come from the multiple-choice cell earlier in
# the notebook, so that cell must have been run first.
# Unlike the multiple-choice experiment, the demonstrations are drawn once here and
# shared by every voter.
demonstration_pool = copa_data_set[0:demonstration_candidates]
demonstrations = sample(demonstration_pool, n_demonstrations)

# Note that we use a non-greedy decoding strategy for this model to produce variation in the responses.
# With a fixed prompt per example, this sampling is the only source of diversity between voters.
falcon_generation_config = {"max_tokens": 20, "top_k": 4, "temperature": 0.8, "do_sample": True}

# One list of processed generations per voter: all_falcon_responses[v][i] is voter v's
# text for the i-th test example.
all_falcon_responses = []

number_of_voters = 3
for voter_number in range(number_of_voters):
    print(f"Starting Generative Response Number: {voter_number + 1}")
    # These lists are rebuilt for every voter although they depend only on `test_pool`
    # and the fixed `demonstrations`, so they are presumably identical across voters
    # (assumes the prompt helpers are deterministic — TODO confirm). `int_labels` and
    # `choices` from the final iteration are read by the scoring cell below.
    prompts: List[str] = []
    labels: List[str] = []
    int_labels: List[int] = []
    choices: List[Tuple[str, str]] = []
    for premise, label, phrase, first_choice, second_choice in test_pool:
        int_labels.append(label)
        choices.append((first_choice, second_choice))
        labels.append(create_first_prompt_label(first_choice.lower(), second_choice.lower(), label))
        prompts.append(create_first_prompt(demonstrations, premise, phrase, first_choice, second_choice))

    falcon_responses = []
    # The second argument is presumably the batch size, i.e. one prompt per request here
    # (the recorded progress bars show 100 batches per voter) — TODO confirm.
    prompt_batches = split_prompts_into_batches(prompts, 1)
    for prompt_batch in tqdm(prompt_batches):
        generations = falcon_model.generate(prompt_batch, falcon_generation_config)
        # Keep only the first sentence-like span of each generated text.
        falcon_responses.extend(process_generation_text(generations.generation["sequences"]))

    all_falcon_responses.append(falcon_responses)

Starting Generative Response Number: 1


100%|██████████| 100/100 [04:10<00:00,  2.50s/it]


Starting Generative Response Number: 2


100%|██████████| 100/100 [04:05<00:00,  2.45s/it]


Starting Generative Response Number: 3


100%|██████████| 100/100 [03:59<00:00,  2.40s/it]


In [12]:
# Load the ROUGE metric once; it is passed to score_response_via_rouge for every response.
rouge_metric = evaluate.load("rouge")

In [13]:
def score_response_via_rouge(
    response: str, first_choice: str, second_choice: str, rouge_metric: EvaluationModule
) -> int:
    """Decide which candidate a generated response resembles more, using ROUGE.

    The response and both candidates are lower-cased. The response is then scored
    against each candidate as the reference, and each candidate's score is the mean of
    its "rouge1" and "rouge2" values.

    Args:
        response: Generated text to classify.
        first_choice: Text of the first candidate.
        second_choice: Text of the second candidate.
        rouge_metric: Metric object whose ``compute(predictions=..., references=...)``
            result exposes "rouge1" and "rouge2" entries.

    Returns:
        0 when the first candidate's score is strictly greater than the second's,
        otherwise 1 (ties therefore resolve to the second candidate).
    """
    lowered_response = response.lower()

    def _blended_rouge(reference: str) -> float:
        # Mean of the unigram and bigram ROUGE scores against a single reference.
        result = rouge_metric.compute(predictions=[lowered_response], references=[reference.lower()])
        return (result["rouge1"] + result["rouge2"]) / 2.0

    first_score = _blended_rouge(first_choice)
    second_score = _blended_rouge(second_choice)
    if first_score > second_score:
        return 0
    return 1

In [14]:
# Majority-vote evaluation of the sampled Falcon generations.
# zip(*all_falcon_responses) regroups the per-voter response lists into one tuple of
# responses per test example. Each response is mapped to 0 (first choice) or 1 (second
# choice) by ROUGE overlap, and the most common label in the tuple is the prediction.
total = 0
correct = 0
# The loop variable is deliberately NOT called `falcon_responses`: that name holds the
# last voter's response list from the generation cell, and rebinding it here to a
# per-example tuple would silently change notebook state for any cell that reads it.
for response_tuple, label_int, (first_choice, second_choice) in zip(
    zip(*all_falcon_responses), int_labels, choices
):
    predicted_labels = [
        score_response_via_rouge(response, first_choice, second_choice, rouge_metric)
        for response in response_tuple
    ]
    majority_prediction = Counter(predicted_labels).most_common(1)[0][0]
    total += 1
    if majority_prediction == label_int:
        correct += 1
    # Lightweight progress report, since each example needs several ROUGE computations.
    if total % 10 == 0:
        print(f"Processed {total} Response Tuples")

print(f"Falcon Accuracy: {correct/total}")

Processed 10 Response Tuples
Processed 20 Response Tuples
Processed 30 Response Tuples
Processed 40 Response Tuples
Processed 50 Response Tuples
Processed 60 Response Tuples
Processed 70 Response Tuples
Processed 80 Response Tuples
Processed 90 Response Tuples
Processed 100 Response Tuples
Falcon Accuracy: 0.64


Unfortunately, our stochastic generation didn't help us improve accuracy in this case. In the previous notebook, we saw an accuracy of 0.68. However, we only produced three families of generations. So it's possible that running additional generations would help.