In [19]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import os
import openai
from dotenv import load_dotenv
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

In [48]:
dataset = load_dataset("boolq")

Found cached dataset boolq (C:/Users/joshn/.cache/huggingface/datasets/boolq/default/0.1.0/bf0dd57da941c50de94ae3ce3cef7fea48c08f337a4b7aac484e9dddc5aa24e5)
100%|██████████| 2/2 [00:00<00:00, 666.24it/s]


In [71]:
dataset_train_subset = dataset['train']
dataset = dataset.shuffle()

dataset_train = dataset['train']
dataset_test = dataset['validation'][:30]

# Get the first 4 trues and 4 falses
dataset_train_true = dataset_train.filter(lambda example: example['answer'] == True).select(range(4)).to_pandas()
dataset_train_false = dataset_train.filter(lambda example: example['answer'] == False).select(range(4)).to_pandas()

# Interleave the two datasets
dataset_train = pd.concat([dataset_train_true, dataset_train_false]).sample(frac=1).reset_index(drop=True)

dataset_train

100%|██████████| 10/10 [00:00<00:00, 128.21ba/s]
100%|██████████| 10/10 [00:00<00:00, 135.13ba/s]


Unnamed: 0,question,answer,passage
0,is cognitive therapy the same as cognitive beh...,False,Cognitive therapy (CT) is a type of psychother...
1,is pittsburgh the only burgh with an h,False,"The name of the city of Pittsburgh, Pennsylvan..."
2,do you have to use cure when making sausage,False,Sausages come in two main types: fresh and cur...
3,are dr martens and doc martens the same,True,Dr. Martens is a British footwear and clothing...
4,is there such a thing as a black eagle,True,"The black eagle is a large but slender eagle, ..."
5,was the movie man on fire a true story,False,Man on Fire is a 2004 British-American crime t...
6,does florida constitution have a bill of rights,True,The first article of the Florida Constitution ...
7,is there going to be a season 5 of jane the ve...,True,"On April 2, 2018, The CW renewed the series fo..."


In [72]:
# Create the prompt
prompt = "".join([f"Question: {question}\nAnswer: {answer}\n" for question, answer in zip(dataset_train['question'], dataset_train['answer'])])
print(prompt)


Question: is cognitive therapy the same as cognitive behavioural therapy
Answer: False
Question: is pittsburgh the only burgh with an h
Answer: False
Question: do you have to use cure when making sausage
Answer: False
Question: are dr martens and doc martens the same
Answer: True
Question: is there such a thing as a black eagle
Answer: True
Question: was the movie man on fire a true story
Answer: False
Question: does florida constitution have a bill of rights
Answer: True
Question: is there going to be a season 5 of jane the vergin
Answer: True



In [51]:
responses = []

for question in tqdm(dataset_test['question']):
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt + f"Question: {question}",
        temperature=0.7,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
      )
    
    responses.append(response.choices[0].text)

100%|██████████| 30/30 [00:13<00:00,  2.20it/s]


In [52]:
# Print out last line of each response
for r in responses[:10]:
    print(r.splitlines()[-1])

Answer: False
Answer: True
Answer: True
Answer: True
Answer: True
Answer: False
Answer: True
Answer: True
Answer: False
Answer: True


In [53]:
answers = [r.endswith("True") for r in responses]

# Calculate accuracy
accuracy = sum([a == b for a, b in zip(answers, dataset_test['answer'])]) / len(answers)

print(f"OpenAI Accuracy: {accuracy}")

OpenAI Accuracy: 0.8


In [54]:
# Now we do the same with BLOOMZ

from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import requests

In [56]:
headers = {
    "Authorization": f"Bearer {os.getenv('HF_API_KEY')}", 
    "Content-Type": "application/json"
}
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloomz"

def query(payload):
    data = json.dumps(payload)
    response = requests.request("POST", API_URL, headers=headers, data=data)
    return json.loads(response.content.decode("utf-8"))

In [66]:
b_dataset_test = dataset['validation'][:100]

b_responses = []
for question in tqdm(b_dataset_test['question']):
    response = query({
        "inputs": prompt + f"Question: {question}",
        "parameters": {
            "temperature": 0.7,
            "max_new_tokens": 1000,
            "frequency_penalty": 0,
            "presence_penalty": 0
        }
    })

    b_responses.append(response[0]['generated_text'])

100%|██████████| 100/100 [00:04<00:00, 21.11it/s]


In [67]:
# Print out last line of each response
for r in b_responses[:10]:
    print(r.splitlines()[-1])

Question: is the first wives club based on a true story False
Question: is the tv show the 100 still on False
Question: is a dogfish part of the shark family False
Question: does maple syrup come out of the tree sweet False
Question: is there a new kung fu panda coming out False
Question: was a real lykan used in furious 7 False
Question: is anne with an e filmed on pei False
Question: is there a state where the drinking age is 18 False
Question: is there such thing as a blue ruby False
Question: can hydrogen be used in a combustion engine False


In [68]:
answers = [r.endswith("True") for r in b_responses]

# Calculate accuracy
accuracy = sum([a == b for a, b in zip(answers, dataset_test['answer'])]) / len(answers)

print(f"Bloomz Accuracy: {accuracy}")

Bloomz Accuracy: 0.11
