In [2]:
import numpy as np
import pandas as pd 
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import re
import pickle
import time 
import os


In [3]:
splits = {'test': 'college_mathematics/test-00000-of-00001.parquet', 'validation': 'college_mathematics/validation-00000-of-00001.parquet', 'dev': 'college_mathematics/dev-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/cais/mmlu/" + splits["test"])

In [4]:
df.head()

Unnamed: 0,question,subject,choices,answer
0,Let k be the number of real solutions of the e...,college_mathematics,"[k = 0 and n = 1, k = 1 and n = 0, k = n = 1, ...",1
1,"Up to isomorphism, how many additive abelian g...",college_mathematics,"[0, 1, 2, 3]",3
2,Suppose P is the set of polynomials with coeff...,college_mathematics,"[n = 1 and r = 6, n = 1 and r = 7, n = 2 and r...",3
3,The shortest distance from the curve xy = 8 to...,college_mathematics,"[4, 8, 16, 2sqrt(2)]",0
4,"There are 25 suitcases, 5 of which are damaged...",college_mathematics,"[2/69, 1/30, 2/23, 12/125]",2


In [5]:
def create_prompts_cot(df):
    prompts = []
    answers = []
    for index, row in df.iterrows():
        prompt = "Choose the answer of the given question from the below options.\n"+row['question']+ "\n"+"{1:"+row['choices'][0]+"}" + "\n"+"{2:"+row['choices'][1]+"}" + "\n"+"{3:"+row['choices'][2]+"}"+ "\n"+"{4:"+row['choices'][3]+"}"+"\n" +"Think step by step."+"\n" +"answer: {"
        prompts.append(prompt)
        answers.append(str(row['answer']))
    return prompts, answers

In [6]:
prompts_cot, answers = create_prompts_zs(df)

In [7]:
for i in range(len(answers)):
    answers[i] = int(answers[i])

In [8]:
hf_token = "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"

In [9]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it",use_auth_token = hf_token)
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    use_auth_token = hf_token
)




tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [10]:
model = model.to('cuda')

In [12]:
prompts_cot[8]

'Choose the answer of the given question from the below options.\nLet V be a finite-dimensional real vector space and let P be a linear transformation of V such that P^2 = P. Which of the following must be true?\nI. P is invertible.\nII. P is diagonalizable.\nIII. P is either the identity transformation or the zero transformation.\n{1:None}\n{2:I only}\n{3:II only}\n{4:III only}\nThink step by step.\nanswer: {'

In [18]:
predictions_cot_gemma = []
found = 0
begin = time.time()
for i in range(len(prompts_cot)):
    print("---------------------------"+str(i)+"---------------------------")
    input_text = prompts_cot[i]
    input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
    outputs = model.generate(**input_ids,max_length = 200)
    response = tokenizer.decode(outputs[0])
    #print(response)
    match = re.search(r"answer: \{(\d+)", response)
    extracted_character = match.group(1) if match else '1'
    if(extracted_character in ['1','2','3','4']):
        extracted_number = int(extracted_character)-1
        found+=1
    else:
        extracted_number = 0
    predictions_cot_gemma.append(extracted_number)
end = time.time()
print(found)

---------------------------0---------------------------
---------------------------1---------------------------
---------------------------2---------------------------
---------------------------3---------------------------
---------------------------4---------------------------
---------------------------5---------------------------
---------------------------6---------------------------
---------------------------7---------------------------
---------------------------8---------------------------
---------------------------9---------------------------
---------------------------10---------------------------
---------------------------11---------------------------
---------------------------12---------------------------
---------------------------13---------------------------
---------------------------14---------------------------
---------------------------15---------------------------
---------------------------16---------------------------
---------------------------17------------

In [19]:
def calculate_accuracy(predictions, answers):
    valid_pairs = [(pred, ans) for pred, ans in zip(predictions, answers) if pred is not None]
    correct_predictions = sum(1 for pred, ans in valid_pairs if pred == ans)
    accuracy = correct_predictions / len(valid_pairs) if valid_pairs else 0
    return accuracy

In [20]:
filename = 'predictions_cot_gemma.pkl'
with open(filename, 'wb') as file:
    pickle.dump(predictions_cot_gemma, file)
with open(filename, 'rb') as file:
    predictions_cot_gemma = pickle.load(file)

In [21]:
print("Chain of thought prompting on Gemma")
print("Total inference time:",end-begin,"seconds")
print("Accuracy score:",100*calculate_accuracy(predictions_cot_gemma,answers),"%")

Chain of thought prompting on Gemma
Total inference time: 230.00375247001648 seconds
Accuracy score: 30.0 %
