# Exported from Google Colab

# Imports and Library Installs

In [1]:
import csv
import itertools
import json
from tqdm import tqdm
import os
import ast
import gc
import numpy as np
import pandas as pd
import traceback

import google.colab.output
import google.colab.userdata

In [2]:
pip_output = !pip install minicons@git+https://github.com/JesseTNRoberts/minicons_modded  \
                          accelerate \
                          transformers>=4.43 --upgrade                          # Can remove once Colab default library updates
err_lines = [ln for ln in pip_output if 'error' in ln.lower()]

google.colab.output.clear()

if err_lines:
  print(*err_lines, sep='\n')
else:
  print('Libraries successfully installed')

Libraries successfully installed


In [3]:
import torch
from transformers import AutoModelForMaskedLM, AutoModelForCausalLM
from torch.utils.data import DataLoader

from minicons import scorer

# Mount Drive (optional if already mounted)

In [4]:
# Mount Google Drive at /content/drive; the experiment base directory used
# later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Model Loading and Inference

This section implements every function in the experiment pipeline. All prompt preprocessing, including the APriCoT-specific preprocessing, happens in `run_all_experiments`. The experiments themselves, including the APriCoT implementation, are carried out by `run_cot_experiment`.

In [5]:
def load_model(model_name : str, causal : bool = True, device : str = 'cuda', verbose=True, token=None):
    """Load a minicons scorer for ``model_name``, with one fallback attempt.

    The first attempt passes ``low_cpu_mem_usage=True`` and
    ``device_map="auto"``; if that fails for any reason, loading is retried
    without those two options.

    Args:
        model_name: Model identifier handed to the minicons scorer.
        causal: If True use ``scorer.IncrementalLMScorer``, otherwise
            ``scorer.MaskedLMScorer``.
        device: Device string forwarded to the scorer.
        verbose: Print progress / warning / error messages when True.
        token: Access token forwarded to the scorer (e.g. for gated models).

    Returns:
        The constructed scorer object.

    Raises:
        Exception: Whatever the fallback attempt raised, if both attempts fail.
    """
    load_func = scorer.IncrementalLMScorer if causal else scorer.MaskedLMScorer

    try:
        transformer = load_func(model_name,
                                device=device,
                                torch_dtype=torch.float16,
                                low_cpu_mem_usage=True,
                                device_map="auto",
                                token=token
                               )
    # `except Exception` (not a bare `except`) so that KeyboardInterrupt and
    # SystemExit abort immediately instead of triggering a second model load.
    except Exception:
        if verbose:
            print(f'[WARN] Failed to load model: {model_name}. Retrying with alternate loading procedure.')
        try:
            transformer = load_func(model_name,
                                    device=device,
                                    torch_dtype=torch.float16,
                                    token=token
                                   )
        except Exception:
            if verbose:
                print(f'[ERROR] Failed to load model: {model_name}')
            # Bare raise keeps the original traceback intact.
            raise

        if verbose:
            print(f'Successfully loaded model (Alt. Procedure): {model_name}')
    else:
        if verbose:
            print(f'Successfully loaded model: {model_name}')

    return transformer

In [7]:
def _generate_cot(transformer, prompt_batch, max_cot_len):
    """Continue each prompt in ``prompt_batch`` by up to ``max_cot_len`` new tokens.

    Returns the decoded sequences (prompt plus continuation) with the decoded
    EOS token text stripped out.
    """
    eos_token_id = transformer.tokenizer.eos_token_id
    pad_token_id = transformer.tokenizer.pad_token_id

    tokenized = transformer.tokenizer(prompt_batch, return_tensors="pt").input_ids
    tokenized = tokenized.to(transformer.device)
    cot_tokens = transformer.model.generate(
        tokenized,
        eos_token_id=eos_token_id,
        pad_token_id=pad_token_id,
        max_new_tokens=max_cot_len,
    )

    decoded = transformer.tokenizer.batch_decode(cot_tokens)
    eos_text = transformer.tokenizer.decode([eos_token_id])
    return [text.replace(eos_text, '') for text in decoded]


def _score_option(transformer, contexts, opt, query_prompt, canary, cloze, agg_function):
    """Score one answer option against every context and aggregate the scores.

    The query is appended to fresh copies of ``contexts``; the caller's list is
    never modified, so the same contexts can be reused for the next option.
    """
    if cloze:
        # Cloze mode: the query is option-independent and the option letter
        # itself is the target handed to the scorer.
        scored_prompts = [ctx + query_prompt for ctx in contexts]
        option_probs = transformer.cloze_score(scored_prompts, [opt] * len(scored_prompts))
    else:
        # Default mode: the query names the option and the fixed `canary`
        # string is the target handed to the scorer.
        query = query_prompt.replace('{cand_ans}', opt)
        scored_prompts = [ctx + query for ctx in contexts]
        option_probs = transformer.conditional_score(
            scored_prompts,
            [canary] * len(scored_prompts),
            reduction=lambda x: x.mean(0).item(),
        )

    return {
        'probs': option_probs,
        'agg_prob': agg_function(option_probs),
    }


def run_cot_experiment(transformer,
                       shared_prompt,
                       options,
                       query_prompt,
                       cot_init = " Let's think step by step. ",
                       canary = None,
                       max_cot_len = 1024,
                       cloze = False,
                       return_cot = False,
                       choice_iterations = 1,
                       agg_function = lambda x: np.exp2(x).mean(),
                       primed = True,
                      ):
    """Score every answer option for one question, optionally after a generated CoT.

    Unprimed (``primed=False``): ``shared_prompt + cot_init`` is repeated
    ``choice_iterations`` times, one batch of continuations is generated (when
    ``max_cot_len > 0``), and every option is scored against that same batch.

    Primed (``primed=True``): ``'{cand_ans}'`` in ``shared_prompt`` is replaced
    by each option in turn, and a separate batch of continuations is generated
    and scored for each option.

    Args:
        transformer: Scorer object exposing ``tokenizer``, ``model``, ``device``
            and ``conditional_score`` / ``cloze_score``.
        shared_prompt: Question text; may contain ``'{cand_ans}'`` when primed.
        options: Iterable of option labels (e.g. ``['A', 'B', 'C', 'D']``).
        query_prompt: Text appended before scoring. Outside cloze mode,
            ``'{cand_ans}'`` in it is replaced by the option being scored.
        cot_init: Text appended to the prompt before generation.
        canary: Target string scored in non-cloze mode (ignored in cloze mode,
            where the option label is the target).
        max_cot_len: Maximum number of generated tokens; ``<= 0`` skips generation.
        cloze: Use ``transformer.cloze_score`` instead of ``conditional_score``.
        return_cot: Include the generated texts under ``'cot'`` when generation ran.
        choice_iterations: Number of copies of the prompt per batch.
        agg_function: Maps the list of per-copy scores to a single number.
        primed: Select the primed (per-option CoT) or unprimed (shared CoT) flow.

    Returns:
        dict mapping each option to ``{'probs': ..., 'agg_prob': ...}`` plus
        ``'cot'`` when requested and available.
    """
    outputs = {}

    if not primed:
        contexts = [shared_prompt + cot_init] * choice_iterations

        # One shared batch of continuations for all options.
        cot_outputs = None
        if max_cot_len > 0:
            cot_outputs = _generate_cot(transformer, contexts, max_cot_len)
            contexts = cot_outputs

        for opt in options:
            # Each option is scored against the *same* contexts. Previously the
            # query was appended to the running prompt list in place, so every
            # option after the first was scored on a prompt that still carried
            # the earlier options' queries.
            outputs[opt] = _score_option(transformer, contexts, opt, query_prompt,
                                         canary, cloze, agg_function)
            # Guard on cot_outputs so return_cot=True with max_cot_len <= 0 does
            # not fail on an undefined variable (matches the primed branch).
            if return_cot and cot_outputs is not None:
                outputs[opt]['cot'] = cot_outputs
            torch.cuda.empty_cache()
            gc.collect()

    else:
        for opt in options:
            primed_prompt = shared_prompt.replace('{cand_ans}', opt) + cot_init
            contexts = [primed_prompt] * choice_iterations

            # Separate continuation batch per option (only one option is scored
            # per CoT when primed).
            cot_outputs = None
            if max_cot_len > 0:
                cot_outputs = _generate_cot(transformer, contexts, max_cot_len)
                contexts = cot_outputs

            outputs[opt] = _score_option(transformer, contexts, opt, query_prompt,
                                         canary, cloze, agg_function)
            if return_cot and cot_outputs is not None:
                outputs[opt]['cot'] = cot_outputs
            torch.cuda.empty_cache()
            gc.collect()

    return outputs

In [8]:
def _build_cot_prompts(max_cot_len, primed, cloze):
    """Return ``(question_template, query_prompt)`` for one CoT configuration."""
    question_template = 'Below you will see a question and answer choices.\n{q_text}\n' + \
                            'choice A: {ansA}\nchoice B: {ansB}\nchoice C: {ansC}\nchoice D: {ansD}.'
    uses_cot = max_cot_len > 0

    if uses_cot:
        if primed:
            # APriCoT-specific preprocessing. The doubled braces survive
            # str.format() as a literal '{cand_ans}' placeholder, which
            # run_cot_experiment later replaces with each option.
            question_template += '\nLet\'s evaluate choice {{cand_ans}} step by step.'
        else:
            question_template += '\nLet\'s think step by step.'

    if uses_cot:
        if cloze:
            query_prompt = '\nIn conclusion, which choice do you believe is most correct?\nI believe the correct answer is choice '
        else:
            query_prompt = '\nIn conclusion, do you believe choice {cand_ans} is most correct? '
    else:
        if cloze:
            query_prompt = '\nWhich choice do you believe is most correct?\nI believe the correct answer is choice '
        else:
            query_prompt = '\nDo you believe choice {cand_ans} is most correct? '

    return question_template, query_prompt


def run_all_experiments(models, exp_base_dir, experiments, device, debug_EE, mode, additional_label = '', cloze=False, primed=True, max_cot_len=1024, **kwargs):
    """Run the question-answering experiments for every model and save the results.

    For each model, every experiment file is read as a header-less CSV with six
    columns (question, four answers, correct label); each question is passed to
    ``run_cot_experiment`` and the accumulated results are written to
    ``{exp_base_dir}/{save_name}_{additional_label}_cot_results.json`` after
    each experiment file.

    Args:
        models: Iterable of ``(model_name, lm_type, save_name)``; ``lm_type``
            equal to ``"incremental"`` selects the causal scorer.
        exp_base_dir: Base directory for input CSVs and output JSON.
        experiments: Iterable of
            ``(rel_dir, file_name, dataset, subject, include_q_text, return_cot)``.
        device: Device string forwarded to ``load_model``.
        debug_EE: Debug switch: only the first question of the first experiment
            of the first model is run.
        mode: ``'cot'`` is the only implemented mode.
        additional_label: Label embedded in the output file name.
        cloze: Use cloze-style querying.
        primed: Use option-primed (APriCoT) prompts.
        max_cot_len: Maximum generated CoT length; ``<= 0`` disables generation.
        **kwargs: Accepted for call-site compatibility; not used.

    Returns:
        None. Any exception raised while running a model is printed as a
        traceback and ends the run early.

    Note:
        Reads the module-level ``my_token`` for model access.
    """
    for model, lm_type, save_name in models:
        print(f'Running experiments for model {save_name}')
        transformer = load_model(model,
                                causal=lm_type == "incremental",
                                device=device,
                                token=my_token)
        try:
            if mode == 'default':
                raise NotImplementedError

            elif mode == 'cot':
                question_template, query_prompt = _build_cot_prompts(max_cot_len, primed, cloze)

                options = ['A', 'B', 'C', 'D']
                canary = 'Yes'

                results = {}
                for (rel_dir, file_name, ds, subject, include_q_text, return_cot) in tqdm(experiments, desc=save_name, position=1):
                    df = pd.read_csv(f'{exp_base_dir}/{rel_dir}/{file_name}', header=None, keep_default_na=False)

                    if debug_EE:
                        questions = df.values.tolist()[:10]
                    else:
                        questions = df.values.tolist()

                    if ds not in results:
                        results[ds] = {}

                    outputs = {}
                    for i, [q_text, ansA, ansB, ansC, ansD, corr] in tqdm(enumerate(questions), desc=subject, position=0, total=len(questions)):
                        outputs[i] = {'correct_answer': corr}
                        if include_q_text:
                            outputs[i]['question_text'] = q_text
                            outputs[i]['answer_A_text'] = ansA
                            outputs[i]['answer_B_text'] = ansB
                            outputs[i]['answer_C_text'] = ansC
                            outputs[i]['answer_D_text'] = ansD

                        question = question_template.format(q_text=q_text, ansA=ansA, ansB=ansB, ansC=ansC, ansD=ansD)

                        outputs[i]['results'] = run_cot_experiment(transformer,
                                                                shared_prompt=question,
                                                                options=options,
                                                                query_prompt=query_prompt,
                                                                cot_init='',
                                                                canary=canary,
                                                                max_cot_len=max_cot_len,
                                                                return_cot=return_cot,
                                                                choice_iterations=10,
                                                                cloze=cloze,
                                                                primed=primed)
                        torch.cuda.empty_cache()
                        gc.collect()
                        if debug_EE:
                            break

                    results[ds][subject] = outputs

                    # Save after every subject to allow some breathing room for failures
                    with open(f'{exp_base_dir}/{save_name}_{additional_label}_cot_results.json', 'w') as f:
                        json.dump(results, f)

                    if debug_EE:
                        break

            else:
                # Previously an unrecognised mode loaded the model and then
                # silently did nothing; surface the mistake instead (it is
                # reported by the handler below, like any other failure).
                raise ValueError(f'Unsupported mode: {mode!r}')

            if debug_EE:
                break


        except Exception:
            print(traceback.format_exc())
            return

        finally:
            # Runs on normal completion, on `break`, and on the early `return`
            # above, so the model is always released before moving on.
            del transformer
            torch.cuda.empty_cache()
            gc.collect()

In [9]:
def run_br_experiments(models, exp_base_dir, experiments, device, additional_label = '', cloze=False):
    """Measure option scores on a content-free prompt for every label ordering.

    For each model, a placeholder question whose four answers are all the word
    "choice" is built for every permutation of the labels A-D and scored with
    ``run_cot_experiment`` (no CoT generation, unprimed, 10 prompt copies).
    Once all permutations are done, the results are written to
    ``{exp_base_dir}/{save_name}_{additional_label}_cot_results.json`` and
    printed.

    Args:
        models: Iterable of ``(model_name, lm_type, save_name)``.
        exp_base_dir: Directory the JSON result file is written to.
        experiments: Not used by this function.
        device: Device string forwarded to ``load_model``.
        additional_label: Label embedded in the output file name.
        cloze: Use cloze-style querying.

    Returns:
        None. An exception while processing a model is printed as a traceback
        and ends the run early.

    Note:
        Reads the module-level ``my_token`` for model access.
    """
    answer_labels = ['A', 'B', 'C', 'D']
    target = 'Yes'

    for model, lm_type, save_name in models:
        print(f'Running experiments for model {save_name}')
        transformer = load_model(model,
                                 causal=lm_type == "incremental",
                                 device=device,
                                 token=my_token)

        try:
            query_prompt = (
                '\nIn conclusion, which choice do you believe is most correct?\nI believe the correct answer is choice '
                if cloze else
                '\nIn conclusion, do you believe choice {cand_ans} is most correct? '
            )

            per_ordering = {}
            results = {'base_rate': per_ordering}

            for lab1, lab2, lab3, lab4 in tqdm(itertools.permutations('ABCD')):
                ordering = lab1 + lab2 + lab3 + lab4
                question = f'Below you will see a question and answer choices.\nSelect an answer choice\nchoice {lab1}: choice\nchoice {lab2}: choice\nchoice {lab3}: choice\nchoice {lab4}: choice.'

                scores = run_cot_experiment(transformer,
                                            shared_prompt=question,
                                            options=answer_labels,
                                            query_prompt=query_prompt,
                                            cot_init='',
                                            canary=target,
                                            max_cot_len=0,
                                            return_cot=False,
                                            choice_iterations=10,
                                            cloze=cloze,
                                            primed=False)
                per_ordering[ordering] = {'results': scores}

                torch.cuda.empty_cache()
                gc.collect()

            # Written once per model, after every permutation has been scored.
            out_path = f'{exp_base_dir}/{save_name}_{additional_label}_cot_results.json'
            with open(out_path, 'w') as f:
                json.dump(results, f)

            print(results)

        except Exception:
            print(traceback.format_exc())
            return

        finally:
            # Always release the model, including on the early return above.
            del transformer
            torch.cuda.empty_cache()
            gc.collect()

# Experiment Execution

## Load Token(s)

In [10]:
# Loads a Hugging Face token from Colab secrets (necessary for LLaMa models,
# among others). The token must be stored in secrets under a name that matches
# the argument below ('hf_token').
# run_all_experiments and run_br_experiments read this module-level name when
# loading models, so this cell must run before either of them is called.
my_token = google.colab.userdata.get('hf_token')

## Main Cells

All experiments are initiated from these cells.

In [None]:
# Add any desired models here. Format: (huggingface_name, incremental/masked, model_save_name)
# NOTE: experiment pipeline currently not tested for masked models.
models = [
    ('meta-llama/Meta-Llama-3.1-8B', 'incremental', 'LLaMa3.1-8B',),
]

# Experiment base directory: change to the desired location. Result JSON files
# are written here, and the MMLU question sets are read from beneath it.
# NOTE(review): this comment used to name a ./MMLU_src_data/orig folder, but the
# experiment list built later in this cell looks for files under
# <ebd>/datasets/<dataset>/ -- confirm which layout is current.
ebd = '/content/drive/MyDrive/Research/MMLU_cot'


# 'abstract_algebra',                         #100
# 'anatomy',                                  #135
# 'astronomy',                                #152
# 'business_ethics',                          #100
# 'clinical_knowledge',                       #265
# 'college_biology',                          #144
# 'college_chemistry',                        #100
# 'college_computer_science',                 #100
# 'college_mathematics',                      #100
# 'college_medicine',                         #173
# 'college_physics',                          #102
# 'computer_security',                        #100
# 'conceptual_physics',                       #235
# 'econometrics',                             #114
# 'electrical_engineering',                   #145
# 'elementary_mathematics',                   #378
# 'formal_logic',                             #126
# 'global_facts',                             #100
# 'high_school_biology',                      #310
# 'high_school_chemistry',                    #203
# 'high_school_computer_science',             #100
# 'high_school_european_history',             #165
# 'high_school_geography',                    #198
# 'high_school_government_and_politics',      #193
# 'high_school_macroeconomics',               #390
# 'high_school_mathematics',                  #270
# 'high_school_microeconomics',               #238
# 'high_school_physics',                      #151
# 'high_school_psychology',                   #545
# 'high_school_statistics',                   #216
# 'high_school_us_history',                   #204
# 'high_school_world_history',                #237
# 'human_aging',                              #223
# 'human_sexuality',                          #131
# 'international_law',                        #121
# 'jurisprudence',                            #108
# 'logical_fallacies',                        #163
# 'machine_learning',                         #112
# 'management',                               #103
# 'marketing',                                #234
# 'medical_genetics',                         #100
# 'miscellaneous',                            #783
# 'moral_disputes',                           #346
# 'moral_scenarios',                          #895
# 'nutrition',                                #306
# 'philosophy',                               #311
# 'prehistory',                               #324
# 'professional_accounting',                  #282
# 'professional_law',                         #1534
# 'professional_medicine',                    #272
# 'professional_psychology',                  #612
# 'public_relations',                         #110
# 'security_studies',                         #245
# 'sociology',                                #201
# 'us_foreign_policy',                        #100
# 'world_religions',                          #171
# 'virology',                                 #166

experiment_details = []

# The MMLU subjects are partitioned into 10 groups of roughly equal total
# question count, so that groups can be split across separate Colab instances
# and run in parallel if desired/needed. The number on each group is the
# original author's tally of questions in that group.
selected_subjects = [
    # group 0 -- 1534
    [
        'professional_law',
    ],
    # group 1 -- 1477
    [
        'college_chemistry',
        'security_studies',
        'logical_fallacies',
        'machine_learning',
        'public_relations',
        'jurisprudence',
        'management',
        'college_physics',
        'college_computer_science',
        'college_mathematics',
        'world_religions',
    ],
    # group 2 -- 1395
    [
        'miscellaneous',
        'professional_psychology',
    ],
    # group 3 -- 1393
    [
        'high_school_us_history',
        'high_school_chemistry',
        'sociology',
        'high_school_geography',
        'high_school_government_and_politics',
        'college_medicine',
        'international_law',
        'business_ethics',
    ],
    # group 4 -- 1390
    [
        'professional_accounting',
        'professional_medicine',
        'high_school_mathematics',
        'clinical_knowledge',
        'computer_security',
        'global_facts',
        'high_school_computer_science',
    ],
    # group 5 -- 1385
    [
        'moral_scenarios',
        'high_school_macroeconomics',
        'abstract_algebra',
    ],
    # group 6 -- 1384
    [
        'virology',
        'high_school_european_history',
        'astronomy',
        'high_school_physics',
        'electrical_engineering',
        'college_biology',
        'anatomy',
        'formal_logic',
        'medical_genetics',
        'us_foreign_policy',
    ],
    # group 7 -- 1383
    [
        'high_school_psychology',
        'elementary_mathematics',
        'moral_disputes',
        'econometrics',
    ],
    # group 8 -- 1383
    [
        'high_school_microeconomics',
        'high_school_world_history',
        'conceptual_physics',
        'marketing',
        'human_aging',
        'high_school_statistics',
    ],
    # group 9 -- 1382
    [
        'prehistory',
        'philosophy',
        'high_school_biology',
        'nutrition',
        'human_sexuality',
    ],
]

# Individual names for each group, bound to the same list objects as above.
(subjects0, subjects1, subjects2, subjects3, subjects4,
 subjects5, subjects6, subjects7, subjects8, subjects9) = selected_subjects

datasets = ['orig']

max_cot_len = 100

# Flatten the selected groups into a single ordered list of subjects.
subjects_set = [subject for group in selected_subjects for subject in group]

for ds in datasets:
    for sub in subjects_set:
        # Tuple layout unpacked by run_all_experiments:
        # (relative dir, file_name, dataset, subject, include_q_text, return_cot)
        experiment_details.append((f'datasets/{ds}', f'{sub}_test.csv', ds, sub, False, False))

# Fail fast if any question file is missing, before any model is loaded.
# (Loop variable is `rel_dir`, not `dir`, so the builtin `dir` is not shadowed
# for the rest of the notebook session.)
for rel_dir, file_name, _, _, _, _ in experiment_details:
    file_loc = f'{ebd}/{rel_dir}/{file_name}'
    if not os.path.isfile(file_loc):
        raise FileNotFoundError(file_loc)

device = 'cuda'
debug_EE = False


'''
    APriCoT applies whenever primed = True and max_cot_len > 0.
    standard CoT: primed = False, max_cot_len > 0
    no CoT: primed = False, max_cot_len <= 0
    undefined when primed = True and max_cot_len <= 0

    WARNING: cloze test functionality is implemented but not optimized. Memory footprint may be non-viable, even for no_cot experiment.
'''
# One row per configuration, executed in order:
# (additional_label, primed, max_cot_len)
run_configs = (
    ('no_cot',  False, 0),
    ('cf_cot',  False, max_cot_len),
    ('apricot', True,  max_cot_len),
)
for run_label, use_priming, cot_budget in run_configs:
    run_all_experiments(models, ebd, experiment_details, device, debug_EE, 'cot',
                        additional_label=run_label,
                        cloze=False,
                        primed=use_priming,
                        max_cot_len=cot_budget)


## Base-rate Probability Measurement

In [None]:
# Add any desired models here. Format: (huggingface_name, incremental/masked, model_save_name)
# NOTE: experiment pipeline currently not tested for masked models.
models = [
    ('meta-llama/Meta-Llama-3.1-8B', 'incremental', 'LLaMa3.1-8B',),
]

# Experiment base directory: change to the desired location. Results are
# written here. The existence check below requires the file
#   <ebd>/datasets/orig/base_rate_test.csv
# whose contents are only the line:
#   "QUESTION", "CHOICE", "CHOICE", "CHOICE", "CHOICE", "A"
ebd = '/content/drive/MyDrive/Research/MMLU_cot'

datasets = ['orig']
subjects_set = ['base_rate']

# Start from an empty list so this cell works on a fresh runtime and does not
# inherit (and re-validate) entries appended by an earlier cell.
experiment_details = []
for ds in datasets:
    for sub in subjects_set:
        # (relative dir, file_name, dataset, subject, include_q_text, return_cot)
        experiment_details.append((f'datasets/{ds}', f'{sub}_test.csv', ds, sub, False, False))

# Fail fast if the expected file is missing. (`rel_dir` rather than `dir`, so
# the builtin `dir` is not shadowed for the rest of the notebook session.)
for rel_dir, file_name, _, _, _, _ in experiment_details:
    file_loc = f'{ebd}/{rel_dir}/{file_name}'
    if not os.path.isfile(file_loc):
        raise FileNotFoundError(file_loc)

device = 'cuda'
debug_EE = False

run_br_experiments(models, ebd, experiment_details, device, additional_label = 'debug_cloze_br', cloze=True)