### STUDENT

In [1]:
import os
# Point every Hugging Face cache variable at one shared directory. This cell
# runs before `datasets`/`transformers` are imported in the next cell.
cache_dir = '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/transformers_cache'
os.environ.update({
    'TRANSFORMERS_CACHE': cache_dir,
    'HF_HOME': cache_dir,
    'HF_HUB_CACHE': f'{cache_dir}/hub',
})

In [2]:
import gc
import torch
from tqdm import tqdm
from datasets import load_from_disk, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Student generations (exp-2.0.1, eval_1), read from a JSON file; the JSON
# loader exposes the file as the 'train' split.
data_student = load_dataset(
    'json',
    data_files='../outputs/exp-2.0.1/eval_1/generated_outputs.json',
    split='train',
)
data_student

Dataset({
    features: ['input', 'output', 'token_ids', 'log_probs', 'all_returned_log_probs', 'model_answer', 'GT_Answer', 'score'],
    num_rows: 1000
})

In [4]:
# First generated output of the first example (row access, then column).
data_student[0]['output'][0]

'To find out how many building blocks can fit into the box, we need to divide the volume of the box by the volume of a single building block.\n\nThe volume of the box is calculated by multiplying its height, width, and length:\nVolume of box = height * width * length = 8 * 10 * 12 = 960 cubic inches\n\nThe volume of a single building block is calculated by multiplying its height, width, and length:\nVolume of building block = height * width * length = 3 * 2 * 4 = 24 cubic inches\n\nNow, we divide the volume of the box by the volume of a single building block to find out how many blocks can fit:\nNumber of blocks = Volume of box / Volume of building block = 960 / 24 = 40\n\nThe final answer is 40.'

In [5]:
# Pre-tokenized prompts saved on disk; each row carries its prompt token ids
# under `input_ids`.
data_path = "../datasets/gsm8k/tokenized/LLaMA3B-Instruct/feedback/zero-shot/"
data_tokenized = load_from_disk(dataset_path=data_path)
data_tokenized

Dataset({
    features: ['question', 'answer', 'input_ids'],
    num_rows: 1000
})

In [6]:
# Question / reference-answer pairs; `answer` supplies the ground-truth reasoning.
data_path = "../datasets/gsm8k/feedback/"
data_gt = load_from_disk(dataset_path=data_path)
data_gt

Dataset({
    features: ['question', 'answer'],
    num_rows: 1000
})

In [7]:
# Reference answer of the first example (row access, then column).
data_gt[0]['answer']

'The volume of the box is 8 x 10 x 12 = <<8*10*12=960>>960 cu in.\nThe volume of a wooden block is 3 x 2 x 4 = <<3*2*4=24>>24 cu in.\n960/24 = <<960/24=40>>40 wooden blocks can fit into the box.\n#### 40'

In [8]:
# Settings for the student model; they are consumed by the loading cell below.
model_name = "meta-llama/Llama-3.2-3B-Instruct"
padding, padding_side = 'longest', None  # None keeps the tokenizer's own padding side
special_tokens = False
torch_dtype = 'bfloat16'
# Hub access token, read from the environment rather than hard-coded.
hf_token = os.environ.get('hf_token')

In [None]:
# Hub arguments shared by the tokenizer and the model download.
hub_kwargs = dict(token=hf_token, cache_dir=cache_dir)

tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_kwargs)
# Reuse the EOS id as the padding id.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Only override the padding side when one was configured.
if padding_side:
    tokenizer.padding_side = padding_side

device = 'cuda'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch_dtype,
    **hub_kwargs,
)
# Inference only: switch to evaluation mode.
model.eval()

add_special_tokens = {"add_special_tokens": special_tokens}

Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/transformers_cache/models--meta-llama--Llama-3.2-3B-Instruct/.no_exist/0cb88a4f764b7a12671c53f0838cd831a0843b95/adapter_config.json'


In [11]:
# Number of sequences scored per forward pass.
batch_size = 8
# Accumulator for one record per example, filled by the scoring loop.
all_outputs = list()

In [12]:
def score_continuations(model, prompt_ids, answer_ids, pad_token_id):
    """Log-probabilities `model` assigns to each answer token given its prompt.

    Args:
        model: causal LM whose forward pass returns an object with `.logits`
            of shape (batch, seq_len, vocab).
        prompt_ids: list of token-id lists, one prompt per example.
        answer_ids: list of token-id lists; entry k is the continuation to
            score after prompt k.
        pad_token_id: id used to right-pad shorter sequences in the batch.

    Returns:
        One list of floats per example; entry k holds
        log p(answer[t] | prompt, answer[:t]) for every answer token.

    Raises:
        ValueError: if a prompt is empty (the first answer token would have no
            preceding position to be predicted from).
    """
    if any(len(p) == 0 for p in prompt_ids):
        raise ValueError("every prompt must contain at least one token")

    sequences = [
        torch.tensor(list(p) + list(a), dtype=torch.long)
        for p, a in zip(prompt_ids, answer_ids)
    ]
    # Pad on the right explicitly: the slice arithmetic below relies on every
    # sequence starting at column 0, whatever the tokenizer's padding side is.
    input_ids = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=pad_token_id
    )
    lengths = torch.tensor([seq.shape[0] for seq in sequences])
    attention_mask = (
        torch.arange(input_ids.shape[1])[None, :] < lengths[:, None]
    ).long()
    input_ids = input_ids.to(model.device)
    attention_mask = attention_mask.to(model.device)

    # Scoring only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    scored = []
    for row, (p, a) in enumerate(zip(prompt_ids, answer_ids)):
        # The logit at position t predicts token t+1, so the first answer
        # token is predicted from the last prompt position.
        first = len(p) - 1
        last = first + len(a)
        # Normalise only the answer positions, in float32 for precision.
        step_log_probs = torch.log_softmax(logits[row, first:last].float(), dim=-1)
        targets = input_ids[row, first + 1:last + 1].to(step_log_probs.device)
        scored.append(step_log_probs.gather(1, targets[:, None]).squeeze(1).tolist())
    return scored


# The three datasets are joined purely by row index, so they must line up.
num_examples = data_student.num_rows
assert data_tokenized.num_rows == num_examples, "data_tokenized / data_student row mismatch"
assert data_gt.num_rows == num_examples, "data_gt / data_student row mismatch"

# Rebuilt from scratch so re-running this cell does not duplicate records.
all_outputs = []

for batch_start in tqdm(range(0, num_examples, batch_size)):
    batch_indices = range(batch_start, min(batch_start + batch_size, num_examples))

    # Row access (`ds[k]`) reads one example; column access (`ds['col'][k]`)
    # would materialise the whole column on every lookup.
    student_rows = [data_student[k] for k in batch_indices]
    prompt_ids = [data_tokenized[k]['input_ids']['prompt_token_ids'] for k in batch_indices]
    answer_ids = [row['token_ids'][0] for row in student_rows]

    batch_log_probs = score_continuations(
        model, prompt_ids, answer_ids, tokenizer.pad_token_id
    )

    for k, row, log_probs in zip(batch_indices, student_rows, batch_log_probs):
        all_outputs.append(
            {
                'prompt': row['input'],
                'gt_reasoning': data_gt[k]['answer'],
                'gt_answer': row['GT_Answer'],
                'student_token_ids': row['token_ids'][0],
                'student_reasoning': row['output'][0],
                'student_answer': row['model_answer'][0],
                'student_correctness': row['score'],
                'student_log_probs': log_probs,
            }
        )

    # Return cached GPU blocks between batches to limit fragmentation.
    torch.cuda.empty_cache()

  0%|          | 0/125 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 125/125 [03:45<00:00,  1.81s/it]


In [13]:
# Number of records collected by the student scoring loop.
len(all_outputs)

1000

In [14]:
import json
# Persist the student records so a later session can pick them up.
serialized = json.dumps(all_outputs, indent=4)
with open('logprobs.json', "w") as f:
    f.write(serialized)

### Teacher

In [16]:
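# NOTE: this cell and the three commented-out cells after it are disabled; the Teacher
# section reuses `os`, `cache_dir`, the imports, `data_student` and `data_tokenized`
# from the Student section above. Uncomment them to run this section on its own.
# import os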
# cache_dir = '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/transformers_cache'
# os.environ['TRANSFORMERS_CACHE'] = cache_dir
# os.environ['HF_HOME']=cache_dir
# os.environ['HF_HUB_CACHE']=cache_dir+'/hub'

In [17]:
# import json
# import gc
# import torch
# from tqdm import tqdm
# from datasets import load_from_disk, load_dataset
# from transformers import AutoTokenizer, AutoModelForCausalLM


In [18]:
# data_student = load_dataset('json',data_files='../outputs/exp-2.0.1/eval_1/generated_outputs.json')['train']
# data_student

In [19]:
# # Loading data
# data_path= "../datasets/gsm8k/tokenized/LLaMA3B-Instruct/feedback/zero-shot/"
# data_tokenized = load_from_disk(data_path)
# data_tokenized

In [9]:
# Teacher generations (exp-2.0.3, eval_1), read from a JSON file; the JSON
# loader exposes the file as the 'train' split.
data_teacher = load_dataset(
    'json',
    data_files='../outputs/exp-2.0.3/eval_1/generated_outputs.json',
    split='train',
)
data_teacher

Dataset({
    features: ['input', 'output', 'token_ids', 'log_probs', 'all_returned_log_probs', 'model_answer', 'GT_Answer', 'score'],
    num_rows: 1000
})

In [10]:
# Settings for the teacher model; they are consumed by the loading cell below.
model_name = "meta-llama/Llama-3.3-70B-Instruct"
padding, padding_side = 'longest', None  # None keeps the tokenizer's own padding side
special_tokens = False
torch_dtype = 'bfloat16'
# Hub access token, read from the environment rather than hard-coded.
hf_token = os.environ.get('hf_token')

In [11]:
# Hub arguments shared by the tokenizer and the model download.
hub_kwargs = dict(token=hf_token, cache_dir=cache_dir)

tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_kwargs)
# Reuse the EOS id as the padding id.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Only override the padding side when one was configured.
if padding_side:
    tokenizer.padding_side = padding_side

# Rebinds `model` to the teacher checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch_dtype,
    **hub_kwargs,
)
# Inference only: switch to evaluation mode.
model.eval()

add_special_tokens = {"add_special_tokens": special_tokens}

Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/transformers_cache/models--meta-llama--Llama-3.3-70B-Instruct/.no_exist/6f6073b423013f6a7d4d9f39144961bfbfbc386b/adapter_config.json'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/transformers_cache/models--meta-llama--Llama-3.3-70B-Instruct/.no_exist/6f6073b423013f6a7d4d9f39144961bfbfbc386b/adapter_config.json'
Loading checkpoint shards: 100%|██████████| 30/30 [00:28<00:00,  1.05it/s]


In [12]:
# Score one sequence per forward pass in the teacher loop below.
batch_size=1

In [13]:
import json
# Open and read the JSON file
# Reload the records written by the student pass.
with open('logprobs.json', 'r') as file:
    raw_json = file.read()
all_outputs = json.loads(raw_json)


In [14]:
# Number of records read back from logprobs.json.
len(all_outputs)

1000

In [15]:
def score_continuations(model, prompt_ids, answer_ids, pad_token_id):
    """Log-probabilities `model` assigns to each answer token given its prompt.

    Args:
        model: causal LM whose forward pass returns an object with `.logits`
            of shape (batch, seq_len, vocab).
        prompt_ids: list of token-id lists, one prompt per example.
        answer_ids: list of token-id lists; entry k is the continuation to
            score after prompt k.
        pad_token_id: id used to right-pad shorter sequences in the batch.

    Returns:
        One list of floats per example; entry k holds
        log p(answer[t] | prompt, answer[:t]) for every answer token.

    Raises:
        ValueError: if a prompt is empty (the first answer token would have no
            preceding position to be predicted from).
    """
    if any(len(p) == 0 for p in prompt_ids):
        raise ValueError("every prompt must contain at least one token")

    sequences = [
        torch.tensor(list(p) + list(a), dtype=torch.long)
        for p, a in zip(prompt_ids, answer_ids)
    ]
    # Pad on the right explicitly: the slice arithmetic below relies on every
    # sequence starting at column 0, whatever the tokenizer's padding side is.
    input_ids = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=pad_token_id
    )
    lengths = torch.tensor([seq.shape[0] for seq in sequences])
    attention_mask = (
        torch.arange(input_ids.shape[1])[None, :] < lengths[:, None]
    ).long()
    input_ids = input_ids.to(model.device)
    attention_mask = attention_mask.to(model.device)

    # Scoring only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    scored = []
    for row, (p, a) in enumerate(zip(prompt_ids, answer_ids)):
        # The logit at position t predicts token t+1, so the first answer
        # token is predicted from the last prompt position.
        first = len(p) - 1
        last = first + len(a)
        # Normalise only the answer positions, in float32 for precision.
        step_log_probs = torch.log_softmax(logits[row, first:last].float(), dim=-1)
        targets = input_ids[row, first + 1:last + 1].to(step_log_probs.device)
        scored.append(step_log_probs.gather(1, targets[:, None]).squeeze(1).tolist())
    return scored


# Records and datasets are joined purely by row index; verify once up front
# instead of re-checking a hard-coded length inside the loop.
num_examples = data_student.num_rows
assert len(all_outputs) == num_examples, "all_outputs / data_student length mismatch"
assert data_tokenized.num_rows == num_examples, "data_tokenized / data_student row mismatch"
assert data_teacher.num_rows == num_examples, "data_teacher / data_student row mismatch"

for batch_start in tqdm(range(0, num_examples, batch_size)):
    batch_indices = range(batch_start, min(batch_start + batch_size, num_examples))

    # The teacher scores the *student's* tokens, so prompts and answers are the
    # same ones used in the student pass. Row access avoids materialising
    # whole columns on every lookup.
    prompt_ids = [data_tokenized[k]['input_ids']['prompt_token_ids'] for k in batch_indices]
    answer_ids = [data_student[k]['token_ids'][0] for k in batch_indices]

    batch_log_probs = score_continuations(
        model, prompt_ids, answer_ids, tokenizer.pad_token_id
    )

    for k, answer, log_probs in zip(batch_indices, answer_ids, batch_log_probs):
        assert len(log_probs) == len(answer), f"example {k}: scored length mismatch"
        all_outputs[k]['teacher_log_probs'] = log_probs
        # Fix: take the teacher's own score; the original copied the student's
        # score here and never used `data_teacher`.
        all_outputs[k]['teacher_correctness'] = data_teacher[k]['score']

    # Return cached GPU blocks between batches to limit fragmentation.
    torch.cuda.empty_cache()

  0%|          | 0/1000 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 1000/1000 [09:57<00:00,  1.67it/s]


In [17]:
# Record count after the teacher fields were added.
len(all_outputs)

1000

In [18]:
# Sanity check: both models scored the same tokens, so each record's two
# log-prob lists must have equal length. Indices of offending records (or
# records missing either list) are printed; nothing is printed when all agree.
# An explicit comparison replaces the former bare `except`, which would also
# have swallowed unrelated errors.
length_mismatches = [
    idx
    for idx, record in enumerate(all_outputs)
    if 'student_log_probs' not in record
    or 'teacher_log_probs' not in record
    or len(record['student_log_probs']) != len(record['teacher_log_probs'])
]
for idx in length_mismatches:
    print(idx)

In [21]:
import json
# Overwrite the file with the records that now also carry the teacher fields.
serialized = json.dumps(all_outputs, indent=4)
with open('logprobs.json', "w") as f:
    f.write(serialized)

In [25]:
# Fields stored on the first record.
all_outputs[0].keys()

dict_keys(['prompt', 'gt_reasoning', 'gt_answer', 'student_token_ids', 'student_reasoning', 'student_answer', 'student_correctness', 'student_log_probs', 'teacher_log_probs', 'teacher_correctness'])