In [1]:
import os
cache_dir = '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/transformers_cache'
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['HF_HOME']=cache_dir
os.environ['HF_HUB_CACHE']=cache_dir+'/hub'

In [2]:
import torch
import json
from transformers import AutoTokenizer

from datasets import load_from_disk





In [4]:
# Loading model
hf_token = os.getenv("hf_token")
hf_name='Llama-3.3-70B-Instruct'
model_name= f"meta-llama/{hf_name}"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, cache_dir=cache_dir)
tokenizer.pad_token_id = tokenizer.eos_token_id



In [20]:
def get_prompt(ex, prompt_template, task_prompt, few_shot, few_shot_examples, input_col, output_col, n=3):
    prompt=[
        {
            'role':'system',
            'content':prompt_template['system_msg']
        }
    ]
    if few_shot:
        for idx in range(n):
            prompt.extend([
                {
                    'role':'user',
                    'content':prompt_template['user_msg'].format(instruction=task_prompt, question=few_shot_examples[idx][input_col])
                },
                {
                    'role':'assistant',
                    'content':prompt_template['assistant_msg'].format(response=few_shot_examples[idx][output_col], rationale=few_shot_examples[idx]['rationale'])
                }
                                                                      
            ])
        
    prompt.append(
        {
            'role':'user',
            'content':prompt_template['user_msg'].format(instruction=task_prompt, question=ex)
        }
    )
    
    return prompt

def tokenize_function(example,input_col, output_col, prompt_template, task_prompt, few_shot, few_shot_examples, n=3):
    prompt= get_prompt(example[input_col], prompt_template, task_prompt, few_shot, few_shot_examples, input_col, output_col, n)
    prompt= tokenizer.apply_chat_template(prompt,  tokenize= False, add_generation_prompt=True)
    return {'input_ids': {'prompt_token_ids':tokenizer(prompt, add_special_tokens=False)['input_ids']}}


In [21]:
def tokenize_data(task_name, split, input_col, output_col, prompt_template, task_prompt, few_shot, few_shot_examples, n):
    data_path= f'../datasets/{task_name}'
    data = load_from_disk(f"{data_path}/raw/{split}/")
    tokenized_dataset = data.map(lambda x: tokenize_function(x, input_col, output_col, prompt_template, task_prompt, few_shot, few_shot_examples, n), batched=False)
    output_path=f"{data_path}/tokenized/{hf_name}/{split}/{n}-shot/"
    print(output_path)
    # tokenized_dataset.save_to_disk(output_path)
    return

In [27]:
input_col='question'
output_col='answer'
for task in ['gsm8k']:
    task_prompt_path=f'../prompts/{task}.json'
    with open(task_prompt_path) as fp:
        task_prompt = json.load(fp)
    prompt_template={
        'system_msg':task_prompt['system_msg'],
        'user_msg':task_prompt['user_msg'],
        'assistant_msg':task_prompt['assistant_msg']
    }
    for split in ['val','test',]:
    # for split in ['feedback-100','feedback-400','feedback-1600']:
        for few_shot in [True]:
            if few_shot:
                for n in [8]:
                    tokenize_data(task, split, input_col, output_col, prompt_template, task_prompt['task_prompt'], few_shot, task_prompt['few_shot'], n)
            else:
                tokenize_data(task, split, input_col, output_col, prompt_template, task_prompt['task_prompt'], few_shot, task_prompt['few_shot'], 0)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Given the following problem, reason and give a final answer to the problem.
Problem: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
Your response should end with "The final answer is [answer]" where [answer] is the response to the problem.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The final answer is 6<|eot_id|><|start_header_id|>user<|end_header_id|>

Given the following problem, reason and give a final answer to the problem.
Problem: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
Your response 

NameError: name 'K' is not defined

In [17]:
# Loading data
task='gsm8k'
input_column='input'
data_path= f'../datasets/{task}'
split='test'
data = load_from_disk(f"{data_path}/raw/{split}/")
data

Dataset({
    features: ['question', 'answer'],
    num_rows: 1319
})

In [8]:
task_prompt_path=f'../prompts/{task}.json'
with open(task_prompt_path) as fp:
    task_prompt = json.load(fp)

prompt_template={
    'system_msg':task_prompt['system_msg'],
    'user_msg':task_prompt['user_msg'],
    'assistant_msg':task_prompt['assistant_msg']
}

In [9]:
task_prompt

{'task_prompt': 'Correct grammatical errors in the text by first providing a response, followed by an explanation. Please use this template for the explanation: "The word X should be deleted/inserted/replaced by Y because ..."',
 'task_prompt1': 'Given an input text, the goal is to detect and correct grammatical errors in the text. First explain your reasoning by describing the grammatical errors and how to fix them and then, provide the corrected text.\n\nYour response should end with "The corrected text is: [answer]" where [answer] is the grammatically correct version of the input text.',
 'system_msg': '',
 'user_msg': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{question}\n\n### Response:\n',
 'assistant_msg': '{rationale} The corrected text is {response}\n',
 'few_shot': [{'id': '8778',
   'input': 'way to move from pl

In [10]:
print(prompt_template['assistant_msg'].format(response='0',rationale='0'))

0 The corrected text is 0



In [11]:
input_column='input'
few_shot=False


In [12]:
prompt=get_prompt(data['input'][0], prompt_template, task_prompt['task_prompt'], few_shot, task_prompt['few_shot'], n=2)
prompt

[{'role': 'system', 'content': ''},
 {'role': 'user',
  'content': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nCorrect grammatical errors in the text by first providing a response, followed by an explanation. Please use this template for the explanation: "The word X should be deleted/inserted/replaced by Y because ..."\n\n### Input:\nKeeping the Secret of Genetic Testing\n\n### Response:\n'}]

In [13]:
prompt=tokenizer.apply_chat_template(prompt,  tokenize= False, add_generation_prompt=True)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Correct grammatical errors in the text by first providing a response, followed by an explanation. Please use this template for the explanation: "The word X should be deleted/inserted/replaced by Y because ..."

### Input:
Keeping the Secret of Genetic Testing

### Response:<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [14]:
prompt=tokenizer.decode(tokenizer(prompt)['input_ids'],skip_special_tokens=True)
print(prompt)

system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Correct grammatical errors in the text by first providing a response, followed by an explanation. Please use this template for the explanation: "The word X should be deleted/inserted/replaced by Y because..."

### Input:
Keeping the Secret of Genetic Testing

### Response:assistant


