In [1]:
import os
# Route every Hugging Face cache location to one shared directory.
# Set ahead of the Hugging Face imports that follow in the notebook.
cache_dir = '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/transformers_cache'
os.environ.update({
    'TRANSFORMERS_CACHE': cache_dir,
    'HF_HOME': cache_dir,
    'HF_HUB_CACHE': cache_dir + '/hub',
})

In [2]:
import torch
import json
from transformers import AutoTokenizer

from datasets import load_from_disk





In [3]:
# Load the tokenizer for the instruct checkpoint (only the tokenizer is
# instantiated in this cell, no model weights).
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Access token is read from the `hf_token` environment variable; None when unset.
hf_token = os.environ.get("hf_token")

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    token=hf_token,
)

# Reuse the end-of-sequence id as the padding id.
tokenizer.pad_token_id = tokenizer.eos_token_id



In [4]:
def get_prompt(ex, prompt_template, task_prompt, few_shot, few_shot_examples, n=3):
    """Assemble the chat-message list for one input.

    Args:
        ex: The input text placed in the final user turn.
        prompt_template: Dict providing 'system_msg', 'user_msg' and
            'assistant_msg'. 'user_msg' is formatted with `instruction` and
            `question`; 'assistant_msg' with `response` and `rationale`.
        task_prompt: Instruction text inserted into every user turn.
        few_shot: When truthy, demonstration turns are added before the
            final user turn.
        few_shot_examples: Indexable collection of dicts with 'input',
            'reference' and 'rationale' keys; only read when `few_shot` is
            truthy.
        n: Number of demonstrations taken from the front of
            `few_shot_examples`.

    Returns:
        A list of {'role', 'content'} dicts: one system message, then
        (user, assistant) pairs for each demonstration, then the user turn
        for `ex`.
    """
    def _user_turn(question):
        # Every user turn shares the same instruction; only the question varies.
        return {
            'role': 'user',
            'content': prompt_template['user_msg'].format(instruction=task_prompt, question=question),
        }

    messages = [{'role': 'system', 'content': prompt_template['system_msg']}]

    if few_shot:
        for shot_idx in range(n):
            shot = few_shot_examples[shot_idx]
            messages.append(_user_turn(shot['input']))
            messages.append({
                'role': 'assistant',
                'content': prompt_template['assistant_msg'].format(
                    response=shot['reference'], rationale=shot['rationale']
                ),
            })

    messages.append(_user_turn(ex))
    return messages

def tokenize_function(example,input_column, prompt_template, task_prompt, few_shot, few_shot_examples, n=3):
    """Turn one dataset row into prompt token ids.

    Builds the chat messages for `example[input_column]` with `get_prompt`,
    renders them to text with the module-level `tokenizer`'s chat template
    (generation prompt included), and tokenizes that text.

    Returns:
        {'input_ids': {'prompt_token_ids': <list of token ids>}}
    """
    messages = get_prompt(example[input_column], prompt_template, task_prompt, few_shot, few_shot_examples, n)
    chat_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The text is tokenized as-is: no extra special tokens are requested here.
    token_ids = tokenizer(chat_text, add_special_tokens=False)['input_ids']
    return {'input_ids': {'prompt_token_ids': token_ids}}


In [5]:
def tokenize_data(task_name, split, input_column, prompt_template, task_prompt, few_shot, few_shot_examples, n):
    """Tokenize one saved dataset split and write the result back to disk.

    Loads '../datasets/{task_name}/{split}/', applies `tokenize_function`
    to each row (unbatched), prints the destination path and saves the
    mapped dataset under
    '../datasets/{task_name}/tokenized/LLaMA8B-Instruct/{split}/{n}-shot/'.

    Returns:
        None.
    """
    data_path = f'../datasets/{task_name}'

    def _encode(row):
        return tokenize_function(row, input_column, prompt_template, task_prompt, few_shot, few_shot_examples, n)

    source = load_from_disk(f"{data_path}/{split}/")
    tokenized_dataset = source.map(_encode, batched=False)

    output_path = f"{data_path}/tokenized/LLaMA8B-Instruct/{split}/{n}-shot/"
    print(output_path)
    tokenized_dataset.save_to_disk(output_path)

In [6]:
# Tokenize every (task, split, shot-count) combination listed below.
input_column = 'text'
for task in ['gec']:
    # Task-specific instruction and few-shot examples.
    task_prompt_path = f'../prompts/{task}.json'
    with open(task_prompt_path) as fp:
        task_prompt = json.load(fp)
    # Message templates read from llama.json.
    prompt_template_path = '../prompts/llama.json'
    with open(prompt_template_path) as fp:
        prompt_template = json.load(fp)
    for split in ['feedback']:
        for few_shot in [False]:
            # Few-shot runs sweep the listed shot counts; zero-shot runs are saved as 0-shot.
            for n in ([3] if few_shot else [0]):
                tokenize_data(
                    task,
                    split,
                    input_column,
                    prompt_template,
                    task_prompt['task_prompt'],
                    few_shot,
                    task_prompt['few_shot'],
                    n,
                )

FileNotFoundError: [Errno 2] No such file or directory: '../prompts/gec.json'

In [6]:
# Load the 'val' split of the neutralization dataset from local disk.
task, split = 'neutralization', 'val'
input_column = 'input'
data_path = '../datasets/' + task
data = load_from_disk(data_path + '/' + split + '/')
data  # last expression: display the dataset summary

Dataset({
    features: ['id', 'src_tok', 'tgt_tok', 'input', 'edits', 'src_POS_tags', 'tgt_parse_tags', 'wiki_id'],
    num_rows: 700
})

In [7]:
# Read the prompt JSON for the current task, then the templates in llama.json.
task_prompt_path = '../prompts/{}.json'.format(task)
with open(task_prompt_path) as fp:
    task_prompt = json.load(fp)

prompt_template_path = '../prompts/llama.json'
with open(prompt_template_path) as fp:
    prompt_template = json.load(fp)

In [8]:
# Display the loaded task prompt dict to inspect its keys and text.
task_prompt

{'task_prompt': 'Given an input sentence, reason and replace the word with subjective bias to a word with neutral point of view. Consider the following types of biases:\n1. framing biases use subjective words linked with a particular point of view (e.g. using words like best or deepest or using pilfered from instead of based on);\n2. epistemological biases are linguistic features that subtly (often via presupposition) modify the believability of a proposition;\n3. demographic biases are texts with presuppositions about particular genders, races, or other demographic categories (e.g. presupposing that all programmers are male).\n\nYour response should end with "The neutralized text is [answer]" where [answer] is the neutralized version of the input sentence.',
 'system_msg': 'You are an expert in removing subjective biases in texts.',
 'user_msg': 'Instruction:\n{instruction}\n\nInput:\n{question}',
 'assistant_msg': '{rationale} The neutralized text is  {response}\n',
 'few_shot': [{'i

In [9]:
# Render the assistant template with placeholder values to check its layout.
print(prompt_template['assistant_msg'].format(response='0',rationale='0'))

0 The neutralized text is  0



In [10]:
# Settings for the prompt-building demo below: source column and zero-shot mode.
input_column, few_shot = 'input', False


In [11]:
# Build the chat messages for the first row of the 'input' column.
# When few_shot is False, get_prompt skips its few-shot loop, so n=2 has no effect.
prompt=get_prompt(data['input'][0], prompt_template, task_prompt['task_prompt'], few_shot, task_prompt['few_shot'], n=2)
prompt

[{'role': 'system',
  'content': 'You are an expert in removing subjective biases in texts.'},
 {'role': 'user',
  'content': 'Instruction:\nGiven an input sentence, reason and replace the word with subjective bias to a word with neutral point of view. Consider the following types of biases:\n1. framing biases use subjective words linked with a particular point of view (e.g. using words like best or deepest or using pilfered from instead of based on);\n2. epistemological biases are linguistic features that subtly (often via presupposition) modify the believability of a proposition;\n3. demographic biases are texts with presuppositions about particular genders, races, or other demographic categories (e.g. presupposing that all programmers are male).\n\nYour response should end with "The neutralized text is [answer]" where [answer] is the neutralized version of the input sentence.\n\nInput:\nin addition to sponsoring palestinian terror attacks against israel (often through jordanian terr

In [23]:
# Render the message list to chat-formatted text (tokenize=False returns a string,
# not token ids); add_generation_prompt=True asks for the trailing assistant-turn header.
# NOTE(review): this rebinds `prompt` from the message list to a string, so the cell
# is not idempotent; rebuild `prompt` with get_prompt before re-running it.
prompt=tokenizer.apply_chat_template(prompt,  tokenize= False, add_generation_prompt=True)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are an expert in removing subjective biases in texts.<|eot_id|><|start_header_id|>user<|end_header_id|>

Instruction:
Given an input sentence, reason and replace the word with subjective bias to a word with neutral point of view. Consider the following types of biases:
1. framing biases use subjective words linked with a particular point of view (e.g. using words like best or deepest or using pilfered from instead of based on);
2. epistemological biases are linguistic features that subtly (often via presupposition) modify the believability of a proposition;
3. demographic biases are texts with presuppositions about particular genders, races, or other demographic categories (e.g. presupposing that all programmers are male).

Your response should end with "The neutralized text is [answer]" where [answer] is the neutralized version of the input sentence.

Input:


In [23]:
# Round-trip the rendered prompt through the tokenizer and decode it with special
# tokens stripped, to eyeball the plain text the model will see.
# NOTE(review): unlike tokenize_function, this call does not pass
# add_special_tokens=False; presumably harmless here because special tokens are
# skipped on decode - confirm. `prompt` is rebound again by this cell.
prompt=tokenizer.decode(tokenizer(prompt)['input_ids'],skip_special_tokens=True)
print(prompt)

system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are an expert in removing subjective biases in texts.user

Instruction:
Given an input sentence, reason and replace the word with subjective bias to a word with neutral point of view. Consider the following types of biases:
1. framing biases use subjective words linked with a particular point of view (e.g. using words like best or deepest or using pilfered from instead of based on);
2. epistemological biases are linguistic features that subtly (often via presupposition) modify the believability of a proposition;
3. demographic biases are texts with presuppositions about particular genders, races, or other demographic categories (e.g. presupposing that all programmers are male).

Your response should end with "The neutralized text is [answer]" where [answer] is the neutralized version of the input sentence.

Input:
gender-neutral pronouns used in ancient englishassistant

The word ancient should be replaced by mi

In [29]:
import json
# Reload the neutralization prompt and the llama.json templates
# (repeats the earlier prompt-loading cell with `task` set explicitly).
task = 'neutralization'
task_prompt_path = '../prompts/' + task + '.json'
with open(task_prompt_path) as fp:
    task_prompt = json.load(fp)

prompt_template_path = '../prompts/llama.json'
with open(prompt_template_path) as fp:
    prompt_template = json.load(fp)

In [28]:
# Reload the neutralization 'val' split (repeats the earlier data-loading cell).
task = 'neutralization'
input_column = 'input'
split = 'val'
data_path = '../datasets/{}'.format(task)
data = load_from_disk('{}/{}/'.format(data_path, split))
data  # last expression: display the dataset summary

Dataset({
    features: ['id', 'src_tok', 'tgt_tok', 'input', 'edits', 'src_POS_tags', 'tgt_parse_tags', 'wiki_id'],
    num_rows: 700
})

In [19]:
def formatting_prompts_func_neutralization(example, task_prompt=None):
    """Format one neutralization example as a single chat-transcript string.

    The result is the concatenation of a system, a user and an assistant
    segment, each wrapped as
    '<|start_header_id|>{role}<|end_header_id|>\\n\\n{content}<|eot_id|>'.

    Args:
        example: Mapping with an 'input' key (text placed in the user turn)
            and an 'edits' key (placed in the assistant turn as the
            response). An optional 'rationale' key is inserted into the
            assistant turn; an empty string is used when it is absent.
        task_prompt: Optional pre-loaded prompt dict with 'system_msg',
            'user_msg', 'assistant_msg' and 'task_prompt' keys. When None
            (the default), it is read from '../prompts/neutralization.json'
            on every call. Pass it explicitly to avoid re-reading the file
            for each example.

    Returns:
        The formatted transcript as a str.

    Raises:
        FileNotFoundError: If `task_prompt` is None and the JSON file is missing.
        KeyError: If a required key is missing from `example` or `task_prompt`.
    """
    if task_prompt is None:
        with open('../prompts/neutralization.json') as fp:
            task_prompt = json.load(fp)

    # Resolve the contents before interpolating: reusing the enclosing quote
    # character inside an f-string replacement field is a SyntaxError on
    # Python versions earlier than 3.12.
    system_content = task_prompt['system_msg']
    user_content = task_prompt['user_msg'].format(
        instruction=task_prompt['task_prompt'], question=example['input']
    )
    rationale = example['rationale'] if 'rationale' in example else ''
    assistant_content = task_prompt['assistant_msg'].format(
        rationale=rationale, response=example['edits']
    )

    system_msg = f'<|start_header_id|>system<|end_header_id|>\n\n{system_content}<|eot_id|>'
    user_msg = f'<|start_header_id|>user<|end_header_id|>\n\n{user_content}<|eot_id|>'
    assistant_msg = f'<|start_header_id|>assistant<|end_header_id|>\n\n{assistant_content}<|eot_id|>'

    text = system_msg + user_msg + assistant_msg
    return text


In [25]:
# Scratch check of the optional-rationale fallback on a toy record.
example = dict(input='a', edits='b', rationale='cas')
rationale = '' if 'rationale' not in example else example['rationale']
rationale

'cas'