# Gen Dataset from huggingface

1. Get the dataset from huggingface, convert it to jsonl files
2. Generate corresponding different responses using API, merge them into the probing dataset.

Generate the following files:

- *train_chosen.jsonl*: ['prompt': prompt, 'chosen': chosen example]
- *train_reject.jsonl*: ['prompt': prompt, 'chosen': reject example]
- *train_dpo.jsonl*: ['prompt': prompt, 'chosen': chosen example, 'reject': rejected example]
- *prob_train.jsonl*: ['prompt': prompt, 'y1': y1, 'y2': y2, ...], all from train set
- *prob_test.jsonl*: ['prompt': prompt, 'y1': y1, 'y2': y2, ...], all from test set

1. Chosen (best in 4, sfted)
     - 1.1 Initial response
     - 1.2 Self-rephrase
     - 1.3 GPT-rephrase, semantics keeping
     - 1.4 GPT-rephrase, format keeping
2. Rejected (worst in 4)
     - 2.1 GPT-rephrase, semantics keeping
     - 2.2 GPT-rephrase, format keeping
3. Irrelavent in train (j neq i)
4. Irrelavent in test (j neq i)
5. Irrelavent random HumLang Passage
6. Randomly permute chosen tokens
7. Pure random tokens

In [1]:
import json
import datasets
from typing import Dict, List, Optional, Iterator, Callable, Union, Tuple
from tqdm import trange
import numpy as np
import random
import os
from openai import OpenAI
import sys 
sys.path.append("../..")
from preference_datasets import extract_anthropic_prompt

In [2]:
def read_jsonl(path):
    response_list = []
    with open(path, 'r') as f:
        for line in f:
            response_list.append(json.loads(line))
    return response_list

In [3]:
# ------------- Generate train_chosen and train_reject
def get_original_helpful(split: str = None):
    """Get the original HH dataset.
    Args:
        split: the split of the dataset to be loaded.
    Returns:
        dataset: the original HH dataset.
    """
    print(f'Loading HH dataset (helpful-base split) from Huggingface...')
    if split is None:
        dataset = datasets.load_dataset(
            'Anthropic/hh-rlhf', data_dir='helpful-base', cache_dir='./data'
        )
    else:
        dataset = datasets.load_dataset(
            'Anthropic/hh-rlhf', data_dir='helpful-base', cache_dir='./data',
            split=split
        )
    print('done')
    return dataset

def gen_perm_response(response):
    response_words = response.strip().split(' ')
    len_response = len(response_words)
    random.shuffle(response_words)
    permuted_response = " ".join(str(element) for element in response_words)
    return permuted_response

def get_model_response_init(prompt, idx, model_response):
    assert model_response[idx]['prompt'] == prompt
    response = model_response[idx]['response'][len(prompt):]
    search_term = '\n\nHuman:'
    search_term_idx  = response.find(search_term)
    return response[:search_term_idx]

def get_model_response_selfr(chosen_response, idx, model_response):
    assert model_response[idx]['prompt'].split('\n@@@ Original Sentence: ')[1].split('\n@@@ Rephrase: ')[0]==chosen_response.strip()
    return model_response[idx]['response'].split('\n@@@ Rephrase: ')[-1].strip()

def get_gpt_response(tmp, idx, chosen_or_rej='chosen', res_type='chosen_gptsemantic', gpt_response=None):
    if res_type=='irr_hum':
        return gpt_response[idx][f'{res_type}'].strip()
    assert gpt_response[idx][f'{chosen_or_rej}'].strip()==tmp.strip()
    return gpt_response[idx][f'{res_type}'].strip()

def convert_helpful_base(dataset: List[Dict[str, str]], dict_type='chosen', n_samples=-1, model='pythia2.8b',train_or_test='train') -> List[Dict[str, str]]:
    """Convert the HH dataset to the format described above.
    Args:
        dataset: the original HH helpful-base subset.
        dict_type: which dictionary we would formulate
            - train_chosen
            - train_reject
            - train_dpo
            - prob
    Returns:
        converted_dataset: the converted dataset.
    """
    if dict_type=='prob':
        model_response_initial = read_jsonl(os.path.join('pythia_generated_responses', model, f'prob_{train_or_test}_gen_response.jsonl'))
        model_response_selfr = read_jsonl(os.path.join('pythia_generated_responses', model, f'prob_{train_or_test}_selfr_response.jsonl'))
        gpt_response = read_jsonl(os.path.join('gpt_generated_responses',f'gpt_3.5_{train_or_test}.jsonl'))
    converted_dataset = []
    if n_samples < 0:
        N_SAMPLE = len(dataset)
    else:
        N_SAMPLE = n_samples
    for idx in trange(0, N_SAMPLE):
        ex = dataset[idx]
        prompt = extract_anthropic_prompt(ex['chosen'])
        if dict_type == 'chosen':
            chosen_response = ex['chosen'][len(prompt):]
            converted_dataset.append({'prompt': prompt, 'chosen': chosen_response,})
        elif dict_type == 'reject':
            chosen_response = ex['rejected'][len(prompt):]
            converted_dataset.append({'prompt': prompt, 'chosen': chosen_response,})
        elif dict_type == 'dpo':
            chosen_response = ex['chosen'][len(prompt):]
            rejected_response = ex['rejected'][len(prompt):]
            converted_dataset.append({'prompt': prompt, 'chosen': chosen_response,'rejected': rejected_response,})
        elif dict_type == 'dpo_reverse':
            # ----------- change the role of rej and chosen
            chosen_response = ex['chosen'][len(prompt):]
            rejected_response = ex['rejected'][len(prompt):]
            converted_dataset.append({'prompt': prompt, 'chosen': rejected_response,'rejected': chosen_response,})
        elif dict_type == 'prob':
            # ------ Chosen group: 'chosen', 'chosen_initial', 'chosen_selfr', 'chosen_gptsemantic', 'chosen_gptformat'
            chosen_response = ex['chosen'][len(prompt):]
            chosen_initial_response = get_model_response_init(prompt, idx, model_response_initial)
            chosen_selfr_response = get_model_response_selfr(chosen_response, idx, model_response_selfr)
            chosen_gptsemantic_res = get_gpt_response(chosen_response, idx, 'chosen', 'chosen_gptsemantic', gpt_response)
            chosen_gptformat_res = get_gpt_response(chosen_response, idx, 'chosen', 'chosen_gptformat', gpt_response)
            # ------ Reject group: 'reject', 'reject_gptsemantic', 'reject_gptformat'
            rejected_response = ex['rejected'][len(prompt):]
            reject_gptsemantic_res = get_gpt_response(rejected_response, idx, 'rejected', 'rejected_gptsemantic', gpt_response)
            reject_gptformat_res = get_gpt_response(rejected_response, idx, 'rejected', 'rejected_gptformat', gpt_response)

            # ------ Irrelavent group: 'irr_train', 'irr_test', 'irr_hum'
            while True:
                irr_train_idx = random.randint(0, N_SAMPLE)
                if irr_train_idx != idx:
                    break
            irr_test_idx = random.randint(N_SAMPLE, len(dataset))
            irr_train_prompt = extract_anthropic_prompt(dataset[irr_train_idx]['chosen'])
            irr_test_prompt = extract_anthropic_prompt(dataset[irr_test_idx]['chosen'])
            irr_train_response = dataset[irr_train_idx]['chosen'][len(irr_train_prompt):]
            irr_test_response = dataset[irr_test_idx]['chosen'][len(irr_test_prompt):]
            irr_hum_res = get_gpt_response(rejected_response, idx, 'rejected', 'irr_hum', gpt_response)
            
            # ------ Random Group: 'random_permute', 'random_nonhum'
            rnd_perm_response = gen_perm_response(chosen_response)
            rnd_nonhum_response = gen_perm_response(irr_test_response)
            
            converted_dataset.append({
                'prompt': prompt,
                'chosen': chosen_response,
                'chosen_initial': chosen_initial_response,
                'chosen_selfr': chosen_selfr_response,
                'chosen_gptsemantic':chosen_gptsemantic_res,
                'chosen_gptformat':chosen_gptformat_res,
                'rejected': rejected_response,
                'reject_gptsemantic': reject_gptsemantic_res,
                'reject_gptformat': reject_gptformat_res,
                'irr_train': irr_train_response,
                'irr_test': irr_test_response,
                'irr_hum': irr_hum_res,
                'random_permute' : rnd_perm_response,
                'random_nonhum' : rnd_nonhum_response,
            })
    return converted_dataset

## Generate train and test dpo dataset

In [13]:
from tqdm import tqdm
cl, rl = [], []
for i in tqdm(range(len(train_original_data['chosen']))):
    cl.append(len(train_original_data['chosen'][i]))
    rl.append(len(train_original_data['rejected'][i]))

 16%|███████████▉                                                               | 6963/43835 [18:42<1:39:05,  6.20it/s]


KeyboardInterrupt: 

In [14]:
print(np.mean(cl), np.std(cl))
print(np.mean(rl), np.std(rl))

789.7475226195605 575.0602016758355
732.42926899325 571.3383131150688


In [None]:
len(cl)

In [4]:
train_original_data = get_original_helpful('train')
test_original_data = get_original_helpful('test')

Loading HH dataset (helpful-base split) from Huggingface...


README.md:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


done
Loading HH dataset (helpful-base split) from Huggingface...
done


In [101]:
DATA_NAME = "train_dpo"
converted = convert_helpful_base(train_original_data, 'dpo',n_samples=5000)
with open('%s.jsonl'%DATA_NAME, 'w', newline='\n') as f:
    for i in range(len(converted)):  
        f.write(json.dumps(converted[i]))
        f.write('\n')

100%|██████████| 5000/5000 [00:00<00:00, 15507.87it/s]


In [13]:
# ----------- Create another train_sft_extend using existing train_dpo.json
DATA_NAME = "train_sft_extend"

with open('%s.jsonl'%DATA_NAME, 'w', newline='\n') as fw:
    with open('E://P5_5_SFT_dynamics//finetuning_dynamics//data//helpful-base//train_dpo.jsonl', 'r') as f:
        for line in f:
            prompt = json.loads(line)['prompt']
            chosen = json.loads(line)['chosen']
            rejected = json.loads(line)['rejected']
            fw.write(json.dumps({'prompt': prompt, 'chosen': chosen}))
            fw.write('\n')
            fw.write(json.dumps({'prompt': prompt, 'chosen': rejected}))
            fw.write('\n')
            

# with open('%s.jsonl'%DATA_NAME, 'w', newline='\n') as f:
#     for i in range(len(converted)):  
#         f.write(json.dumps(converted[i]))
#         f.write('\n')

In [7]:
# DATA_NAME = "train_dpo_reverse"
# converted = convert_helpful_base(train_original_data, 'dpo_reverse',n_samples=5000)
# with open('%s.jsonl'%DATA_NAME, 'w', newline='\n') as f:
#     for i in range(len(converted)):  
#         f.write(json.dumps(converted[i]))
#         f.write('\n')

100%|██████████| 5000/5000 [00:00<00:00, 15671.43it/s]


In [29]:
DATA_NAME = "test_dpo"
converted = convert_helpful_base(test_original_data, 'dpo',n_samples=500)
with open('%s.jsonl'%DATA_NAME, 'w', newline='\n') as f:
    for i in range(len(converted)):  
        f.write(json.dumps(converted[i]))
        f.write('\n')

100%|██████████| 500/500 [00:00<00:00, 9406.76it/s]


In [102]:
# ------------- Use this to generate full probing data
DATA_NAME = "prob_train_gen"
converted = convert_helpful_base(train_original_data, 'chosen',n_samples=500)
with open('%s.jsonl'%DATA_NAME, 'w') as f:
    for i in range(len(converted)):  
        f.write(json.dumps(converted[i]))
        f.write('\n')

100%|██████████| 500/500 [00:00<00:00, 16020.66it/s]


In [103]:
DATA_NAME = "prob_test_gen"
converted = convert_helpful_base(test_original_data, 'chosen',n_samples=500)
with open('%s.jsonl'%DATA_NAME, 'w') as f:
    for i in range(len(converted)):  
        f.write(json.dumps(converted[i]))
        f.write('\n')

100%|██████████| 500/500 [00:00<00:00, 14893.70it/s]


### Generating different responses for the prob dataset

1. For *chosen_initial* and *chosen_selfr*, we need to run xxx.py to generate the output, and save the results in xx.json
2. For *gpt* responses and *irr_hum*, we need API's feedback.

In [61]:
DATA_NAME = "formal_prob_train"
MODEL = 'pythia2.8b'
converted = convert_helpful_base(train_original_data, 'prob',n_samples=500,  model=MODEL, train_or_test='train')
with open(f'{DATA_NAME}.jsonl', 'w', newline='\n') as f:
    for i in range(len(converted)):  
        f.write(json.dumps(converted[i]))
        f.write('\n')

DATA_NAME = "formal_prob_test"
MODEL = 'pythia2.8b'
converted = convert_helpful_base(test_original_data, 'prob',n_samples=500,  model=MODEL, train_or_test='test')
with open(f'{DATA_NAME}.jsonl', 'w', newline='\n') as f:
    for i in range(len(converted)):  
        f.write(json.dumps(converted[i]))
        f.write('\n')

100%|██████████| 500/500 [00:00<00:00, 1434.20it/s]
100%|██████████| 500/500 [00:00<00:00, 2617.45it/s]


## Generate rephrase using GPT, save them into jsonl file

{'prompt':[], 'chosen':[], 'chosen_gptsemantic':[], 'chosen_gptformat':[], 'rejected':[], 'reject_gptsemantic':[], 'reject_gptformat':[], 'irr_hum':[]}

In [23]:
def gpt_rephrase_prompt_generator(ref_sentence=None, keep_type='gptsemantic', length=10):
    if keep_type=='gptsemantic':
        rephrase_prompt = f'Given the reference sentence, please generate an output sentence. \
Please use different words as much as possible while keeping the meaning of the reference sentence unchanged. \
Please only return the output sentence.\n\n\
Reference sentence: {ref_sentence}\n\n\
Output: '
    elif keep_type=='gptformat':
        rephrase_prompt = f'Given the reference sentence, please generate an output sentence. \
Please change the meaning of the reference sentence as much as possible while keeping the format of it. \
Please only return the output sentence.\n\n\
Reference sentence: {ref_sentence}\n\n\
Output: ' 
    elif keep_type=='irr_hum':
        rephrase_prompt = 'Please generate a random sentence with %d words.'%length
    else:
        rephrase_prompt = 'There is no response from gpt.'
    return rephrase_prompt

def get_gpt_completion(ref_sentence=None, keep_type='gptsemantic', length=10):
    sys_msg = [{"role": 'system', "content": 'You are a helpful assistant.'}]
    rephrase_prompt = gpt_rephrase_prompt_generator(ref_sentence, keep_type=keep_type, length=length)
    response = GPT_client.chat.completions.create(
                        model = 'gpt-3.5-turbo',
                        messages = sys_msg + [{'role': 'user', 'content': rephrase_prompt}],
                        temperature=0.7,
                        top_p = 1,         
                        logprobs = False,
                        )
    gpt_response = response.choices[0].message.content    
    
    # if keep_type=='gptsemantic':
    #     gpt_response = 'test gptsemantic'
    # elif keep_type=='gptformat':
    #     gpt_response = 'gptformat test'
    # elif keep_type=='irr_hum':
    #     gpt_response = length
    return gpt_response

In [30]:
GPT_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

train_or_test = 'test' #'train'#
response = []
cnt = 0
# if os.path.exists(f'gpt_generated_responses/gpt_3.5_{train_or_test}.jsonl'):
#     os.remove(f'gpt_generated_responses/gpt_3.5_{train_or_test}.jsonl')
with open(f'{train_or_test}_dpo.jsonl', 'r') as f:
    for line in f:
        cnt+=1
        if cnt%10==0:
            print(cnt,end='-')
        if cnt==501:
            break
        prompt = json.loads(line)['prompt'].strip()
        # ------ chosen_gptsemantic and chosen_gptformat
        chosen = json.loads(line)['chosen'].strip()
        chosen_gptsemantic_response = get_gpt_completion(ref_sentence=chosen, keep_type='gptsemantic')
        chosen_gptformat_response = get_gpt_completion(ref_sentence=chosen, keep_type='gptformat')

        # ------ reject_gptsemantic and reject_gptformat
        rejected = json.loads(line)['rejected'].strip()
        rejected_gptsemantic_response = get_gpt_completion(ref_sentence=rejected, keep_type='gptsemantic')
        rejected_gptformat_response = get_gpt_completion(ref_sentence=rejected, keep_type='gptformat')

        irr_hum_response = get_gpt_completion(keep_type='irr_hum', length=len(chosen.split(' ')))
        
        with open(f'gpt_generated_responses/gpt_3.5_{train_or_test}.jsonl', 'a') as f:
            f.write(json.dumps({'prompt':prompt, 
                    'chosen':chosen, 'chosen_gptsemantic':chosen_gptsemantic_response, 'chosen_gptformat':chosen_gptformat_response,
                    'rejected':rejected, 'rejected_gptsemantic':rejected_gptsemantic_response, 'rejected_gptformat':rejected_gptformat_response,
                     'irr_hum':irr_hum_response}))
            f.write('\n')

10-20-30-40-50-60-70-80-90-100-110-120-130-140-150-160-170-180-190-200-210-220-230-240-250-260-270-280-290-300-310-320-330-340-350-360-370-380-390-400-410-420-430-440-450-460-470-480-490-500-