In [1]:
import json
import os
import re
import sys
from typing import Dict, List

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration

import trlx
from trlx.data.configs import (
    ModelConfig,
    OptimizerConfig,
    SchedulerConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)
from trlx.models.modeling_ppo import PPOConfig



### Helper: read the POSITIVE score out of a sentiment pipeline's output

def get_positive_score(scores):
    """Return the value stored under the ``"POSITIVE"`` label.

    ``scores`` is an iterable of mappings. The values of each mapping are
    read in order as a ``(label, value)`` pair, so every mapping must hold
    exactly two values. Raises ``KeyError`` when no entry is labelled
    ``"POSITIVE"``.
    """
    value_by_label = {}
    for entry in scores:
        label, value = tuple(entry.values())
        value_by_label[label] = value
    return value_by_label["POSITIVE"]


# Baseline trlX configuration for PPO fine-tuning of a seq2seq (flan-t5-large) model.
# main() overlays caller-supplied hyperparameters on top of this object.
default_config = TRLConfig(
    train=TrainConfig(
        seq_length=1024,
        epochs=4,
        total_steps=6000,
        batch_size=12,
        checkpoint_interval=95,
        eval_interval=20,
        pipeline="PromptPipeline",
        trainer="AcceleratePPOTrainer",
        save_best=False,
        tracker="wandb",
        # Machine-specific absolute path; checkpoints are written under this directory.
        checkpoint_dir='/root/autodl-tmp/msc_ml/t5_large_checkpoints'
    ),
    model=ModelConfig(
        # Local copy of the model weights (same directory as the tokenizer below).
        model_path="/root/autodl-tmp/flan-t5-large",
        # NOTE(review): -1 presumably means "train every layer" in trlx — confirm.
        num_layers_unfrozen=-1,
        model_arch_type="seq2seq",
    ),
    tokenizer=TokenizerConfig(
        tokenizer_path="/root/autodl-tmp/flan-t5-large",
        padding_side="right",
        truncation_side="right",
    ),
    optimizer=OptimizerConfig(
        name="adamw",
        kwargs={
            "lr": 1.0e-4,
            "betas": [0.9, 0.999],
            "eps": 1.0e-8,
            "weight_decay": 1.0e-6,
        },
    ),
    scheduler=SchedulerConfig(
        name="cosine_annealing",
        kwargs={
            # NOTE(review): T_max (100000) is far larger than total_steps (6000) above,
            # so the schedule presumably never gets close to eta_min — confirm this is intended.
            "T_max": 100000,
            "eta_min": 5.0e-5,
        },
    ),
    method=PPOConfig(
        name="PPOConfig",
        # Rollout count kept small because the dataset is small.
        num_rollouts=64,
        chunk_size=12,
        ppo_epochs=4,
        init_kl_coef=0.05,
        target=6,
        horizon=1000,
        gamma=0.99,
        lam=0.95,
        cliprange=0.2,
        cliprange_value=0.2,
        vf_coef=1,
        scale_reward=None,
        ref_mean=None,
        ref_std=None,
        cliprange_reward=10,
        # Generation settings used when sampling rollouts.
        # NOTE(review): do_sample is False, so top_k / top_p / temperature presumably
        # have no effect on generation — confirm.
        # NOTE(review): eos_token_id is -1 here, whereas the standalone pipeline later in
        # this notebook passes tokenizer_se.eos_token_id; presumably -1 disables early
        # stopping on EOS — confirm.
        gen_kwargs={
            "max_new_tokens": 256,
            "do_sample": False,
            "top_k": 50,
            "top_p": 0.95,
            "eos_token_id": -1,
            "temperature": 1.0,
        },
    ),
)


def main(hparams=None):
    """PPO-fine-tune the local flan-t5-large model with a self-evaluation reward.

    The reward comes from a second, frozen copy of flan-t5-large that is asked
    whether each generated answer is correct; the evaluation metric is exact
    (case-insensitive) match against the dataset's reference answers.

    Args:
        hparams: Optional dict of overrides merged into ``default_config`` via
            ``TRLConfig.update``. ``None`` (the default) means no overrides.
    """
    # A `None` default avoids sharing one mutable dict between calls.
    config = TRLConfig.update(default_config, hparams or {})

    def reward_se(prompts: List[str], outputs: List[str], **kwargs) -> List[float]:
        """Score each (prompt, output) pair 1.0 if the judge model replies exactly "yes", else 0.0."""
        rewards = []
        for q, a in zip(prompts, outputs):
            feedback_prompt = f'Is the answer to the question correct? The question is: {q}. The answer is: {a}'
            # `se_generator` is the judge pipeline created further down in main();
            # it is resolved when this function is called, not when it is defined.
            feedback = se_generator(feedback_prompt)[0]['generated_text']
            feedback = feedback.lower().strip()
            print(feedback)
            reward = 1.0 if feedback == 'yes' else 0.0

            rewards.append(reward)
        return rewards

    def metric_se(samples: List[str], prompts: List[str], outputs: List[str]) -> Dict[str, List[float]]:
        """Return per-sample exact-match scores (1.0 / 0.0) against the reference answers.

        Each prompt is located in ``prompt_all_new`` and compared with the entry at
        the same position in ``answer_all``. Raises ``ValueError`` if a prompt is
        not present in ``prompt_all_new``.
        """
        match = []
        for i, prompt in enumerate(prompts):
            # NOTE(review): `prompt_all_new` has newlines replaced by spaces; this lookup
            # assumes trlx hands back prompts in that same form — confirm.
            index = prompt_all_new.index(prompt)
            if outputs[i].lower().strip() == answer_all[index].lower().strip():
                is_correct = 1.0
            else:
                is_correct = 0.0

            match.append(is_correct)

        return {"Answer Matching": match}

    # Judge model: a separate copy of the base flan-t5-large weights.
    model_se = T5ForConditionalGeneration.from_pretrained("/root/autodl-tmp/flan-t5-large")
    tokenizer_se = T5Tokenizer.from_pretrained("/root/autodl-tmp/flan-t5-large")

    # Device 0 is used unless LOCAL_RANK is set to a non-zero value, in which case -1 is passed.
    se_generator = pipeline("text2text-generation", model=model_se, tokenizer=tokenizer_se,
        top_k=50,
        truncation=True,
        batch_size=256,
        device=0 if int(os.environ.get("LOCAL_RANK", 0)) == 0 else -1,)

    # Dataset: BBH "navigate" examples, split 80/20 into train and eval prompts.
    ds = load_dataset("json", data_files="/root/autodl-tmp/BIG-Bench-Hard/bbh/navigate.json",field="examples")['train']
    ds_split = ds.train_test_split(test_size=0.2)
    prompt_all = ds['input']
    prompt_all_new = [prompt.replace('\n', ' ') for prompt in prompt_all]
    answer_all = ds['target']
    # The original code reset `prompt_train` to [] right after this line, which
    # handed trlx an empty training set; the split's prompts are now kept.
    prompt_train = ds_split['train']['input']
    prompt_test = ds_split['test']['input']

    trlx.train(
        prompts=prompt_train,
        eval_prompts=prompt_test,
        reward_fn=reward_se,
        metric_fn=metric_se,
        config=config,
    )

    

    
    
if __name__ == "__main__":
    # With no extra command-line argument the defaults are used unchanged;
    # otherwise the first argument is parsed as a JSON object of overrides.
    # NOTE(review): inside a Jupyter kernel `__name__` is "__main__" and sys.argv
    # usually carries kernel launch flags, so this would try to parse them as
    # JSON — confirm this cell is only meant to be run as a script.
    if len(sys.argv) == 1:
        hparams = {}
    else:
        hparams = json.loads(sys.argv[1])
    main(hparams)

In [2]:
def reward_se(prompts: List[str], outputs: List[str]) -> List[float]:
    """Ask the notebook-level ``se_generator`` pipeline to judge each answer.

    For every (prompt, output) pair a yes/no question is built and sent to
    ``se_generator``. The lower-cased, stripped reply is printed, and the pair
    earns 1.0 when the reply contains the substring ``yes``, otherwise 0.0.
    Returns the rewards in input order.
    """
    def _judge(question: str, candidate: str) -> float:
        feedback_prompt = f'Is the answer to the question correct? The question is: {question}. The answer is: {candidate}'
        verdict = se_generator(feedback_prompt)[0]['generated_text'].lower().strip()
        print(verdict)
        if 'yes' in verdict:
            return 1.0
        return 0.0

    return [_judge(question, candidate) for question, candidate in zip(prompts, outputs)]

In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
    
# Model weights loaded from the local flan-t5-large directory
# (presumably the un-fine-tuned baseline, given the `_0` suffix — confirm).
model_se_0 = T5ForConditionalGeneration.from_pretrained("/root/autodl-tmp/flan-t5-large")

# Tokenizer loaded from the same directory as the model.
tokenizer_se = T5Tokenizer.from_pretrained("/root/autodl-tmp/flan-t5-large")

# Text2text pipeline bound to the notebook-level name `se_generator`, which
# reward_se() and the evaluation loop below both call.
# NOTE(review): do_sample is False, so top_k / top_p / temperature presumably
# have no effect — confirm.
# `device` is 0 unless LOCAL_RANK is set to a non-zero value, in which case -1 is passed.
# `os` is not imported in this cell; it comes from the first cell's imports.
se_generator= pipeline("text2text-generation", model=model_se_0, tokenizer=tokenizer_se,
                        do_sample= False,
                        top_k= 50,
                        top_p= 0.95,
                        max_length=256,
                        eos_token_id= tokenizer_se.eos_token_id,
                        temperature= 1.0,

                          device=0 if int(os.environ.get("LOCAL_RANK", 0)) == 0 else -1,)



"""    
    # Load the model
model_se_1 = T5ForConditionalGeneration.from_pretrained("/root/autodl-tmp/msc_ml/t5_large_checkpoints/checkpoint_94")

    # Create the pipeline
se_generator_1 = pipeline("text2text-generation", model=model_se_1, tokenizer=tokenizer_se,

        device=0 if int(os.environ.get("LOCAL_RANK", 0)) == 0 else -1,)
"""

'    \n    # Load the model\nmodel_se_1 = T5ForConditionalGeneration.from_pretrained("/root/autodl-tmp/msc_ml/t5_large_checkpoints/checkpoint_94")\n\n    # Create the pipeline\nse_generator_1 = pipeline("text2text-generation", model=model_se_1, tokenizer=tokenizer_se,\n\n        device=0 if int(os.environ.get("LOCAL_RANK", 0)) == 0 else -1,)\n'

In [4]:
# Load the BBH "logical_deduction_three_objects" examples and derive the
# notebook-level prompt / answer lists used by the evaluation cells.

# Fixed seed so the 80/20 split (and every accuracy figure computed from it)
# is the same on each Restart & Run All.
SPLIT_SEED = 42

ds = load_dataset("json", data_files="/root/autodl-tmp/BIG-Bench-Hard/bbh/logical_deduction_three_objects.json",field="examples")['train']
ds_split = ds.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Full dataset: raw prompts, newline-free prompts (lookup keys for accuracy()),
# and reference answers, all in the same order.
prompt_all = ds['input']
prompt_all_new = [prompt.replace('\n', ' ') for prompt in prompt_all]
answer_all = ds['target']

# Train / test views of the same data.
prompt_train = ds_split['train']['input']
prompt_test = ds_split['test']['input']
answer_test = ds_split['test']['target']
answer_train = ds_split['train']['target']

# Chain-of-thought variants: the newline-free prompt wrapped in brackets plus a
# "think step by step" suffix.
# NOTE(review): the suffix is spelled "Let’ s" (curly apostrophe followed by a
# space); it is kept byte-for-byte because changing it changes the model input.
prompt_test_new = ['[{}] Let’ s think step by step.'.format(prompt.replace('\n', ' ')) for prompt in prompt_test]
prompt_train_new = ['[{}] Let’ s think step by step.'.format(prompt.replace('\n', ' ')) for prompt in prompt_train]
    

    
    

def accuracy(if_multi: bool, prompts: List[str], outputs: List[str]) -> float:
    """Return the fraction of ``outputs`` that match the reference answers.

    Each prompt is looked up in the notebook-level ``prompt_all_new`` list and
    the entry at the same position in ``answer_all`` is used as the reference.
    Comparison is case-insensitive and ignores surrounding whitespace.

    Args:
        if_multi: When true, the prediction is the last parenthesised letter
            (e.g. ``(C)``) found after the text "the answer:" on the same line
            of the output; an output without such a marker counts as wrong.
            When false, the whole output is compared with the reference.
        prompts: Prompts exactly as they appear in ``prompt_all_new``.
        outputs: Generated texts, aligned with ``prompts``.

    Returns:
        Mean of the per-example 1.0 / 0.0 scores, or 0.0 when ``prompts`` is empty.

    Raises:
        ValueError: If a prompt is not present in ``prompt_all_new``.
    """
    if not prompts:
        # Nothing to score; avoids dividing by zero below.
        return 0.0

    scores = []
    for i, prompt in enumerate(prompts):
        expected = answer_all[prompt_all_new.index(prompt)].lower().strip()

        if if_multi:
            # Greedy `.*` makes the capture group land on the LAST "(letter)".
            found = re.search(r'the answer:.*(\([A-Za-z]\))', outputs[i], re.IGNORECASE)
            # The original tested the results list here instead of the regex
            # match, so the extracted letter was never used correctly.
            predicted = found.group(1) if found else ""
        else:
            predicted = outputs[i]

        scores.append(1.0 if predicted.lower().strip() == expected else 0.0)

    return sum(scores) / len(scores)

Found cached dataset json (/root/.cache/huggingface/datasets/json/default-fba734bb4c8b83f9/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Generate an answer for every chain-of-thought test prompt with `se_generator`,
# printing the prompt, the generation, the reference answer and the
# self-evaluation reward for each one. Generations are collected in `outputs_0`.
outputs_0 = []

for i, question in enumerate(prompt_test_new):
    answer = answer_test[i]
    print('££££££££££££££££££££££££££', question)
    generation = se_generator(question)[0]['generated_text']
    outputs_0.append(generation)
    print('$$$$$$$$$$$$$', generation)
    print('============', answer)
    print(reward_se(prompts=[question], outputs=[generation]))

££££££££££££££££££££££££££ [The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. A fruit stand sells three fruits: kiwis, loquats, and cantaloupes. The kiwis are less expensive than the loquats. The cantaloupes are less expensive than the kiwis. Options: (A) The kiwis are the cheapest (B) The loquats are the cheapest (C) The cantaloupes are the cheapest] Let’ s think step by step.
$$$$$$$$$$$$$ The kiwis are cheaper than the loquats. The cantaloupes are cheaper than the kiwis. The cheapest answer: (C).
the kiwis are cheaper than the loquats. the cantaloupes are cheaper than the kiwis. the cheapest
[0.0]
££££££££££££££££££££££££££ [The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a branch, there are three birds: an owl, a crow, and a cardinal. The crow is to the left of the owl. The ow

In [28]:
import torch

# Placeholder left by the author: this cell itself performs no CUDA work.

# Ask PyTorch to release its cached CUDA memory.
# NOTE(review): presumably only memory no longer referenced is returned; models
# still bound to notebook variables stay allocated — confirm.
torch.cuda.empty_cache()

In [None]:
def generate_answers(generator, prompts):
    """Run ``generator`` on every prompt and return the generated texts in order."""
    return [generator(prompt)[0]['generated_text'] for prompt in prompts]


# Baseline model: `se_generator` is the pipeline built around `model_se_0`
# earlier in the notebook (no `se_generator_0` is defined anywhere).
outputs_0 = generate_answers(se_generator, prompt_train)

# Fine-tuned model: built here from the checkpoint named in the disabled
# snippet of the pipeline cell, so this cell no longer depends on a name that
# is never defined.
# NOTE(review): confirm the checkpoint directory is loadable with
# from_pretrained, and whether this pipeline should use the same generation
# settings as `se_generator` for a fair comparison.
model_se_1 = T5ForConditionalGeneration.from_pretrained("/root/autodl-tmp/msc_ml/t5_large_checkpoints/checkpoint_94")
se_generator_1 = pipeline("text2text-generation", model=model_se_1, tokenizer=tokenizer_se,
                          device=0 if int(os.environ.get("LOCAL_RANK", 0)) == 0 else -1)
outputs_1 = generate_answers(se_generator_1, prompt_train)

# accuracy() looks prompts up in `prompt_all_new`, whose entries have newlines
# replaced by spaces, so the lookup keys are normalised the same way. The
# bracketed chain-of-thought prompts would not be found there.
prompt_train_lookup = [prompt.replace('\n', ' ') for prompt in prompt_train]

# The generations come from the raw prompts, so the whole output is compared
# with the reference answer (if_multi=False).
# NOTE(review): switch to True if the outputs are expected to contain "the answer: (X)".
accuracy_0 = accuracy(False, prompt_train_lookup, outputs_0)
print(accuracy_0)
accuracy_1 = accuracy(False, prompt_train_lookup, outputs_1)
print(accuracy_1)

In [17]:
# Quick check of the answer-extraction regex: case-insensitive "the answer",
# then anything, then a single letter in parentheses (captured with the parentheses).
s = 'THE Answer:: is here: (C)'
answer_pattern = re.compile(r'the answer.*(\([A-Za-z]\))', re.IGNORECASE)
match = answer_pattern.search(s)
if match is not None:
    # Group 1 is the parenthesised letter, e.g. "(C)".
    answer = match.group(1)
    print(answer)

(C)
