In [6]:
import json
import os
import sys
from typing import Dict, List
import torch

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration,AutoModelForCausalLM



import trlx
from trlx.data.configs import (
    ModelConfig,
    OptimizerConfig,
    SchedulerConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)
from trlx.models.modeling_ppo import PPOConfig

from trlx.models.modeling_ppo import (
    AutoModelForCausalLMWithHydraValueHead,
    AutoModelForCausalLMWithValueHead,
    AutoModelForSeq2SeqLMWithHydraValueHead,
)




default_config = TRLConfig(
    train=TrainConfig(
        seq_length=1024,
        epochs=4,
        total_steps=6000,
        batch_size=4,
        checkpoint_interval=256,
        eval_interval=40,
        pipeline="PromptPipeline",
        trainer="AcceleratePPOTrainer",
        save_best=False,
        tracker="wandb",
        checkpoint_dir='/root/autodl-tmp/msc_ml/t5_large_checkpoints'
    ),
    model=ModelConfig(
        model_path="/root/autodl-tmp/flan-t5-large",
        num_layers_unfrozen=-1,
        model_arch_type="seq2seq",
    ),
    tokenizer=TokenizerConfig(
        tokenizer_path="/root/autodl-tmp/flan-t5-large",
        padding_side="right",
        truncation_side="right",
    ),
    optimizer=OptimizerConfig(
        name="adamw",
        kwargs={
            "lr": 1.0e-4,
            "betas": [0.9, 0.999],
            "eps": 1.0e-8,
            "weight_decay": 1.0e-6,
        },
    ),
    scheduler=SchedulerConfig(
        name="cosine_annealing",
        kwargs={
            "T_max": 100000,
            "eta_min": 5.0e-5,
        },
    ),
    method=PPOConfig(
        name="PPOConfig",
        ### reduce rollouts due to small dataset
        num_rollouts=128,
        chunk_size=12,
        ppo_epochs=4,
        init_kl_coef=0.05,
        target=6,
        horizon=1000,
        gamma=0.99,
        lam=0.95,
        cliprange=0.2,
        cliprange_value=0.2,
        vf_coef=1,
        scale_reward=None,
        ref_mean=None,
        ref_std=None,
        cliprange_reward=10,
        gen_kwargs={
            "max_new_tokens": 128,
            "do_sample": True,
            "top_k": 50,
            "top_p": 0.95,
            "eos_token_id": T5Tokenizer.from_pretrained("/root/autodl-tmp/flan-t5-large").eos_token_id,
            "temperature": 1.0,
        },
    ),
)


def main(hparams={}):
    config = TRLConfig.update(default_config, hparams)


    
    #########################################b
    
    ### reward_se
    def reward_se( prompts: List[str], outputs: List[str], **kwargs) -> List[float]:

        rewards = []
        for q, a in zip(prompts, outputs):
            feedback_prompt = f'Is the answer to the question correct? The question is: {q}. The answer is: {a}'
            feedback = se_generator(feedback_prompt)[0]['generated_text']  # Assuming 'model' is your trained T5 model
            feedback = feedback.lower().strip()
            print(feedback)
            reward = 0.0 
            if 'yes' in feedback:
                reward = 1.0 
                
            elif 'no' in feedback:
                reward = -1.0

            rewards.append(reward)
        return rewards
    
    
    
    
    
    ### metric_se

    
    def metric_se(samples: List[str], prompts: List[str], outputs: List[str]) -> Dict[str, List[float]]:
        match=[]
        
        for i,prompt in enumerate(prompts):

            index = prompt_all_new.index(prompt)
            if outputs[i].lower().strip()==answer_all[index].lower().strip():
                is_correct=1.0
            else:
                is_correct=0.0
                
            match.append(is_correct)

        return {"Answer Matching": match}
    
    ###########################################e
    
    
    
    ############################b
    # Load the model
    model_se = T5ForConditionalGeneration.from_pretrained("/root/autodl-tmp/flan-t5-large")

    # Load the tokenizer
    tokenizer_se = AutoTokenizer.from_pretrained("/root/autodl-tmp/flan-t5-large")

    # Create the pipeline
    se_generator = pipeline("text2text-generation", model=model_se, tokenizer=tokenizer_se,
                        do_sample= False,
                        max_length=64,
                        eos_token_id= tokenizer_se.eos_token_id,
        device=0 if int(os.environ.get("LOCAL_RANK", 0)) == 0 else -1,)
    #############################e


    
    

    
    #########################b

    ds = load_dataset("json", data_files="/root/autodl-tmp/BIG-Bench-Hard/bbh/navigate.json",field="examples")['train']
    ds_split=ds.train_test_split(test_size=0.2)
    
    answer_all=ds['target']
    
    prompt_all=ds['input']
    prompt_train=ds_split['train']['input']
    prompt_test=ds_split['test']['input']

    
    prompt_all_cot= ['[{}] Let’ s think step by step.'.format(prompt.replace('\n', ' ')) for prompt in prompt_all]
    prompt_test_cot= ['[{}] Let’ s think step by step.'.format(prompt.replace('\n', ' ')) for prompt in prompt_test]
    prompt_train_cot= ['[{}] Let’ s think step by step.'.format(prompt.replace('\n', ' ')) for prompt in prompt_train]    

    ##########################e


    trlx.train(
        prompts=prompt_train_cot,
        eval_prompts=prompt_test_cot,
        reward_fn=reward_se,
        #metric_fn=metric_se,
        config=config,
    )

    

    
    
if __name__ == "__main__":
    hparams = {} if len(sys.argv) == 1 else json.loads(sys.argv[1])
    
    main(hparams)

In [7]:
    def reward_se( prompts: List[str], outputs: List[str], **kwargs) -> List[float]:

        rewards = []
        for q, a in zip(prompts, outputs):
            feedback_prompt = f'Is the answer to the question correct? The question is: {q}. The answer is: {a}'
            feedback = se_generator(feedback_prompt)[0]['generated_text']  # Assuming 'model' is your trained T5 model
            feedback = feedback.lower().strip()
            print(feedback)
            reward = 0.0 
            if 'yes' in feedback:
                reward = 1.0 
                
            elif 'no' in feedback:
                reward = -1.0

            rewards.append(reward)
        return rewards

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
    
    # Load the model
model_se_0 = T5ForConditionalGeneration.from_pretrained("/root/autodl-tmp/flan-t5-large")

    # Load the tokenizer
tokenizer_se = T5Tokenizer.from_pretrained("/root/autodl-tmp/flan-t5-large")

    # Create the pipeline
se_generator= pipeline("text2text-generation", model=model_se_0, tokenizer=tokenizer_se,
                        do_sample= True,
                        top_k= 50,
                        top_p= 0.95,
                        max_length=100,
                        eos_token_id= -1,
                        temperature= 1.0,
                          device=0 if int(os.environ.get("LOCAL_RANK", 0)) == 0 else -1,)

In [9]:
    # Load the model
model_se_1 = T5ForConditionalGeneration.from_pretrained("/root/autodl-tmp/msc_ml/t5_large_checkpoints")



    # Create the pipeline
se_generator_1 = pipeline("text2text-generation", model=model_se_1, tokenizer=tokenizer_se,
                        do_sample= True,
                        top_k= 50,
                        top_p= 0.95,
                        max_length=100,
                        eos_token_id= -1,
                        temperature= 1.0,
                        device=0 if int(os.environ.get("LOCAL_RANK", 0)) == 0 else -1,)


Some weights of the model checkpoint at /root/autodl-tmp/msc_ml/t5_large_checkpoints were not used when initializing T5ForConditionalGeneration: ['v_head.0.weight', 'v_head.2.bias', 'v_head.0.bias', 'v_head.2.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
ds = load_dataset("json", data_files="/root/autodl-tmp/BIG-Bench-Hard/bbh/navigate.json",field="examples")['train']
ds_split=ds.train_test_split(test_size=0.2)
prompt_all=ds['input']
prompt_all_new= [prompt.replace('\n', ' ') for prompt in prompt_all]
prompt_all_new=['[{}] Let’ s think step by step.'.format(prompt.replace('\n', ' ')) for prompt in prompt_all_new]
answer_all=ds['target']
prompt_train=ds_split['train']['input']
prompt_test=ds_split['test']['input']
answer_test=ds_split['test']['target']
answer_train=ds_split['train']['target']



prompt_test_new= ['[{}] Let’ s think step by step.'.format(prompt.replace('\n', ' ')) for prompt in prompt_test]
prompt_train_new= ['[{}] Let’ s think step by step.'.format(prompt.replace('\n', ' ')) for prompt in prompt_train]
    

    
    

def accuracy(prompts: List[str], outputs: List[str]) -> Dict[str, List[float]]:
    match=[]
        
    for i,prompt in enumerate(prompts):

        index = prompt_all_new.index(prompt)
        if outputs[i].lower().strip()==answer_all[index].lower().strip():
            is_correct=1.0
        else:
            is_correct=0.0
                
        match.append(is_correct)

    return sum(match)/len(match)

Found cached dataset json (/root/.cache/huggingface/datasets/json/default-e5badb54ef3cd267/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
outputs_0=[]
for i in range(len(prompt_train)):
    
    question=prompt_train[i]
    answer=answer_train[i]
    #print('££££££££££££££££££££££££££',question)
    generation=se_generator(question)[0]['generated_text']
    outputs_0.append(generation)
    #print('$$$$$$$$$$$$$',generation)
    #print('============',answer)
    #print(reward_fn(prompts=[question], outputs=[generation]))
    
outputs_1=[]
for i in range(len(prompt_train)):
    
    question=prompt_train[i]
    answer=answer_train[i]
    #print('££££££££££££££££££££££££££',question)
    generation=se_generator_1(question)[0]['generated_text']
    outputs_1.append(generation)
    #print('$$$$$$$$$$$$$',generation)
    #print('============',answer)
    #print(reward_fn(prompts=[question], outputs=[generation]))
    




In [12]:
def accuracy(prompts: List[str], outputs: List[str]) -> Dict[str, List[float]]:
    match=[]
        
    for i,prompt in enumerate(prompts):
        generation=''
        if 'yes' in outputs[i].lower().strip():
            generation='yes'
        elif 'no' in outputs[i].lower().strip():
            generation='no'
            

        index = prompt_all_new.index(prompt)
        if generation==answer_all[index].lower().strip():
            is_correct=1.0
        else:
            is_correct=0.0
                
        match.append(is_correct)

    return sum(match)/len(match)

In [13]:
accuracy_0=accuracy(prompt_train_new,outputs_0)
print(accuracy_0)
accuracy_1=accuracy(prompt_train_new,outputs_1)
print(accuracy_1)

0.455
0.5
