## Imports

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
from tqdm import tqdm
from datasets import load_dataset
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics import SacreBLEUScore
import nltk
nltk.download("punkt")

import pandas as pd

from collections import defaultdict
import gc

## Define metrics

In [None]:
# Text-overlap metrics used later to compare each generated answer with its reference answer.
bleu = SacreBLEUScore(n_gram=1, lowercase=True)  # unigram BLEU on lowercased text
rouge = ROUGEScore()  # library defaults

## Define models

In [None]:
# Use the GPU only when one is really present. `is_available` must be *called*:
# the bare function object is always truthy, which would select CUDA even on
# CPU-only machines and make the later `.to(device)` calls fail.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# A single tokenizer (from the reward-model checkpoint) is used for building prompts,
# decoding generations and encoding reward-model inputs.
tokenizer = AutoTokenizer.from_pretrained("Myashka/125M_GPTneo_reward_gen")
# Reuse EOS as the padding token so that batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

reward_model = AutoModelForSequenceClassification.from_pretrained("Myashka/125M_GPTneo_reward_gen").to(device)
sft_model = AutoModelForCausalLM.from_pretrained("Myashka/125M_GPTneo_sft_tuned").to(device)

# The reward model is scored on padded batches further down. The sequence-classification
# head needs `config.pad_token_id` to find the last real token of every row, so make sure
# it is defined (an id already stored in the checkpoint config is left untouched).
if reward_model.config.pad_token_id is None:
    reward_model.config.pad_token_id = tokenizer.pad_token_id

# Inference only: switch off dropout and similar train-time behaviour.
reward_model = reward_model.eval()
sft_model = sft_model.eval()

reward_model = torch.compile(reward_model)
sft_model = torch.compile(sft_model)

## Config

In [None]:
# Prompt construction / tokenization settings read by `build_dataset`.
data_config = dict(
    data_file_path='/content/1.0-data-div-ans-sep-api-usage.json',
    padding=False,
    max_length_promt=256,
    truncate_promt=True,
)

# Keyword arguments for `generate`.
# NOTE: `generation_kwargs` is assigned again in the "Generate samples to eval" section.
generation_kwargs = dict(
    min_length=-1,
    top_k=1,
    num_return_sequences=10,
    top_p=1.0,
    do_sample=True,
    max_new_tokens=256,
)

## Data

In [None]:
def build_dataset(
    tokenizer,
    data_config,
    splits,
):
    """Load and tokenize the question prompts of one or more splits of the JSON data file.

    Args:
        tokenizer: Tokenizer that is callable and also provides ``encode`` / ``decode``.
        data_config: Dict with the keys ``data_file_path``, ``padding``,
            ``max_length_promt`` and ``truncate_promt``.
        splits: A single split name or an iterable of split names. Each name is
            passed as ``field`` to ``load_dataset``.

    Returns:
        A list with one dataset per requested split, in the order given. Each dataset
        is formatted as torch with the columns ``input_ids``, ``Question_promt`` and
        ``Original_answer``.
    """
    # A bare string would otherwise be iterated character by character
    # ('val' -> 'v', 'a', 'l'), so treat it as a single split name.
    if isinstance(splits, str):
        splits = [splits]

    def promt_tokenize(examples):
        """Build the "Question: ...\\nAnswer:" prompt for one example and tokenize it."""
        if data_config['truncate_promt']:
            # Cut the question on token level, then turn it back into text.
            # The 7-token margin presumably leaves room for the "Question: " /
            # "\nAnswer:" wrapper added below -- TODO confirm.
            q_toks = tokenizer.encode(examples['Question'])
            q_toks = q_toks[:data_config['max_length_promt'] - 7]
            question = tokenizer.decode(q_toks).strip()
        else:
            question = examples['Question']

        sample = 'Question: ' + question + "\nAnswer:"

        # The prompt is wrapped in a list, i.e. tokenized as a batch of one.
        tokenized_dict = tokenizer(
            [sample],
            padding=data_config['padding'],
            max_length=data_config['max_length_promt'],
            truncation=True,
        )

        # Keep the prompt text and the reference answer next to the token ids.
        tokenized_dict['Question_promt'] = sample
        tokenized_dict['Original_answer'] = examples['Answer']

        return tokenized_dict

    datasets = []
    for split in splits:
        dataset = load_dataset(
            "json", data_files=f"{data_config['data_file_path']}", field=f'{split}')['train']
        dataset = dataset.map(promt_tokenize)
        dataset.set_format(type="torch", columns=["input_ids", "Question_promt", 'Original_answer'])
        datasets.append(dataset)
    return datasets

In [None]:
# `build_dataset` iterates over `splits`, so the split name is passed inside a list;
# a bare 'val' string would be walked character by character.
val_dataset = build_dataset(tokenizer, data_config, ['val'])[0]

## Generate samples to eval

In [None]:
# Sampling settings used for the evaluation run; this assignment replaces the
# `generation_kwargs` defined in the Config section.
generation_kwargs = dict(
    min_length=-1,
    top_k=50,
    num_return_sequences=10,
    top_p=0.9,
    do_sample=True,
    max_new_tokens=256,
)

In [None]:
# Sample several candidate answers for every validation prompt and collect them in
# long format: one row per generated answer, keyed by the prompt's position (Q_Id).
val_dict = defaultdict(list)
for i, sample in enumerate(tqdm(val_dataset)):  # wrap the dataset so tqdm knows the total
    prompt_ids = sample["input_ids"].to(device)
    # Every returned sequence starts with the prompt tokens; remember how many there are
    # so that only the continuation is decoded. Using the prompt length directly avoids
    # hard-coding the number of returned sequences a second time.
    prompt_len = prompt_ids.shape[-1]

    with torch.no_grad():
        generated_samples = sft_model.generate(prompt_ids, **generation_kwargs)

    n_generated = len(generated_samples)
    val_dict['Question'].extend([sample['Question_promt']] * n_generated)
    val_dict['Answer_orig'].extend([sample['Original_answer']] * n_generated)
    val_dict['Q_Id'].extend([i] * n_generated)

    val_dict["Answer_gen"].extend(
        tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True)
        for sequence in generated_samples
    )

    # Release per-prompt tensors before moving on to keep GPU memory flat.
    del sample, prompt_ids, generated_samples
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# One row per generated answer; the columns are the keys collected in `val_dict`.
result_df = pd.DataFrame(data=val_dict)

In [None]:
# Persist the raw generations (without the index column) before scoring them.
result_df.to_csv(path_or_buf='validation_results.csv', index=False)

## Compute metrics

In [None]:
# Score every generated answer against its reference answer.
val_rouge1, val_rouge2, val_bleu = [], [], []
n_failed = 0

for generated_answer, original_answer in tqdm(
    zip(result_df["Answer_gen"], result_df["Answer_orig"]), total=len(result_df)
):
    # Compute all three scores first and append afterwards: appending inside the `try`
    # could leave the lists with different lengths when only the BLEU call fails.
    try:
        rouge_score = rouge(generated_answer, original_answer)
        rouge1_value = rouge_score['rouge1_fmeasure'].item()
        rouge2_value = rouge_score['rouge2_fmeasure'].item()
        bleu_value = bleu(generated_answer, original_answer).item()
    except Exception:
        # Best effort: a sample that cannot be scored gets 0 for every metric.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
        rouge1_value = rouge2_value = bleu_value = 0
        n_failed += 1

    val_rouge1.append(rouge1_value)
    val_rouge2.append(rouge2_value)
    val_bleu.append(bleu_value)

if n_failed:
    print(f"{n_failed} of {len(result_df)} samples could not be scored and were assigned 0")

In [None]:
# Attach the per-sample scores as new columns of the result frame.
for column_name, scores in (
    ('rouge_1', val_rouge1),
    ('rouge_2', val_rouge2),
    ('bleu', val_bleu),
):
    result_df[column_name] = scores

In [None]:
# Overwrite the CSV so that it now also contains the metric columns.
result_df.to_csv(path_or_buf='validation_results.csv', index=False)

## Compute rewards

In [None]:
# Read back the file written by the metrics section. It was saved relative to the
# working directory, so load it the same way instead of through an absolute path.
df = pd.read_csv('validation_results.csv')

In [None]:
# Reward-model input = prompt followed by the generated answer.
# pandas reads empty CSV fields back as NaN, so a missing answer is replaced by an
# empty string, which leaves just the prompt for that row.
text = (df['Question'] + df['Answer_gen'].fillna('')).tolist()

In [None]:
# Run the reward model over all prompt+answer texts in fixed-size batches.
batch_size = 32
predictions = []

for start in tqdm(range(0, len(text), batch_size)):
    stop = start + batch_size

    # Pad every batch only up to its own longest sequence.
    encoded = tokenizer(text[start:stop], padding='longest', return_tensors='pt').to(device)

    with torch.no_grad():
        logits = reward_model(**encoded).logits

    # Keep the results on the CPU so that GPU memory can be released between batches.
    predictions.append(logits.detach().cpu())

    del encoded
    gc.collect()
    torch.cuda.empty_cache()

# Stack the per-batch outputs into a single tensor along the sample dimension.
predictions = torch.cat(predictions, dim=0)

In [None]:
# The first logit of every prediction row is taken as that sample's reward.
reward_values = predictions.numpy()
rewards = [row[0] for row in reward_values]
df['rewards'] = rewards

# Combined reference-based score: plain mean of ROUGE-1, ROUGE-2 and BLEU per row.
df['metric'] = df[['rouge_1', 'rouge_2', 'bleu']].mean(axis='columns')

## Check Spearman and Kendall correlations

In [3]:
from scipy.stats import spearmanr, kendalltau
import pandas as pd

In [4]:
# Reload the scored samples from disk.
# NOTE(review): machine-specific absolute Windows path -- will not resolve elsewhere.
# NOTE(review): the cells below read the 'Q_Id', 'metric' and 'rewards' columns from this
# file, but no visible cell writes a CSV containing 'rewards'/'metric' -- confirm how
# this file was produced.
df = pd.read_csv(r'D:\CQA_RLHF\reward_model\regression\validation\validation_results.csv')

In [6]:
# Group the rows by question so that answers are only compared within one question.
grouped_data = df.groupby(by='Q_Id')

In [10]:
# Rank the answers inside every question (rank 1 = highest value, ties share the
# lowest rank), once by the reference-based metric and once by the reward.
rank_options = dict(method='min', ascending=False)
rank1 = grouped_data['metric'].rank(**rank_options)
rank2 = grouped_data['rewards'].rank(**rank_options)

# Correlate the two within-question rankings over all rows at once; p-values are discarded.
corr_spearman, _ = spearmanr(rank1, rank2)
corr_kendall, _ = kendalltau(rank1, rank2)

In [15]:
# Report both coefficients rounded to two decimals, one per line.
spearman_line = f'Spearman correlation: {round(corr_spearman, 2)}'
kendall_line = f'Kendall correlation: {round(corr_kendall, 2)}'
print(spearman_line + '\n' + kendall_line)

Spearman correlation: 0.09
Kendall correlation: 0.07


In [12]:
# Do not truncate long text cells when a DataFrame is displayed.
pd.options.display.max_colwidth = None

In [17]:
# Show every generated answer (with its scores) for the question whose Q_Id is 1.
df.loc[df['Q_Id'].eq(1)]

Unnamed: 0,Question,Answer_orig,Q_Id,Answer_gen,rouge_1,rouge_2,bleu,rewards,metric
10,"Question: If I understand correctly, the python syntax from... import... can be used in two ways\n\nfrom package-name import module-name\nfrom module-name import function-name\n\nI would like to know a bit of how Python internally treats the two different forms. Imagine, for example, that the interpreter gets ""from A import B"", does the interpreter actually try to determine whether A is a package-name/ module-name, or does it internally treat packages and modules as the same class of objects (something like Linux treats files and directories very similarly)?\nAnswer:","First of all, a module is a python file that contains classes and functions. when you say From A Import B python searches for A(a module) in the standard python library and then imports B(the function or class) which is the module if it finds A. If it doesn't it goes out and starts searching in the directory were packages are stored and searches for the package name( A ) and then if it finds it, it imports the Module name(B). If it fails in the past 2 processes it returns an error.\nHope this helps.",1,"In Python 3.2.2 and later, package names are actually only used as a placeholder for classes.\nFor this reason, you will have to distinguish different packages to see which is not a package-name as much as you would with packages that have a prefix like package-name.",0.268456,0.027211,0.031674,0.300511,0.109114
11,"Question: If I understand correctly, the python syntax from... import... can be used in two ways\n\nfrom package-name import module-name\nfrom module-name import function-name\n\nI would like to know a bit of how Python internally treats the two different forms. Imagine, for example, that the interpreter gets ""from A import B"", does the interpreter actually try to determine whether A is a package-name/ module-name, or does it internally treat packages and modules as the same class of objects (something like Linux treats files and directories very similarly)?\nAnswer:","First of all, a module is a python file that contains classes and functions. when you say From A Import B python searches for A(a module) in the standard python library and then imports B(the function or class) which is the module if it finds A. If it doesn't it goes out and starts searching in the directory were packages are stored and searches for the package name( A ) and then if it finds it, it imports the Module name(B). If it fails in the past 2 processes it returns an error.\nHope this helps.",1,"From the documentation, it is a known and undocumented technique. I've been using the interpreter from a previous point of view, but I'm not sure what the difference is in my experience, but I think you are using the same method if you try to get to the interpreter and return something similar.",0.233766,0.026316,0.049383,0.13837,0.103155
12,"Question: If I understand correctly, the python syntax from... import... can be used in two ways\n\nfrom package-name import module-name\nfrom module-name import function-name\n\nI would like to know a bit of how Python internally treats the two different forms. Imagine, for example, that the interpreter gets ""from A import B"", does the interpreter actually try to determine whether A is a package-name/ module-name, or does it internally treat packages and modules as the same class of objects (something like Linux treats files and directories very similarly)?\nAnswer:","First of all, a module is a python file that contains classes and functions. when you say From A Import B python searches for A(a module) in the standard python library and then imports B(the function or class) which is the module if it finds A. If it doesn't it goes out and starts searching in the directory were packages are stored and searches for the package name( A ) and then if it finds it, it imports the Module name(B). If it fails in the past 2 processes it returns an error.\nHope this helps.",1,"To understand the import logic of Python is to understand how the import operator (I) is used in the different ways. It should be clear to you what you expect, though I cannot rule out the difference between the two.\nA more complex example is to be able to explain what python does. Suppose that we import the Python script we are talking about, and the Python interpreter tries to get the same object from A.\nThe Python interpreter does not handle this, but, in Python 2, it handles the object itself by default.\nNow, if the interpreter tries to ""check"" the object A is a package-name/ module-name, then it's doing the same thing. If A is the package-name, then the interpreter tries to get A as a package-name (which, from the Python interpreter's point of view, is a package object). It is doing the same thing as doing it the same way.",0.348837,0.054688,0.060827,0.439226,0.154784
13,"Question: If I understand correctly, the python syntax from... import... can be used in two ways\n\nfrom package-name import module-name\nfrom module-name import function-name\n\nI would like to know a bit of how Python internally treats the two different forms. Imagine, for example, that the interpreter gets ""from A import B"", does the interpreter actually try to determine whether A is a package-name/ module-name, or does it internally treat packages and modules as the same class of objects (something like Linux treats files and directories very similarly)?\nAnswer:","First of all, a module is a python file that contains classes and functions. when you say From A Import B python searches for A(a module) in the standard python library and then imports B(the function or class) which is the module if it finds A. If it doesn't it goes out and starts searching in the directory were packages are stored and searches for the package name( A ) and then if it finds it, it imports the Module name(B). If it fails in the past 2 processes it returns an error.\nHope this helps.",1,"yes.\nIf the interpreter is going to go from A.b and A.b to A.c, it does not do the same for the module that A.b does.\nIf, on the other hand, the module itself doesn't go from B.c to B.c, it doesn't know.\n(The first case does not really get any further than that.)",0.335404,0.075472,0.046944,0.285275,0.152606
14,"Question: If I understand correctly, the python syntax from... import... can be used in two ways\n\nfrom package-name import module-name\nfrom module-name import function-name\n\nI would like to know a bit of how Python internally treats the two different forms. Imagine, for example, that the interpreter gets ""from A import B"", does the interpreter actually try to determine whether A is a package-name/ module-name, or does it internally treat packages and modules as the same class of objects (something like Linux treats files and directories very similarly)?\nAnswer:","First of all, a module is a python file that contains classes and functions. when you say From A Import B python searches for A(a module) in the standard python library and then imports B(the function or class) which is the module if it finds A. If it doesn't it goes out and starts searching in the directory were packages are stored and searches for the package name( A ) and then if it finds it, it imports the Module name(B). If it fails in the past 2 processes it returns an error.\nHope this helps.",1,"If you can explain better how Python does it, then this may help.\n\nFrom a very basic point of view, if your program uses Python itself, it will handle package-name, module-name, and function-name in the same way. However, if you're using the Python interpreter, you should not treat these as separate components (even if they're just parts of the same thing).\n",0.30303,0.04908,0.060403,-0.0829,0.137504
15,"Question: If I understand correctly, the python syntax from... import... can be used in two ways\n\nfrom package-name import module-name\nfrom module-name import function-name\n\nI would like to know a bit of how Python internally treats the two different forms. Imagine, for example, that the interpreter gets ""from A import B"", does the interpreter actually try to determine whether A is a package-name/ module-name, or does it internally treat packages and modules as the same class of objects (something like Linux treats files and directories very similarly)?\nAnswer:","First of all, a module is a python file that contains classes and functions. when you say From A Import B python searches for A(a module) in the standard python library and then imports B(the function or class) which is the module if it finds A. If it doesn't it goes out and starts searching in the directory were packages are stored and searches for the package name( A ) and then if it finds it, it imports the Module name(B). If it fails in the past 2 processes it returns an error.\nHope this helps.",1,"The simplest way to answer this question is to write the code with an explicit function that works like the python function itself (which is not something that can be called in a language other than Python, but could be written like so). If it looks like a function can treat the objects as classes then you could write something like this:\nimport py_py as class from sys.modules.py.",0.282353,0.02381,0.028481,0.131596,0.111548
16,"Question: If I understand correctly, the python syntax from... import... can be used in two ways\n\nfrom package-name import module-name\nfrom module-name import function-name\n\nI would like to know a bit of how Python internally treats the two different forms. Imagine, for example, that the interpreter gets ""from A import B"", does the interpreter actually try to determine whether A is a package-name/ module-name, or does it internally treat packages and modules as the same class of objects (something like Linux treats files and directories very similarly)?\nAnswer:","First of all, a module is a python file that contains classes and functions. when you say From A Import B python searches for A(a module) in the standard python library and then imports B(the function or class) which is the module if it finds A. If it doesn't it goes out and starts searching in the directory were packages are stored and searches for the package name( A ) and then if it finds it, it imports the Module name(B). If it fails in the past 2 processes it returns an error.\nHope this helps.",1,"You might say you can't interpret the modules into the import modules, and you can do so via package-name, but I have no doubt that if you tried to use a single-instance import the first time the module got imported you would end up with an interpreter that doesn't recognize it, and that's the wrong way to go.\nOn the other hand, if you can't really read the code into the import module, you probably can't use package-name, which is the same approach as package-name, and which means package-name isn't exactly a good idea as long as you know what package is actually imported.",0.28972,0.04717,0.036232,0.276913,0.124374
17,"Question: If I understand correctly, the python syntax from... import... can be used in two ways\n\nfrom package-name import module-name\nfrom module-name import function-name\n\nI would like to know a bit of how Python internally treats the two different forms. Imagine, for example, that the interpreter gets ""from A import B"", does the interpreter actually try to determine whether A is a package-name/ module-name, or does it internally treat packages and modules as the same class of objects (something like Linux treats files and directories very similarly)?\nAnswer:","First of all, a module is a python file that contains classes and functions. when you say From A Import B python searches for A(a module) in the standard python library and then imports B(the function or class) which is the module if it finds A. If it doesn't it goes out and starts searching in the directory were packages are stored and searches for the package name( A ) and then if it finds it, it imports the Module name(B). 
If it fails in the past 2 processes it returns an error.\nHope this helps.",1,"I think you are correct about the Python syntax for the second option.\nIf A is a package-name, B is a module-name, and vice versa (I mean there are different types of packages and modules).\nHowever, if you take the second way and think that the interpreter thinks A is a package-name/module-name, then it would be easier to read and write code to handle it (probably in a package-name/module-name format) so that you don't have to change a single class of object into a package-name/module-name.\nThe other way around is to use package-name and module-name as names of the modules.\nIf a package-name/module-name is a package name, it's probably a package-name, not module-name.\nTo me, the best way to approach this problem is to use the Python package manager (pim), which has a good set of useful programming tools and a good set of packages and modules for the package-name, and the second option is to simply use package-name and module-name as you have in the first example, but I don't know if the third option is a better approach.\nIf you are using",0.345277,0.052459,0.047962,0.212113,0.148566
18,"Question: If I understand correctly, the python syntax from... import... can be used in two ways\n\nfrom package-name import module-name\nfrom module-name import function-name\n\nI would like to know a bit of how Python internally treats the two different forms. Imagine, for example, that the interpreter gets ""from A import B"", does the interpreter actually try to determine whether A is a package-name/ module-name, or does it internally treat packages and modules as the same class of objects (something like Linux treats files and directories very similarly)?\nAnswer:","First of all, a module is a python file that contains classes and functions. when you say From A Import B python searches for A(a module) in the standard python library and then imports B(the function or class) which is the module if it finds A. If it doesn't it goes out and starts searching in the directory were packages are stored and searches for the package name( A ) and then if it finds it, it imports the Module name(B). If it fails in the past 2 processes it returns an error.\nHope this helps.",1,"In Python 3, you can do this with a dictionary of functions. However, if you try to do the same from Python 2 (Python 2.X) then your interpreter treats the objects as different, even if they are the same thing.",0.257143,0.0,0.01734,0.268323,0.091494
19,"Question: If I understand correctly, the python syntax from... import... can be used in two ways\n\nfrom package-name import module-name\nfrom module-name import function-name\n\nI would like to know a bit of how Python internally treats the two different forms. Imagine, for example, that the interpreter gets ""from A import B"", does the interpreter actually try to determine whether A is a package-name/ module-name, or does it internally treat packages and modules as the same class of objects (something like Linux treats files and directories very similarly)?\nAnswer:","First of all, a module is a python file that contains classes and functions. when you say From A Import B python searches for A(a module) in the standard python library and then imports B(the function or class) which is the module if it finds A. If it doesn't it goes out and starts searching in the directory were packages are stored and searches for the package name( A ) and then if it finds it, it imports the Module name(B). If it fails in the past 2 processes it returns an error.\nHope this helps.",1,"I think there is a different way to represent the two different types of data that the interpreter can actually use. It is the same process as the package-name, but in an abstract way.\nThe first approach is using an ""a"" argument (or a ""b"" argument) that the interpreter can assign to the result of the Python process. The reason you are using this approach is because the interpreter has to interpret both a package-name and a module-name.",0.344444,0.067416,0.041436,0.26566,0.151099
