In [3]:
import pandas as pd
from myutils import *
from train_para import *

In [4]:
import os.path
import pickle
import time
import os
from myutils import *
import types
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import metrics
import numpy as np
import csv
import openai
import re
from train_para import generate_parser
from transformers import AutoTokenizer, AutoModelForCausalLM

# Short model tag; it is interpolated into the './data/{mpt}-{domain}-*.pkl'
# cache and result file names used by the functions below.
mpt = 'vicuna-13b-v1.3'
# Experiment id; it fills the {domain} slot of those same file names.
args_exp_file = 7
def get_output_scores_here(scores, sequences):
    """Pick, for every entry of ``scores``, the probability of the matching token.

    Each element of ``scores`` is soft-maxed over its last dimension, the
    columns named by ``sequences[:, step]`` are gathered, and the maximum of
    the gathered values is kept.

    Returns:
        A list with one 0-dim tensor per element of ``scores``.
    """
    picked = []
    for step, logits in enumerate(scores):
        probabilities = torch.softmax(logits, dim=-1)
        token_columns = sequences[:, step]
        picked.append(probabilities[:, token_columns].max())
    return picked

# COMMAND ----------

def back_exp(target, targets, domain, outputs_forward):
    """Score how well the model's own explanation leads back to the concept.

    The forward generation is decoded and wrapped into a
    '"<explanation>" is related to what?' prompt. The words in ``targets``
    are masked through ``mask_input_words`` and the model is asked to
    generate with ``force_words_ids`` set to several surface forms (original,
    lower, upper, capitalised, each with and without a leading space) of the
    concept.

    Args:
        target: Not used by this function; kept for call compatibility.
        targets: Words of the concept; the last element is read as the full
            concept string.
        domain: Not used by this function; kept for call compatibility.
        outputs_forward: Generation output whose ``sequences[0]`` is decoded
            as the forward explanation.

    Returns:
        ``(back_score, backward_text)`` where ``back_score`` is
        ``exp(sequences_scores)`` of the returned sequence and
        ``backward_text`` is its decoded text. If generation fails the error
        is printed and ``(0, "")`` is returned.
    """
    print('________Back Reference Score__________')
    text_forward = tokenizer.decode(outputs_forward.sequences[0], skip_special_tokens=True)
    # endswith() rather than indexing, so an empty decode cannot raise IndexError.
    if text_forward.endswith('User '):
        text_forward = text_forward[:-5]
    if text_forward.endswith('\n'):
        text_forward = text_forward[:-1]

    tokenizer.unk_token = args_unk_token
    LEADING_DECODING_BACK = "\"{contents}\" is related to what?"
    question_backward = LEADING_DECODING_BACK.format(contents=text_forward)
    question_backward = PROMPT_FOR_GENERATION_FORMAT.format(instruction=question_backward)
    question_backward_masked = mask_input_words(question_backward, targets, tokenizer)
    input_ids_mask = tokenizer(question_backward_masked, return_tensors="pt").input_ids.to(my_device)
    ori_target = targets[-1]
    # De-duplicated casing variants of the concept.
    target_words = list(set([ori_target, ori_target.lower(), ori_target.upper(), ' '.join([x.capitalize() for x in ori_target.split()])]))
    print(target_words)
    target_words = target_words + [' ' + x for x in target_words]
    force_words_ids = tokenizer(target_words, add_special_tokens=False).input_ids

    try:
        # NOTE(review): `args_backward_search_size` is not assigned anywhere in
        # the visible notebook code (only `args_backward_search_length` is).
        # Unless a star import provides it, this call raises NameError, which
        # the previous bare `except:` silently turned into a score of 0.
        outputs_constraint = model.generate(
            input_ids_mask,
            force_words_ids=[force_words_ids],
            output_scores=True,
            return_dict_in_generate=True,
            max_length=input_ids_mask.shape[-1]+len(force_words_ids)+args_backward_search_length,
            stopping_criteria=stopping_criteria,
            num_beams=args_backward_search_size,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
        )
        backward_text = tokenizer.decode(outputs_constraint.sequences[0])
        print(backward_text)
        back_score = torch.exp(outputs_constraint.sequences_scores).item()
    except Exception as exc:
        # Still best-effort (score 0, empty text), but the failure is now
        # reported so a systematic error cannot masquerade as a real score.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print('back_exp: constrained generation failed: {!r}'.format(exc))
        back_score = 0
        backward_text = ""

    return back_score, backward_text

def forward_exp(target, targets, domain):
    """Ask the model to explain ``target`` and hand back the raw generation.

    ``targets`` and ``domain`` are accepted but not used. The first element
    of the returned pair is always the constant 0; the second is the first
    value returned by ``ask_original_question``.
    """
    tokenizer.unk_token = args_unk_token
    instruction = question_template.format(concept=target)
    prompt = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction)
    outputs_forward, _decoding_scores = ask_original_question(
        prompt,
        model,
        tokenizer,
        stopping_criteria=stopping_criteria,
    )
    return 0, outputs_forward

def percentile_5(x):
    """Return the 5th percentile of ``x`` (statistic passed to the bootstrap)."""
    lower_tail_percent = 5
    return np.percentile(x, lower_tail_percent)

def benchmark_exp(data_common_file):
    """Compute (or load) reference intervals from the common-concept CSV.

    For every row of ``data_common_file`` the forward and backward
    experiments are run on ``row['Concept']``. A 95% bootstrap confidence
    interval of the 5th percentile is then computed for the forward scores
    and for the backward scores. The result is cached in
    ``./data/{mpt}-{domain}-benchmark.pkl``; when that file exists it is
    loaded and the CSV is not read at all.

    Args:
        data_common_file: Path to a CSV with ``Concept`` and ``Domain``
            columns.

    Returns:
        ``(f_intervals, b_intervals)`` - the two confidence intervals.
    """
    cache_path = './data/{mpt}-{domain}-benchmark.pkl'.format(mpt=mpt, domain=args_exp_file)
    if os.path.exists(cache_path):
        # NOTE: pickle executes code on load; only read caches produced by
        # this notebook.
        with open(cache_path, 'rb') as cache_file:
            intervals = pickle.load(cache_file)
        return intervals['f_intervals'], intervals['b_intervals']

    from scipy.stats import bootstrap

    fscores = []
    bscores = []
    with open(data_common_file, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            target = row['Concept']
            domain = row['Domain'].lower()
            targets = word_tokenize(target)
            # remove the stop words in targets; the full concept goes last
            targets = [x for x in targets if x.lower() not in stop_words] + [target]
            print('+++++++++++++++++++')
            print(target, targets)
            fscore, outputs_forward = forward_exp(target, targets, domain)
            bscore, _ = back_exp(target, targets, domain, outputs_forward)
            print(row)
            print(fscore, bscore)
            fscores.append(fscore)
            bscores.append(bscore)
            print('-------------------')

    rng = np.random.default_rng()
    fscores = np.array(fscores)
    bscores = np.array(bscores)
    # NOTE(review): forward_exp currently always returns 0 as its score, so
    # `fscores` is constant here - confirm the resulting interval is usable.
    res_fscore = bootstrap((fscores,), percentile_5, confidence_level=0.95,
                           random_state=rng)
    res_bscore = bootstrap((bscores,), percentile_5, confidence_level=0.95,
                           random_state=rng)
    f_intervals = res_fscore.confidence_interval
    b_intervals = res_bscore.confidence_interval
    print(f_intervals, b_intervals)
    print(fscores)
    print(bscores)
    intervals = {'f_intervals': f_intervals, 'b_intervals': b_intervals, 'fscores': fscores, 'bscores': bscores}
    # Context manager so the cache file is flushed and closed deterministically.
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(intervals, cache_file)
    return f_intervals, b_intervals

def labeling(data_file):
    """Produce one boolean label per row of ``data_file``.

    Rows whose ``Real`` column is not true are labelled ``False`` without
    querying any model. For the other rows the local model is asked to
    explain the concept, and that explanation is graded from 1 to 9 by
    ``gpt-4-0613`` against the row's ``Background``. A score in [1, 5) gives
    ``False`` and a score in (5, 9] gives ``True``. Anything else (exactly 5,
    out of range, or an unparsable reply) is decided interactively through
    ``input()``.

    Results are cached: grader messages go to
    ``./data/{mpt}-{domain}-messages.pkl`` and the per-row labels to
    ``./data/{mpt}-{domain}-labels.pkl``. When the labels file exists it is
    loaded and returned directly.

    Args:
        data_file: Path to a CSV with ``Real``, ``Concept``, ``Domain`` and
            ``Background`` columns.

    Returns:
        A list of booleans, one per CSV row, in row order.
    """
    template = \
'''
Using the background information provided, assess the participant's understanding of the target concept from their explanation. Please provide a rating between 1 and 9, with 9 being the best score (9 = Excellent, 5 = Average, 1 = Poor).

Concept:
{concept}

Background:
{background}

Participant's Explanation:
{response}

Scoring Guide:
Award a score of "7-9 (Excellent)" if the participant's explanation of the concept is correct, with the inclusion of essential details demonstrating a comprehensive understanding of the concept. Minor inaccuracies can be ignored.
Award a score of "4-6 (Average)" if the participant's explanation of the concept is partially correct but lacks essential details.
Award a score of "1-3 (Poor)" if the participant's explanation of the concept is incorrect or includes obvious errors.

The term "essential details" refers to those details that only someone knowledgeable in this concept would suggest or that capture the core idea of the concept. These details cannot be deduced from a simple literal understanding.
Please note that the length of the explanation is not a determinant of the score. A concise yet accurate explanation with essential details is worthy of a high score.
Your Score (please provide a number between 1 and 9):
'''
    messages_path = './data/{mpt}-{domain}-messages.pkl'.format(mpt=mpt, domain=args_exp_file)
    labels_path = './data/{mpt}-{domain}-labels.pkl'.format(mpt=mpt, domain=args_exp_file)

    # Reuse the cached per-row labels. The previous code rebuilt labels from
    # the messages pickle, but that file only holds entries for real concepts
    # (so it cannot line up with the CSV rows) and its tuples have four
    # fields, so reading index 4 raised IndexError.
    if os.path.exists(labels_path):
        with open(labels_path, 'rb') as labels_file:
            return pickle.load(labels_file)

    labels = []
    messages = []
    with open(data_file, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            # Accept 'TRUE' as well as 'True' (the form pandas writes).
            if row['Real'].strip().upper() != 'TRUE':
                labels.append(False)
                continue

            target = row['Concept']
            domain = row['Domain'].lower()
            question = question_template_long.format(
                concept=target,
                domain=domain)
            question = PROMPT_FOR_GENERATION_FORMAT.format(instruction=question)
            outputs_forward, _ = ask_original_question(question, model, tokenizer, stopping_criteria,
                                                       max_length=args_forward_search_length*2)
            explanation = tokenizer.decode(outputs_forward.sequences[0], skip_special_tokens=True)
            # endswith() so an empty decode cannot raise IndexError.
            if explanation.endswith('User '):
                explanation = explanation[:-5]
            if explanation.endswith('.'):
                explanation = explanation[:-1]

            # Remove citation markers such as [^2^] from the background
            # (the pattern only matches a single character between the carets).
            background = re.sub(r'\[\^.\^\]', '', row['Background'])
            grading_prompt = template.format(background=background, concept=row['Concept'], response=explanation)

            # Retry until the grader answers; long requests may drop the
            # connection, hence the request timeout and the pause.
            while True:
                try:
                    completion = openai.ChatCompletion.create(
                        model="gpt-4-0613",
                        temperature=0,
                        messages=[
                            {"role": "system", "content": "You are a helpful assistant."},
                            {"role": "user", "content": grading_prompt},
                        ],
                        request_timeout = 30,
                    )
                    break
                except Exception as e:
                    print(e)
                    time.sleep(5)

            print(grading_prompt)
            message = completion['choices'][0]['message']['content']
            print(message)
            # keep only the digits and '.'
            message_number = re.sub(r'[^\d.]', '', message)
            try:
                score = float(message_number)
            except ValueError:
                # e.g. '' or '8..' from a multi-sentence reply: NaN fails both
                # range checks below and so falls through to manual judgement.
                score = float('nan')
            messages.append((target, grading_prompt, message, score))

            if 1 <= score < 5:
                label = False
            elif 5 < score <= 9:
                label = True
            else:
                # decide manually
                while True:
                    manual = input(
                        'Please judge if the response explains the concept correctly instead of guessing by literal meaning. Please answer in [ Good , Bad ].')
                    if manual in ['Good', 'Bad']:
                        break
                label = manual == 'Good'

            print(label)
            labels.append(label)

    with open(messages_path, 'wb') as messages_file:
        pickle.dump(messages, messages_file)
    with open(labels_path, 'wb') as labels_file:
        pickle.dump(labels, labels_file)
    return labels

def cal_score_min(bscores, b_intervals):
    """Evaluate using the smallest score of each group as the prediction.

    Each item of ``bscores`` is read as ``(score_group, label, ...)``. The
    decision threshold is the midpoint of ``b_intervals[0]`` and
    ``b_intervals[1]``.

    Returns:
        ``(accuracy, f1, auc)``.
    """
    lowest_per_group = [min(entry[0]) for entry in bscores]
    truth = [int(entry[1]) for entry in bscores]
    pred = np.array(lowest_per_group)
    y = np.array(truth)
    cutoff = (b_intervals[0] + b_intervals[1]) / 2
    fpr, tpr, _thresholds = metrics.roc_curve(y, pred)
    accuracy = metrics.accuracy_score(y, pred > cutoff)
    f1 = metrics.f1_score(y, pred > cutoff)
    area = metrics.auc(fpr, tpr)
    return accuracy, f1, area

def sum_scores(scores):
    """Return the weighted mean of ``scores`` with weights 1, 1/2, 1/4, ...

    Earlier scores count more: the i-th score (0-based) gets weight 0.5**i,
    and the weighted sum is divided by the sum of the weights.
    """
    weighted_total = 0
    weight_total = 0
    for position, value in enumerate(scores):
        weight = 0.5 ** position
        weighted_total += weight * value
        weight_total += weight
    return weighted_total / weight_total

def cal_score_accumulate(bscores, b_intervals):
    """Evaluate using the ``sum_scores`` aggregate of each group as prediction.

    Each item of ``bscores`` is read as ``(score_group, label, ...)``. The
    decision threshold is the midpoint of ``b_intervals[0]`` and
    ``b_intervals[1]``.

    Returns:
        ``(accuracy, f1, auc)``.
    """
    aggregated = [sum_scores(entry[0]) for entry in bscores]
    truth = [int(entry[1]) for entry in bscores]
    pred = np.array(aggregated)
    y = np.array(truth)
    cutoff = (b_intervals[0] + b_intervals[1]) / 2
    fpr, tpr, _thresholds = metrics.roc_curve(y, pred)
    accuracy = metrics.accuracy_score(y, pred > cutoff)
    f1 = metrics.f1_score(y, pred > cutoff)
    area = metrics.auc(fpr, tpr)
    return accuracy, f1, area


def cal_score_first(bscores, b_intervals):
    """Evaluate using the first score of each group as the prediction.

    Each item of ``bscores`` is read as ``(score_group, label, ...)``. The
    decision threshold is the midpoint of ``b_intervals[0]`` and
    ``b_intervals[1]``.

    Returns:
        ``(accuracy, f1, auc)``.
    """
    leading = [entry[0][0] for entry in bscores]
    truth = [int(entry[1]) for entry in bscores]
    pred = np.array(leading)
    y = np.array(truth)
    cutoff = (b_intervals[0] + b_intervals[1]) / 2
    fpr, tpr, _thresholds = metrics.roc_curve(y, pred)
    accuracy = metrics.accuracy_score(y, pred > cutoff)
    f1 = metrics.f1_score(y, pred > cutoff)
    area = metrics.auc(fpr, tpr)
    return accuracy, f1, area

from itertools import combinations

def merge_entities(entities, question):
    """Fuse entities that appear next to each other in ``question``.

    Every pair of entities is tried both glued together and joined by a
    single space; if either form occurs in the question (case-insensitively)
    the fused form is kept, otherwise the two parts are kept unless they are
    already a substring of what has been collected. The procedure repeats on
    its own result until the number of entities stops changing.
    """
    question_lower = question.lower()
    collected = []
    for left, right in combinations(entities, 2):
        glued = left + right
        spaced = left + ' ' + right

        if glued.lower() not in question_lower and spaced.lower() not in question_lower:
            # No fused form in the question: keep the parts not yet covered.
            # The joined text is rebuilt each time because the first append
            # can affect the second check.
            for part in (left, right):
                if part not in ' '.join(collected):
                    collected.append(part)
            continue

        # Prefer the glued spelling only when it occurs verbatim.
        collected.append(glued if glued in question else spaced)

    # Make sure nothing from the input is lost (covers single-entity input).
    for entity in entities:
        if entity not in ' '.join(collected):
            collected.append(entity)

    if len(collected) != len(entities):
        return merge_entities(collected, question)
    return collected
def remove_covered_entities(entities):
    """Drop entities that are contained in a longer (or equal) kept entity.

    Entities are visited from longest to shortest (ties keep input order).
    An entity is kept unless it is a substring of one already kept; empty
    strings are always dropped.

    The previous implementation tested against the kept entities
    concatenated without a separator, so an entity spanning the boundary of
    two kept ones (e.g. "cd" against "abc" + "def") was wrongly treated as
    covered. Each kept entity is now checked on its own.

    Args:
        entities: Iterable of entity strings.

    Returns:
        The surviving entities, ordered longest first.
    """
    kept = []
    for entity in sorted(entities, key=len, reverse=True):
        if not entity or any(entity in longer for longer in kept):
            continue
        kept.append(entity)
    return kept



def start_test(data_file, data_common_file):
    """Run the forward/backward experiment over ``data_file`` and save results.

    Steps:
      1. Get the reference intervals from ``benchmark_exp`` and the per-row
         labels from ``labeling``.
      2. For every row and every (entity list, question) pair of that row,
         clean the entity list (order by position in the question, merge
         adjacent entities, drop covered ones, drop common English words
         unless that would leave nothing, order by ``entity_rare``) and run
         ``forward_exp`` / ``back_exp`` on each remaining entity.
      3. Compute accuracy / F1 / AUC with each ``cal_score_*`` aggregator.
      4. Pickle everything to
         ``./data/{mpt}-{domain}-general-exp_results.pkl`` and print the
         raw score lists.

    Args:
        data_file: CSV with ``Concept``, ``Entities``, ``Questions`` and
            ``Domain`` columns.
        data_common_file: CSV passed on to ``benchmark_exp``.
    """
    fscores = []
    bscores = []
    with torch.no_grad():
        f_intervals, b_intervals = benchmark_exp(data_common_file)
        labels = labeling(data_file)
        with open(data_file, newline='', encoding='utf-8') as csvfile:
            rows = list(csv.DictReader(csvfile))
        if len(rows) != len(labels):
            # zip() below stops at the shorter input; make that visible
            # because it means rows and labels are not aligned.
            print('WARNING: {} rows but {} labels; extra items are ignored'.format(len(rows), len(labels)))

        for row, label in zip(rows, labels):
            concept = row['Concept']
            # NOTE(security): eval() executes whatever is stored in these CSV
            # cells. Only use with data files you trust; the cells look like
            # plain list literals, for which ast.literal_eval would suffice.
            extracted_concepts = eval(row['Entities'])
            questions = eval(row['Questions'])
            domain = row['Domain'].lower()
            print('+++++++++++++++++++')
            for extracted_concept, question in zip(extracted_concepts, questions):
                print("original: ", extracted_concept)
                # order entities by where they first occur in the question
                extracted_concept = sorted(extracted_concept, key=lambda x: question.lower().find(x.lower()))
                extracted_concept = merge_entities(extracted_concept, question)
                extracted_concept_cool = remove_covered_entities(extracted_concept)
                extracted_concept = [x for x in extracted_concept if x in extracted_concept_cool]
                # drop common English words, unless nothing would remain
                new_concept = [x for x in extracted_concept if x not in common_english_words]
                if new_concept:
                    extracted_concept = new_concept
                # order by the entity_rare score, smallest value first
                extracted_concept = sorted(extracted_concept, key=entity_rare, reverse=False)
                print("after: ", extracted_concept)
                fscore_group = []
                bscore_group = []
                text_forwards = []
                text_backwards = []
                for target in extracted_concept:
                    targets = word_tokenize(target)
                    # remove the stop words in targets; the full entity goes last
                    targets = [x for x in targets if x.lower() not in stop_words] + [target]
                    print(target, targets)
                    fscore, output_forward = forward_exp(target, targets, domain)
                    text_forward = tokenizer.decode(output_forward.sequences[0], skip_special_tokens=True)
                    bscore, text_backward = back_exp(target, targets, domain, output_forward)
                    fscore_group.append(fscore)
                    bscore_group.append(bscore)
                    text_forwards.append(text_forward)
                    text_backwards.append(text_backward)
                    print(target, bscore)
                print('<<<<<<<<<<<<<<<<<<')
                print(concept, bscore_group)
                print('<<<<<<<<<<<<<<<<<<')
                # the cal_score_* aggregators need at least one score per group
                if not fscore_group:
                    fscore_group = [0]
                if not bscore_group:
                    bscore_group = [0]
                fscores.append((fscore_group, label, extracted_concept, text_forwards))
                bscores.append((bscore_group, label, extracted_concept, text_backwards))

        # calculate AUC, accuracy and F1 under each aggregation strategy
        fmetrics = {}
        bmetrics = {}
        for metric in [cal_score_min, cal_score_accumulate, cal_score_first]:
            print(metric.__name__)
            accuracy_bscore, f1_bscore, auc_bscore = metric(bscores, b_intervals)
            bmetrics[metric.__name__] = [auc_bscore, accuracy_bscore, f1_bscore]
            print(auc_bscore, accuracy_bscore, f1_bscore)
            accuracy_fscore, f1_fscore, auc_fscore = metric(fscores, f_intervals)
            fmetrics[metric.__name__] = [auc_fscore, accuracy_fscore, f1_fscore]
            print(auc_fscore, accuracy_fscore, f1_fscore)

        exp_results = {'fscores': fscores, 'bscores': bscores,
                       'fmetrics': fmetrics, 'bmetrics': bmetrics,
                       'f_intervals': f_intervals, 'b_intervals': b_intervals, 'mpt': mpt}
        results_path = './data/{mpt}-{domain}-general-exp_results.pkl'.format(mpt=mpt, domain=args_exp_file)
        # Context manager so the results file is flushed and closed.
        with open(results_path, 'wb') as results_file:
            pickle.dump(exp_results, results_file)
        print(fscores)
        print(bscores)

def entity_rare(entity):
    """Return the product of per-word factors for a space-separated entity.

    A word found in ``word_rare`` that does not start with an upper-case
    letter contributes its ``word_rare`` value; any other word contributes
    ``exp(-len(word_rare) / 100)``.

    NOTE(review): with a large vocabulary that fallback factor may underflow
    to 0.0 - confirm this is acceptable for the sort that uses this value.
    """
    score = 1
    for token in entity.split(' '):
        known_lowercase = token in word_rare and not token[0].isupper()
        if known_lowercase:
            score *= word_rare[token]
        else:
            score *= np.exp(-len(word_rare) / 100)
    return score

In [5]:
%tb
# One row per model to evaluate:
# [HF model id, unk token, unk token id (None = look it up in the vocab),
#  stop tokens, forward search length, backward search length]
para_list = [
                ['lmsys/vicuna-13b-v1.3', '...', 856, ['</s>'], 200, 15],
]

# Word list, one word per line, lines starting with '#' are skipped.
common_english_words_file = './data/wiki-100k.txt'
common_english_words = []
with open(common_english_words_file, 'r', encoding='utf-8') as fr:
    for line in fr:
        if line[0] == '#':
            continue
        common_english_words.append(line.strip())
# Per-word factor that decays with the word's position in the list.
# (`rank` instead of `id`, which shadowed the builtin.)
word_rare = {}
for rank, word in enumerate(common_english_words):
    word_rare[word] = np.exp(-(rank + 1) / 100)
# Only the first 10000 entries are treated as "common" words.
common_english_words = common_english_words[0:10000]

#parser = generate_parser()
#args = parser.parse_args()
data_file = 'data/data_knowledge_{}_general_multi_small.csv'.format(7)
data_common_file = 'data/data_common_small.csv'
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
INTRO_BLURB = ""
PROMPT_FOR_GENERATION_FORMAT_M = \
"""
{instruction_key}
{instruction}
{response_key}
""".format(
intro=INTRO_BLURB,
instruction_key=INSTRUCTION_KEY,
instruction="{instruction}",
response_key=RESPONSE_KEY,
)
PROMPT_FOR_GENERATION_FORMAT_V = "USER: {instruction} ASSISTANT:"
PROMPT_FOR_GENERATION_FORMAT_F = "User: {instruction}\nAssistant:"
model_cache_dir = '/media/j1nsei/SSD/LLMs/data/cache'
for para in para_list:
    args_model = para[0]
    args_stop_token = para[3]
    args_unk_token = para[1]
    args_unk_token_id = para[2]
    args_forward_search_length = para[4]
    args_backward_search_length = para[5]
    question_template = "Explain the \"{concept}\" within one short paragraph."
    question_template_long = "Explain the \"{concept}\" within one paragraph with details."
    if 'vicuna' in args_model:
        PROMPT_FOR_GENERATION_FORMAT = PROMPT_FOR_GENERATION_FORMAT_V
    elif 'mpt' in args_model or 'dolly' in args_model or 'alpaca' in args_model:
        PROMPT_FOR_GENERATION_FORMAT = PROMPT_FOR_GENERATION_FORMAT_M
    elif 'falcon' in args_model:
        PROMPT_FOR_GENERATION_FORMAT = PROMPT_FOR_GENERATION_FORMAT_F
    else:
        # Previously the template stayed unbound (or kept the previous
        # model's value) when no family matched.
        raise ValueError('No prompt format known for model {!r}'.format(args_model))

    # print(args)
    #max_memory = {0: "11GiB", 1: "40GiB", 2: "40GiB", "cpu": "40GiB"}
    max_memory = {0: "10GiB", "cpu": "40GiB"}

    # Tag used in the cache/result file names; derived from the model id so
    # several models in para_list do not share the same files
    # ('lmsys/vicuna-13b-v1.3' -> 'vicuna-13b-v1.3').
    mpt = args_model.split('/')[-1]
    model = AutoModelForCausalLM.from_pretrained(args_model, cache_dir=model_cache_dir,
                                                 trust_remote_code=True, device_map="auto",
                                                 max_memory=max_memory, torch_dtype=torch.bfloat16)
    model.eval()
    if 'alpaca' in args_model:
        tokenizer = AutoTokenizer.from_pretrained(args_model, cache_dir=model_cache_dir, device_map="auto",
                                                  unk_token="<unk>",
                                                  bos_token="<s>",
                                                  eos_token="</s>")
    else:
        tokenizer = AutoTokenizer.from_pretrained(args_model, device_map="auto", cache_dir=model_cache_dir)
    if args_unk_token_id is None:
        args_unk_token_id = tokenizer.vocab[args_unk_token]
    tokenizer.unk_token_id = args_unk_token_id
    tokenizer.unk_token = args_unk_token
    print('Load Model')
    # id2word = {v:k for k, v in tokenizer.get_vocab().items()}
    # tokenizer.id2word = id2word
    # tokenizer.convert_ids_to_tokens = types.MethodType(convert_ids_to_tokens, tokenizer)

    # model.to(torch.device('cpu'))

    # COMMAND ----------
    # NOTE(review): `stop_words` here holds the generation stop tokens, but
    # benchmark_exp/start_test read the same global to filter words out of
    # the tokenised concept. If a natural-language stop-word list was meant
    # there, this assignment overrides it - please confirm.
    stop_words = args_stop_token
    stop_words_ids = [
        tokenizer.vocab[stop_word] for stop_word in stop_words]
    s1 = StoppingCriteriaSub(stops=stop_words_ids, encounters=1)
    print(stop_words_ids)
    stopping_criteria = StoppingCriteriaList([s1])
    start_test(data_file, data_common_file)
    # Release the model before the next one is loaded.
    del model
    torch.cuda.empty_cache()
    # COMMAND ----------

No traceback available to show.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Load Model
[2]
+++++++++++++++++++
original:  ['Second Amendment']
after:  ['Second Amendment']
Second Amendment ['Second', 'Amendment', 'Second Amendment']


2023-12-20 20:00:37.652650: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-20 20:00:38.275862: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-20 20:00:39.348291: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-12-20 20:00:39.348389: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

------------------------------------------------
Ori Question Output:
<s> USER: Explain the "Second Amendment" within one short paragraph. ASSISTANT: The Second Amendment to the United States Constitution protects the right of individuals to keep and bear arms. It was adopted as part of the Bill of Rights in 1791 and has been the subject of much debate and interpretation over the years. The amendment has been interpreted by some to guarantee an individual's right to own firearms for self-defense and hunting, while others argue that it only applies to the context of a well-regulated militia.
________Back Reference Score__________
['SECOND AMENDMENT', 'second amendment', 'Second Amendment']
Second Amendment 0
<<<<<<<<<<<<<<<<<<
Second Amendment [0]
<<<<<<<<<<<<<<<<<<
original:  ['gun control laws', 'Second Amendment']
after:  ['Second Amendment', 'gun control laws']
Second Amendment ['Second', 'Amendment', 'Second Amendment']
------------------------------------------------
Ori Question 

------------------------------------------------
Ori Question Output:
<s> USER: Explain the "cellular metabolism" within one short paragraph. ASSISTANT: Cellular metabolism refers to the various chemical reactions that occur within a cell to maintain life. These reactions include the breakdown of nutrients for energy, the synthesis of new molecules, and the elimination of waste products. Cellular metabolism is essential for the survival of cells and is regulated by a complex network of enzymes and signaling pathways.</s>
________Back Reference Score__________
['cellular metabolism', 'Cellular Metabolism', 'CELLULAR METABOLISM']
cellular metabolism 0
<<<<<<<<<<<<<<<<<<
Single-cell metabolomics [0, 0]
<<<<<<<<<<<<<<<<<<
original:  ['Single-cell metabolomics', 'diagnosing', 'treating diseases']
after:  ['Single-cell metabolomics', 'diagnosing', 'treating diseases']
Single-cell metabolomics ['Single-cell', 'metabolomics', 'Single-cell metabolomics']
----------------------------------------

------------------------------------------------
Ori Question Output:
<s> USER: Explain the "Conservation of energy" within one short paragraph. ASSISTANT: The conservation of energy is the principle that energy cannot be created or destroyed, only transformed from one form to another. This means that the total amount of energy in a closed system remains constant, even as energy flows from one form to another. For example, when a ball is dropped, the potential energy stored in its height is transformed into kinetic energy as it falls, but the total energy of the system (including the ball and the Earth) remains constant. The conservation of energy is a fundamental concept
________Back Reference Score__________
['conservation of energy', 'Conservation of energy', 'CONSERVATION OF ENERGY', 'Conservation Of Energy']
Conservation of energy 0
physics ['physics', 'physics']
------------------------------------------------
Ori Question Output:
<s> USER: Explain the "physics" within one short 

------------------------------------------------
Ori Question Output:
<s> USER: Explain the "sound waves" within one short paragraph. ASSISTANT: Sound waves are longitudinal waves that travel through a medium, such as air or water, by vibrating the particles of the medium. The vibrations create areas of high and low pressure, which in turn cause the particles to vibrate. The vibrations are transmitted through the medium as a series of compressions and rarefactions, which can be thought of as areas of high and low pressure. The frequency of the sound wave determines the pitch of the sound, while
________Back Reference Score__________
['SOUND WAVES', 'sound waves', 'Sound Waves']
sound waves 0
light waves ['light', 'waves', 'light waves']
------------------------------------------------
Ori Question Output:
<s> USER: Explain the "light waves" within one short paragraph. ASSISTANT: Light waves are a type of electromagnetic radiation that travels through space as a series of oscillations. 

The Second Amendment of the United States Constitution reads: "A well regulated Militia, being necessary to the security of a free State, the right of the people to keep and bear Arms, shall not be infringed." [^2^] This language has created considerable debate regarding the Amendment's intended scope. On one hand, some believe that the Amendment's phrase "the right of the people to keep and bear Arms" creates an individual constitutional right to possess firearms. [^2^] On the other hand, some scholars point to the prefatory language "a well regulated Militia" to argue that the Framers intended only to restrict Congress from legislating away a state's right to self-defense. [^2^] This is known as the "collective rights theory." [^2^]


In [10]:
df = pd.read_pickle('data/vicuna-13b-v1.3-7-general-exp_results.pkl')

In [15]:
df.keys()

dict_keys(['fscores', 'bscores', 'fmetrics', 'bmetrics', 'f_intervals', 'b_intervals', 'mpt'])

In [48]:
df['bscores'][:3]

[([0], True, ['Second Amendment'], ['']),
 ([0, 0], True, ['Second Amendment', 'gun control laws'], ['', '']),
 ([0, 0], True, ['right to bear arms', 'Second Amendment'], ['', ''])]

In [49]:
df['fscores'][:3]

[([0],
  True,
  ['Second Amendment'],
  ["The Second Amendment to the United States Constitution protects the right of individuals to keep and bear arms. It was adopted as part of the Bill of Rights in 1791 and has been the subject of much debate and interpretation over the years. The amendment has been interpreted by some to guarantee an individual's right to own firearms for self-defense and hunting, while others argue that it only applies to the context of a well-regulated militia."]),
 ([0, 0],
  True,
  ['Second Amendment', 'gun control laws'],
  ["The Second Amendment to the United States Constitution protects the right of individuals to keep and bear arms. It was adopted as part of the Bill of Rights in 1791 and has been the subject of much debate and interpretation over the years. The amendment has been interpreted by some to guarantee an individual's right to own firearms for self-defense and hunting, while others argue that it only applies to the context of a well-regulated 

In [50]:
df['bscores'][3:6]

[([0, 0], False, ['666th Amendment', 'legal framework'], ['', '']),
 ([0], False, ['666th Amendment'], ['']),
 ([0], False, ['666th Amendment'], [''])]

In [51]:
df['fscores'][3:6]

[([0, 0],
  False,
  ['666th Amendment', 'legal framework'],
  ['The "666th Amendment" is a fictional concept often used in discussions of the U.S. Constitution and its amendments. It is not a real amendment, but rather a reference to the number of the "Mark of the Beast" in the Book of Revelation (13:18). The number 666 is often used in popular culture as a symbol of evil or the antichrist, and the idea of a',
   'The legal framework for the use of force in self-defense is based on the principles of necessity and proportionality. Necessity refers to the idea that the use of force is justified only when it is necessary to protect oneself or others from an imminent threat of harm. Proportionality refers to the idea that the use of force must be proportionate to the threat, meaning that the amount of force used must be reasonable and not excessive in light of the circumstances']),
 ([0],
  False,
  ['666th Amendment'],
  ['The "666th Amendment" is a fictional concept often used in discus

In [6]:
df2 = pd.read_pickle('data/vicuna-13b-v1.3-7-messages.pkl')

In [9]:
for row in df2[0]:
    print(row)

Second Amendment

Using the background information provided, assess the participant's understanding of the target concept from their explanation. Please provide a rating between 1 and 9, with 9 being the best score (9 = Excellent, 5 = Average, 1 = Poor).

Concept:
Second Amendment

Background:
The Second Amendment of the United States Constitution reads: "A well regulated Militia, being necessary to the security of a free State, the right of the people to keep and bear Arms, shall not be infringed."  This language has created considerable debate regarding the Amendment's intended scope. On one hand, some believe that the Amendment's phrase "the right of the people to keep and bear Arms" creates an individual constitutional right to possess firearms.  On the other hand, some scholars point to the prefatory language "a well regulated Militia" to argue that the Framers intended only to restrict Congress from legislating away a state's right to self-defense.  This is known as the "collecti

In [12]:
raw_data = pd.read_csv('data/data_knowledge_7_general_multi_small.csv')

In [16]:
raw_data

Unnamed: 0,Domain,Concept,Real,Label,Background,Refs,Questions,Entities
0,Legal,Second Amendment,True,True,The Second Amendment of the United States Cons...,[1]: https://www.law.cornell.edu/constitution-...,['How has the interpretation of the Second Ame...,"[['Second Amendment'], ['gun control laws', 'S..."
1,Legal,666th Amendment,False,False,,,['How does the 666th Amendment impact the exis...,"[['666th Amendment', 'legal framework'], ['666..."


In [19]:
raw_data['Entities'][0]

"[['Second Amendment'], ['gun control laws', 'Second Amendment'], ['right to bear arms', 'Second Amendment']]"