In [19]:
import openai
import os
import pandas as pd
from elasticsearch import Elasticsearch
import evaluate
import spacy

In [35]:
nlp = spacy.load("en_core_web_sm", enable=["ner"])

In [2]:
from typing import List, Dict
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document

def get_num_tokens(content: str):
    return 1.2 * len(content.split())

def create_numbered_list(content: list):
    return "\n".join([f"{i+1}. {c}" for i, c in enumerate(content)])

def create_numbered_list_with_episode(content: List[Dict[str, str]], episode_key: str = "episode_number", content_key: str = "content"):
    return "\n".join([f"Episode {c[episode_key]} - {c[content_key]}" for i, c in enumerate(content)])

basic_search_retriever_prompt = """
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
If it is a writing task or a simple hi, hello rather than a question, you need to return `not_needed` as the response. Just output a string only, no furthur explain!

Example:
1. Follow up question: What is the capital of France?
Rephrased: Capital of france

2. Follow up question: What is the population of New York City?
Rephrased: Population of New York City

3. Follow up question: What is Docker?
Rephrased: What is Docker

Conversation:
{chat_history}

Follow up question: {query}
Rephrased question:
"""

basic_web_search_response_prompt = """
    You are Naruto QA, an AI model who is comic/manga expert at searching the web and answering user's queries.
    Given this query: {query}
    Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).
    You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
    You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
    Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
    You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
    Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
    However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.

    Aything inside the following \`context\` HTML block provided below is for your knowledge returned by the search engine and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to 
    talk about the context in your response. 

    <context>
    {context}
    </context>

    If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.
    Anything between the \`context\` is retrieved from a search engine and is not a part of the conversation with the user
"""

background_web_search_response_prompt = """You are Naruto QA, an AI model who is comic/manga expert at searching the web and answering user's queries.
### Instruction
User query: {query}. Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page)
### Guidelines
- You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
- You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
- Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
- You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
- Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
- However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
- If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.

### Input
#### Background
{background}
#### Context
{context}

### Output

"""

def refine_query(llm, query, chat_history=None):
    prompt = basic_search_retriever_prompt.format(
        chat_history=chat_history, query=query
    )
    return llm.invoke(prompt)


def summarize_websearch_conversation(
    llm, query, context, chunk_size: int = 2048, stream: bool = False
):
    prompt = basic_web_search_response_prompt.format(context=context, query=query)
    prompt = Document(page_content=prompt)
    # spliter = TokenTextSplitter(chunk_size=chunk_size)
    # doc = spliter.create_documents([prompt])
    # print(2)
    # split_docs = spliter.split_documents(doc)
    # prompt_template = split_docs[0]
    print("Content: ", prompt.page_content)
    if not stream:
        return iter([llm.invoke(prompt.page_content)])

    return llm.stream(prompt.page_content)


In [3]:
EVALUATION_PROMPT_TEMPLATE = """
# Instructions:
You will be given one answer written for an retrieval documents to answer the question {query}. Your task is to rate the answer on one metric.
Please make sure you read and understand these instructions very carefully. 
Please keep this document open while reviewing, and refer to it as needed.

# Evaluation Criteria:
{criteria}

# Evaluation Steps:
{steps}

## Example:

##Context
{document}

## Answer
{answer}

Evaluation Form (scores ONLY):
- {metric_name}
"""
RELEVANCY_SCORE_CRITERIA = """
Relevance (1-5) 
- Answer should be accuracy and relevant to the question. \
- Annotators were instructed to penalize answers which contained redundancies, excess information and not covering the main points of the question.
"""

RELEVANCY_SCORE_STEPS = """
1. Read the answer and the source documents carefully.
2. Compare the answer to the source documents and identify the main points of the questions.
3. Assess how well the answer covers the main points of the question, and how much irrelevant or redundant information it contains.
4. Assign a relevance score from 1 to 5.
"""

In [33]:
from langchain_community.chat_models import ChatOpenAI

# Initialize the Elasticsearch client
INDEX_NAME = "naruto_episode"
WIKI_INDEX_NAME = "naruto_wiki"
EPISODE_INDEX_NAME = "naruto_episode"

es = Elasticsearch(
    [
        {
            "host": os.environ.get("ES_HOST", "localhost"),
            "port": os.environ.get("ES_PORT", 9200),
        }
    ]
)

base_url, model, api_key = (
    "http://localhost:1234/v1",
    "QuantFactory/Phi-3-mini-128k-instruct-GGUF/Phi-3-mini-128k-instruct.Q8_0.gguf",
    "as",
)
# base_url, model, api_key = "https://api.groq.com/openai/v1", "llama3-8b-8192"
llm_query = ChatOpenAI(base_url=base_url, api_key=api_key, model=model)
llm_rag = ChatOpenAI(base_url=base_url, api_key=api_key, model=model)


def search_as_you_type(query):
    # Elasticsearch query for autocomplete
    body = {
        "suggest": {
            "query_suggest": {
                "prefix": query.lower(),
                "completion": {"field": "title.suggest"},
            }
        }
    }
    response = es.search(index=INDEX_NAME, body=body, _source=["title"])
    suggestions = [
        option["text"] for option in response["suggest"]["query_suggest"][0]["options"]
    ]
    return list(set(suggestions))


def search_episode(query):
    # Elasticsearch query for final search
    body = {"query": {"multi_match": {"query": query.lower(), "fields": ["content"]}}}
    response = es.search(index=EPISODE_INDEX_NAME, body=body)
    results = response["hits"]["hits"]
    return [result["_source"] for result in results]


def search_wiki(query):
    # Elasticsearch query for final search
    doc = nlp(query)
    entities = [ent.text for ent in doc.ents]
    match_entities = []
    for entity in entities:
        match_entities.append({"match": {"title": entity}})
    body = {"query": {"bool": {"should": []}}}
    response = es.search(index=WIKI_INDEX_NAME, body=body)
    results = response["hits"]["hits"]
    return [result["_source"] for result in results]


def update_suggestions(query):
    suggestions = search_as_you_type(query)
    return "\n".join(suggestions)


def search_results(query, return_refined_query=False):
    refined_query = refine_query(llm=llm_query, query=query).content.strip()
    wiki_backgrounds = search_wiki(refined_query)
    episode_references = search_episode(refined_query)
    if return_refined_query:
        return refined_query, wiki_backgrounds, episode_references
    return wiki_backgrounds, episode_references

In [5]:
client = openai.OpenAI(base_url=base_url, api_key=api_key)

In [11]:
# Load questions
import random
import json
df_qa = pd.read_parquet("../data/anime_stackexchange/anime_question_answers.parquet")
df_qa = df_qa.groupby(["title"]).agg({"body": list}).reset_index().sample(n=20, random_state=4241)
df_qa["total_answers"] = df_qa["body"].apply(lambda x: len(x))

In [12]:
df_qa

Unnamed: 0,title,body,total_answers
712,Shouldn&#39;t Pain&#39;s head have melted off ...,[<p>That's probably just because Pain needed t...,3
632,Is the Shinigami dead?,[<p><strong>Shinigami</strong> means '<strong>...,2
765,What are the limitations of the Flying Thunder...,"[<p><img src=""https://img3.wikia.nocookie.net/...",1
1170,Why did Itachi say that he needs to get someon...,[<p>Because he actually didn't want them. He w...,1
1374,Why is Killer Bee still alive?,"[<p>According to this <a href=""http://naruto.w...",3
1036,Where did Jiraiya obtain important information?,[<p>Jiraiya has a spy network. His spy network...,1
751,What Mangekyo Sharingan power does Itachi have?,[<p>Mangekyou Sharingan users have three abili...,4
168,"Did Kakashi eventually surpass Itachi, Jiraiya...",[<p><strong>Taijutsu</strong></p>\n\n<p>Jiraiy...,13
1371,Why is Jigen&#39;s ten-tails smaller than Eart...,"[<p>Basically, Jigen is the host of Isshiki Ot...",1
1081,Who are all members of Namikaze clan and is Na...,[<p>Namikaze isn't a descendant of Senju. I th...,2


In [13]:
qa_row = df_qa.sample(1).iloc[0].to_dict()
print(qa_row)

{'title': 'How Orochimaru is still mentally active after being stabbed by Totsuka sword?', 'body': ['<p>The jutsu which Orochimaru used is called Juinjutsu. Orochimaru was unable to use Sage Mode and thus <a href="http://naruto.wikia.com/wiki/Orochimaru%27s_Juinjutsu" rel="nofollow">had to create an alternative way to make use of Senjutsu</a>.</p>\n\n<blockquote>\n  <p>In addition to his various test subjects, Orochimaru gave cursed seals to some of his most powerful and unique followers, either to enhance their abilities or to prepare them for becoming potential host bodies. To apply a cursed seal, Orochimaru bites the recipient, doing so with the aid of his sharp fangs and extendible neck. The seal then appears on the body of the victim near the puncture wound before causing them to lose consciousness.</p>\n</blockquote>\n\n<p>Now try recalling the concept of Bunshin no Jutsu. When clones are created, it has mind of their own. As a result, when it is released, the original would get 

In [15]:
query = qa_row["title"]
print(query)
wiki, refs = search_results(query)

How Orochimaru is still mentally active after being stabbed by Totsuka sword?


  response = es.search(index=WIKI_INDEX_NAME, body=body)
  response = es.search(index=EPISODE_INDEX_NAME, body=body)


'Orochimaru'

In [29]:
ent.label_

'ORG'

In [17]:
top_k = 3
wiki_content = create_numbered_list([row["content"] for row in wiki[:top_k]])
episode_content = create_numbered_list_with_episode(refs[:top_k])

final_prompt = background_web_search_response_prompt.format(
    query=query,
    background=wiki_content,
    context=episode_content
)
print(get_num_tokens(final_prompt))
# print(final_prompt)

3007.2


In [18]:
print(final_prompt)

You are Naruto QA, an AI model who is comic/manga expert at searching the web and answering user's queries.
### Instruction
User query: How Orochimaru is still mentally active after being stabbed by Totsuka sword?. Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page)
### Guidelines
- You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
- You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
- Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is 

In [149]:
print(output.choices[0].message.content)

 - The most powerful summoning technique discussed in the context provided appears to be the Summoning Technique, which allows a user to transport an animal or entity to their location by forming a contract with it. This is elaborated on through various instances such as Naruto's ability to summon animals like dogs and cats for assistance, Sasuke Uchiha summoning snakes and hawks, and the fact that these contracts can be used even after death or by using another person's blood. It is a versatile technique with significant strategic applications in combat and other missions (Context 2).

- Another potent jutsu mentioned is the Demonic Statue of the Outer Path, which can only be summoned by someone with the Rinnegan. This powerful entity was used by Hagoromo Ōtsutsuki to summon and control multiple creatures as well as five individuals from Kaguya's Dimensions (Context 6).

- The Mysterious Peacock Method is highlighted for its unique ability, enhancing the user's chakra significantly. T

In [245]:
def search_and_answer(
    question,
    model_name,
    wiki_top_k: int = 5,
    context_top_k: int = 5,
    temperature: float = 0.7,
    return_refined_query=True,
):
    if return_refined_query:
        question, wiki, refs = search_results(
            question, return_refined_query=return_refined_query
        )
    else:
        wiki, refs = search_results(question, return_refined_query=return_refined_query)

    wiki_content = create_numbered_list([row["content"] for row in wiki[:wiki_top_k]])
    episode_content = create_numbered_list_with_episode(refs[:context_top_k])

    final_prompt = background_web_search_response_prompt.format(
        query=query, background=wiki_content, context=episode_content
    )
    output = client.chat.completions.create(
        messages=[
            {"role": "user", "content": final_prompt},
        ],
        temperature=temperature,
        model=model_name,
    )
    return {
        "answer": output.choices[0].message.content,
        "context": (wiki_content, episode_content),
        "question": question,
    }

In [249]:
df_qa.shape

(20, 3)

In [250]:
from tqdm import trange, tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

answers = []
CONFIG = [
    ("QuantFactory/Phi-3-mini-128k-instruct-GGUF/Phi-3-mini-128k-instruct.Q8_0.gguf", 2, 2, 0.7),
]
def task(task_id, query, model_name, wiki_top_k, context_top_k, temperature):
    result = {"task_id": task_id,}
    output = search_and_answer(query, model_name, wiki_top_k, context_top_k, temperature)
    result.update(output)
    return result

executor = ThreadPoolExecutor(max_workers=4)
futs= []
for i in range(len(df_qa)):
    qa_row = df_qa.iloc[i].to_dict()
    query = qa_row["title"]
    fut = executor.submit(task, i, query, *CONFIG[0]) 
    futs.append(fut)

with tqdm(total=len(futs)) as pbar, open("answer_v1.jsonl", "w") as f:
    for fut in as_completed(futs):
        res = fut.result()
        answers.append(res)
        f.write(json.dumps(res) + "\n")
        pbar.update(1)


100%|██████████| 20/20 [00:00<00:00, 3357.86it/s]
  response = es.search(index=WIKI_INDEX_NAME, body=body)
  response = es.search(index=EPISODE_INDEX_NAME, body=body)
  response = es.search(index=WIKI_INDEX_NAME, body=body)
  response = es.search(index=EPISODE_INDEX_NAME, body=body)
  response = es.search(index=WIKI_INDEX_NAME, body=body)
  response = es.search(index=EPISODE_INDEX_NAME, body=body)
  response = es.search(index=WIKI_INDEX_NAME, body=body)
  response = es.search(index=EPISODE_INDEX_NAME, body=body)
  response = es.search(index=WIKI_INDEX_NAME, body=body)
  response = es.search(index=EPISODE_INDEX_NAME, body=body)
100%|██████████| 20/20 [11:37<00:00, 34.86s/it]


In [251]:
bleu = evaluate.load("bleu", num_process=1, keep_in_memory=True)
rouge = evaluate.load("rouge", num_process=1, keep_in_memory=True)
def compute_metrics(predictions, references, ngrams=4):
    bleu_score = bleu.compute(predictions=predictions, references=references, max_order=ngrams)
    rouge_score = rouge.compute(predictions=predictions, references=references)
    return bleu_score, rouge_score

scores = {"bleu": [], "rouge": []}
predictions, references = [], []
for i in trange(len(answers)):
    answer = answers[i]
    qa_row = df_qa.iloc[i].to_dict()
    predictions.append(answer["answer"])
    references.append(qa_row["body_markdown"])

bleu_score, rouge_score = compute_metrics(predictions, references)
   
print("BLEU: ", bleu_score)
print("ROUGE: ", rouge_score)
    

100%|██████████| 20/20 [00:00<00:00, 4688.21it/s]


BLEU:  {'bleu': 0.016811138173417034, 'precisions': [0.2672709456148947, 0.04015225933202358, 0.00603151157065485, 0.0012339585389930898], 'brevity_penalty': 1.0, 'length_ratio': 2.0328685258964145, 'translation_length': 8164, 'reference_length': 4016}
ROUGE:  {'rouge1': 0.23534731997506148, 'rouge2': 0.02032480266600484, 'rougeL': 0.10934406976577495, 'rougeLsum': 0.19847040193220555}


In [252]:
answers[0]

{'task_id': 3,
 'answer': ' Orochimaru targeted Sasuke rather than Naruto due to several factors highlighted in various episodes:\n\n- **Orochimaru\'s Previous Interest**: In Episode 337, it is revealed that Orochimaru had a significant history with Kabuto Yakushi and was reborn as him. This connection implies an established interest or potential influence over Sasuke (context [1]).\n\n- **Sasuke\'s Unique Abilities**: Itachi Uchiha showcased his Izanami jutsu, a powerful technique tied to fate manipulation, which he used against Sasuke. This indicates that Sasuke has unique abilities that might be more valuable or interesting for Orochimaru\'s goals (context [2]).\n\n- **Sasuke\'s Strength and Potential**: During their intense battle in Episode 337, Itachi pointed out the immense power of Sasuke by observing his Susano\'o summoning. This suggests that Sasuke possesses considerable strength and potential that could be intriguing for Orochimaru (context [2]).\n\n- **Orochimaru\'s Strate

# Evaluation by GPT3

In [297]:
recall_prompt = """A chat history between user and bot is shown below
A list of documents is shown below in text, and each document has one unique id. 
These listed documents are used as context to answer the given question.
The task is to score the relevance between the documents and the potential answer to the given question in the range of 1 to 5. 
1 means none of the documents is relevant to the question at all. 5 means either one of the document or combination of a few documents is ideal for answering the given question.
Think through step by step:
- Summarize each given document first
- Determine the underlying intent of the given question, when the question is ambiguous, refer to the given chat history 
- Measure how suitable each document to the given question, list the document id and the corresponding relevance score. 
- Summarize the overall relevance of given list of documents to the given question after # Overall Reason, note that the answer to the question can soley from single document or a combination of multiple documents. 
- Your answer about relevance should follow the format:
    - overall score:
    - detail:
        - reasoning
        - score
        - document_id 

# Question
{query}
# Chat History
{history}
# Documents
---BEGIN RETRIEVED DOCUMENTS---
{FullBody}
---END RETRIEVED DOCUMENTS---
### Output
Return in YAML format only, no further explained
"""
precision_prompt = """
You will be provided a question, a conversation history, context documents related to the question and a response to the question in the domain. You task is to evaluate the quality of the provided response by following the steps below:
- Understand the context of the question based on the conversation history and context documents.
- Compare the provided answers with ground truth answers and rate the quality of the provided response based on the reference answer.
- You need to rate the provided response according to the ground truthif it's available on a scale of 1 (poor) to 5 (excellent), based on the below criteria:
    - 5 - Ideal: The provided response includes all information necessary to answer the question based on the reference answer and conversation history. Please be strict about giving a 5 score.
    - 4 - Mostly Relevant: The provided response is mostly relevant, although it may be a little too narrow or too broad based on the reference answer and conversation history.
    - 3 - Somewhat Relevant: The provided response may be partly helpful but might be hard to read or contain other irrelevant content based on the reference answer and conversation history.
    - 2 - Barely Relevant: The provided response is barely relevant, perhaps shown as a last resort based on the reference answer and conversation history.
    - 1 - Completely Irrelevant: The provided response should never be used for answering this question based on the reference answer and conversation history.
- You need to rate the provided response to be 5, if the reference answer can not be generated since no relevant documents were retrieved.
- You need to first provide a scoring reason for the evaluation according to the above criteria, and then provide a score for the quality of the provided response.
- You need to translate the provided response into English if it's in another language. 
- Your answer should follow the YAML format: 
    reason:
    score:
# Question
{query}
# Chat Response
{reply}
# Chat History
{history}
# Documents
---BEGIN RETRIEVED DOCUMENTS---
{FullBody}
---END RETRIEVED DOCUMENTS---
### Output
Return in YAML format only, no further explained
"""
factualness_prompt = """
Your task is to check and rate if factual information in chatbot's reply is all grounded to retrieved documents.
You will be given a question, chatbot's response to the question, a chat history between this chatbot and human, and a list of retrieved documents.
The chatbot must base its response exclusively on factual information extracted from the retrieved documents, utilizing paraphrasing, summarization, or inference techniques. When the chatbot responds to information that is not mentioned in or cannot be inferred from the retrieved documents, we refer to it as a grounded issue.

To rate the groundness of chat response, follow the below steps:
1. Review the chat history to understand better about the question and chat response
2. Look for all the factual information in chatbot's response 
3. Compare the factual information in chatbot's response with the retrieved documents. Check if there are any facts that are not in the retrieved documents at all,or that contradict or distort the facts in the retrieved documents. If there are, write them down. If there are none, leave it blank. Note that some facts may be implied or suggested by the retrieved documents, but not explicitly stated. In that case, use your best judgment to decide if the fact is grounded or not. 
   For example, if the retrieved documents mention that a film was nominated for 12 Oscars, and chatbot's reply states the same, you can consider that fact as grounded, as it is directly taken from the retrieved documents. 
   However, if the retrieved documents do not mention the film won any awards at all, and chatbot reply states that the film won some awards, you should consider that fact as not grounded.
4. Rate how well grounded the chatbot response is on a Likert scale from 1 to 5 judging if chatbot response has no ungrounded facts. (higher better)
   5: agree strongly
   4: agree
   3: neither agree or disagree
   2: disagree
   1: disagree strongly
   If the chatbot response used information from outside sources, or made claims that are not backed up by the retrieved documents, give it a low score. 
5. Your answer should follow the YAML format: 
    reason:
    score:

# Question
{query}
# Chat Response
{reply}
# Chat History
{history}
# Documents
---BEGIN RETRIEVED DOCUMENTS---
{FullBody}
---END RETRIEVED DOCUMENTS---
"""

In [298]:
import tiktoken 
model_name = "gpt-3-turbo"
# encoding = tiktoken.encoding_name_for_model(model_name)

gpt_recall_prompts = []
gpt_precision_prompts = []
gpt_factualness_prompts = []

for i in range(len(answers)):
    answer = answers[i]
    qa_row = df_qa.iloc[i].to_dict()
    query = qa_row["title"]
    gpt_recall_prompts.append(recall_prompt.format(query=query, history="", FullBody=answers[0]["context"][0] + "\n" + answers[0]["context"][1]))
    gpt_precision_prompts.append(precision_prompt.format(query=query, reply=answer["answer"], history="", FullBody=answers[0]["context"][0] + "\n" + answers[0]["context"][1]))
    gpt_factualness_prompts.append(factualness_prompt.format(query=query, reply=answer["answer"], history="", FullBody=answers[0]["context"][0] + "\n" + answers[0]["context"][1]))

print("Total tokens: ", sum([get_num_tokens(p) for p in gpt_recall_prompts + gpt_precision_prompts + gpt_factualness_prompts]))

Total tokens:  113102.40000000001


In [302]:
import dotenv
import yaml
import openai
from functools import lru_cache

dotenv.load_dotenv()

client = openai.OpenAI()


In [314]:
recalls = []
precisions = []
factualness = []

@lru_cache(maxsize=10000)
def generate_yaml_format(prompt: str, model_name: str = "gpt-3.5-turbo", temperature: float = 0.3):
    for _ in range(3):
        try:
            output = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt
                    },
                ],
                model=model_name,
                temperature=temperature,
                logprobs=False,
            )
            content = yaml.safe_load(output.choices[0].message.content.replace("```yaml", "").replace("```", ""))
            return content
        except yaml.YAMLError:
            continue
    
    raise Exception("Failed to generate response")


for i in trange(len(gpt_recall_prompts)):
    recall_prompt = gpt_recall_prompts[i]
    precision_prompt = gpt_precision_prompts[i]
    factualness_prompt = gpt_factualness_prompts[i]
    recall = generate_yaml_format(recall_prompt)
    precision = generate_yaml_format(precision_prompt)
    fact = generate_yaml_format(factualness_prompt)
    recalls.append(recall)
    precisions.append(precision)
    factualness.append(fact)

100%|██████████| 20/20 [01:49<00:00,  5.45s/it]


In [321]:
print("Recall: {:.2f}".format(sum([r["overall score"] for r in recalls]) / len(recalls)))
print("Precision: {:.2f}".format(sum([r["score"] for r in precisions]) / len(recalls)))
print("Factual: {:.2f}".format(sum([r["score"] for r in factualness]) / len(recalls)))

Recall: 1.55
Precision: 1.75
Factual: 3.05


In [1]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = datasets.load_from_disk("../data/anime_stackexchange/anime_answers_pair")

In [6]:
ds[-3]

{'question_id': 66456,
 'question': 'Why wasn&#39;t flying Raijin passed onto other shinobi by second or fourth hokage?',
 'response_j': 'You have to understand that not all people can master all jutsus. They have their limitation no matter how prodigious they are. Tobirama and Minato can pull off that Jutsu because they were the fastest shinobis and had insane reflexes. Their extensive use of this jutsu had earned Minato the nickname of "Konoha\'s Yellow Flash".\n\nMinato would randomly appear behind enemy lines and, by seemingly being in many places at once, wipe out whole squadrons before they had time to react. By combining their naturally good reflexes with the instantaneousness of the Flying Thunder God, Tobirama could intercept and redirect a Truth-Seeking Ball mid-detonation and Minato could escape Kamui mid-capture. Minato\'s ability to dodge A\'s Lightning Release Chakra Mode led A to dub Minato the fastest ninja who ever lived.\nFlying Thunder God Technique\n\nOf course, Min