# Exercício 8_9 Visconde

matheusrdgsf@gmail.com / mrsf@cin.ufpe.br

In [1]:
# !pip install groq

In [2]:
from datasets import Dataset, load_dataset, concatenate_datasets
import os
import getpass
from groq import Groq
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from tqdm.notebook import tqdm
import pandas as pd
import string
from typing import List
import matplotlib.pyplot as plt
import seaborn as sns
import json
import tarfile
from bs4 import BeautifulSoup
import spacy
from sentence_transformers import SentenceTransformer
import torch
import faiss

In [3]:
if spacy.util.is_package("en_core_web_sm"):
    spacy_model = spacy.load("en_core_web_sm")
else:
    !python -m spacy download en_core_web_sm
    spacy_model = spacy.load("en_core_web_sm")

### Parameters

In [4]:
# Number of test questions to evaluate: the first N_QUESTIONS rows of the test
# split are selected in the "Process data" section.
N_QUESTIONS = 150

### LLM Inference

In [34]:
# Prefer the GROQ_KEY environment variable and only fall back to an interactive
# prompt when it is missing or empty. (Passing getpass.getpass(...) as the
# default argument of os.getenv would evaluate it eagerly and always prompt.)
GROQ_KEY = os.getenv("GROQ_KEY") or getpass.getpass("Enter your Groq API key: ")

# Shared Groq client used by predict_groq.
client = Groq(
    api_key=GROQ_KEY,
)

# Candidate model ids; predict_groq uses the first entry.
MODELS = ["llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"]

In [35]:
def predict_groq(text, retry=10, model=None, backoff=1.0):
    """Send `text` as a single user message to the Groq chat API and return the reply.

    The request uses the module-level `client` with `seed=42` and
    `temperature=0`.

    Args:
        text: Prompt to send; it is converted to `str` before sending.
        retry: Maximum number of attempts. With 0 (or less) no request is made.
        model: Model id to query. Defaults to `MODELS[0]` when None.
        backoff: Base delay in seconds between failed attempts. The delay
            doubles after every failure and is capped at 30 seconds; pass 0 to
            retry immediately.

    Returns:
        The content of the first completion choice, or the sentinel string
        "Fail in GROQ API." when every attempt raised. Callers that score the
        result should be aware that the sentinel is returned as a normal string.
    """
    import time  # local import: only needed for the retry delay

    model_name = MODELS[0] if model is None else model

    for attempt in range(retry):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": str(text),
                    }
                ],
                model=model_name,
                seed=42,
                temperature=0,
            )

            return chat_completion.choices[0].message.content
        except Exception as e:
            # Best-effort: report the error and try again instead of aborting
            # the whole evaluation run.
            print(e)

            # Wait before the next attempt (but not after the last one) so
            # transient errors have a chance to clear.
            delay = min(backoff * (2 ** attempt), 30.0)
            if delay > 0 and attempt < retry - 1:
                time.sleep(delay)

    return "Fail in GROQ API."

### Download Data

In [7]:
import urllib.request

DATA_DIR = "data"
IIRC_BASE_URL = "https://iirc-dataset.s3.us-west-2.amazonaws.com"

# (remote file name, local path that must exist afterwards, is a tar archive)
# The local paths are the files read by the "Load data" cells.
IIRC_FILES = [
    ("iirc_train_dev.tgz", "data/iirc_train_dev/dev.json", True),
    ("context_articles.tar.gz", "data/context_articles.json", True),
    ("iirc_test.json", "data/iirc_test.json", False),
]


def _extract_tar_gz(archive_path, dest_dir):
    """Extract a gzip-compressed tar archive into `dest_dir`."""
    with tarfile.open(archive_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: refuse unsafe members such as
            # absolute paths or links that point outside dest_dir.
            tar.extractall(dest_dir, filter="data")
        else:
            tar.extractall(dest_dir)


os.makedirs(DATA_DIR, exist_ok=True)

# Check each expected file individually instead of only testing that the
# directory exists, so an interrupted earlier run is repaired on re-execution.
for remote_name, ready_path, is_archive in IIRC_FILES:
    if os.path.exists(ready_path):
        continue

    local_path = os.path.join(DATA_DIR, remote_name)
    partial_path = local_path + ".part"

    # Download under a temporary name and rename afterwards, so a partial
    # download is never mistaken for a complete file.
    urllib.request.urlretrieve(f"{IIRC_BASE_URL}/{remote_name}", partial_path)
    os.replace(partial_path, local_path)

    if is_archive:
        _extract_tar_gz(local_path, DATA_DIR)
        os.remove(local_path)

print("Data downloaded and extracted successfully!")

Data downloaded and extracted successfully!


### Load data

In [8]:
# Load the IIRC dev and test JSON files with the generic "json" loader.
# Each result is indexed with ["train"] in the "Process data" section below.
dev_data = load_dataset("json", data_files="data/iirc_train_dev/dev.json")
test_data = load_dataset("json", data_files="data/iirc_test.json")

In [9]:
# Context articles; looked up by link target in the "Get Docs" section.
# Use a context manager so the file handle is closed, and read as UTF-8
# explicitly rather than relying on the platform default encoding.
with open("data/context_articles.json", encoding="utf-8") as articles_file:
    articles = json.load(articles_file)

### Process data

In [10]:
# NOTE: this cell rebinds test_data / dev_data from the loaded dataset dicts to
# their "train" split, so it can only be run once per load.
test_split = test_data["train"]

# Clamp to the split size so an N_QUESTIONS larger than the split (or None,
# meaning "use everything") does not raise an out-of-range error.
n_selected = (
    len(test_split) if N_QUESTIONS is None else min(N_QUESTIONS, len(test_split))
)
test_data = test_split.select(range(n_selected))
dev_data = dev_data["train"]

In [11]:
def _clean_example(example):
    """Strip HTML from the passage text and lower-case text, title and link targets."""
    plain_text = BeautifulSoup(example["text"], "html.parser").get_text()
    lowered_links = [
        {"indices": link["indices"], "target": link["target"].lower()}
        for link in example["links"]
    ]
    return {
        "text": plain_text.lower(),
        "links": lowered_links,
        "title": example["title"].lower(),
    }


# Apply the cleaning to every selected test example.
test_data = test_data.map(_clean_example)

### Get Docs

In [12]:
# Start the corpus with the question passages themselves (title -> text).
all_titles_dict = dict(zip(test_data["title"], test_data["text"]))

print(len(all_titles_dict))

# Add every linked article that is available in `articles` and not already in
# the corpus, with HTML stripped and text lower-cased like the passages.
for example_links in test_data["links"]:
    for link in example_links:
        linked_title = link["target"]
        if linked_title in all_titles_dict or linked_title not in articles:
            continue

        article_html = articles[linked_title]
        all_titles_dict[linked_title] = (
            BeautifulSoup(article_html, "html.parser").get_text().lower()
        )

print(len(all_titles_dict))

150
2164


In [13]:
# Split the title -> text mapping into two parallel tuples in the same order.
titles, texts = zip(*all_titles_dict.items())

### Sentence Split

In [14]:
from tqdm.auto import tqdm

# Sentence-level passages are cached on disk so spaCy does not have to be
# re-run over every document on each execution.
if os.path.exists("data/contents_list.csv"):
    # keep_default_na=False / dtype=str: with the pandas defaults, sentences
    # such as "n/a", "null" or "nan" would be read back as float NaN instead of
    # text, so a cached run would differ from a fresh one.
    contents_list = pd.read_csv(
        "data/contents_list.csv", dtype=str, keep_default_na=False
    )["content"].tolist()
else:
    contents_list = [
        sent.text
        for text in tqdm(texts, desc="Processing texts")
        for sent in spacy_model(text).sents
    ]
    pd.DataFrame({"content": contents_list}).to_csv(
        "data/contents_list.csv", index=False
    )

### Format questions


In [15]:
def _gold_answer(answer):
    """Return the reference answer string for one IIRC answer record."""
    kind = answer["type"]
    if kind in ("binary", "value"):
        return answer["answer_value"]
    if kind == "span":
        # Only the first annotated span is used.
        return answer["answer_spans"][0]["text"]
    if kind == "none":
        return "none"

    # Unknown answer type: report it and return a placeholder.
    print(kind)
    return "An error perhaps, bad type"


# One question/answer pair per test example (the first question of each).
questions_to_ask = []
for example in test_data:
    first_qa = example["questions"][0]
    questions_to_ask.append(
        {
            "question": first_qa["question"],
            "answer": _gold_answer(first_qa["answer"]),
        }
    )

In [16]:
# Preview the first three question/answer pairs.
questions_to_ask[:3]

[{'question': 'What is Zeus know for in Greek mythology?',
  'answer': 'sky and thunder god'},
 {'question': 'How long had the First World War been over when Messe was named aide-de-camp?',
  'answer': '5'},
 {'question': 'How long had Angela Scoular been acting professionally when she appeared in the movie "On Her Majesty\'s Secret Service"?',
  'answer': '2'}]

### Indexing Dataset

In [17]:
# Run the embedding model on the GPU when one is available, otherwise on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [44]:
# Load the sentence-embedding model used for both indexing and querying.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
# Move it to the selected device; as the last expression of the cell this also
# displays the model's module structure.
model.to(device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [46]:
# Sentence embeddings are cached on disk; row i corresponds to contents_list[i].
if os.path.exists("data/embeddings.pt"):
    # The cache was written by this notebook and holds a pickled (non-tensor)
    # object, so it must be loaded with weights_only=False; recent PyTorch
    # releases default to weights_only=True and would refuse to unpickle it.
    # Only load cache files you created yourself: this path uses pickle.
    embeddings = torch.load("data/embeddings.pt", weights_only=False)
else:
    embeddings = model.encode(contents_list, show_progress_bar=True)
    torch.save(embeddings, "data/embeddings.pt")

Batches:   0%|          | 0/9036 [00:00<?, ?it/s]

### Indexing Faiss

In [47]:
# Build a flat L2 FAISS index over the sentence embeddings on first run and
# persist it; later runs just read the serialized index back.
if not os.path.exists("data/index.bin"):
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, "data/index.bin")
else:
    index = faiss.read_index("data/index.bin")

### Test

In [48]:
# Credit: Fabio Grasiotto
def SentenceTransformer_getContext(question, base, k):
    """Return the passages of `base` nearest to `question`, one per line.

    The question is embedded with the module-level `model` and searched in the
    module-level FAISS `index`; the returned ids are used as positions in
    `base`.

    Args:
        question: Query text.
        base: Sequence of passages aligned with the vectors stored in `index`.
        k: Number of neighbours to request.

    Returns:
        A string with each retrieved passage followed by a newline, in the
        order returned by the index (empty when nothing was retrieved).
    """
    query_embedding = model.encode([question])

    _, neighbor_ids = index.search(query_embedding, k)

    # FAISS pads the result with -1 when fewer than k vectors are available.
    # Skip those ids: base[-1] would silently return the last passage.
    return "".join(base[i] + "\n" for i in neighbor_ids[0] if i >= 0)


# Smoke test: retrieve the 5 nearest passages for one sample question.
input_sequence = questions_to_ask[10].get("question")


print(input_sequence, "\n")

print(SentenceTransformer_getContext(input_sequence, contents_list, 5))

How much money did IBM earn the year Delicious was founded? 

finance.for the fiscal year 2017, ibm reported earnings of us$5.7 billion, with an annual revenue of us$79.1 billion, a decline of 1.0% over the previous fiscal cycle.
the company also sponsored the olympic games from 1960–2000, and the national football league from 2003–2012.

in 2012, ibm's brand was valued at $75.5 billion and ranked by interbrand as the second-best brand worldwide.
ibm is also a major research organization, holding the record for most u.s. patents generated by a business () for 26 consecutive years.
ibm has a valuable brand as a result of over 100 years of operations and marketing campaigns.
in march 2005, he left his day job to work on delicious full-time, and in april 2005 it received approximately $2 million in funding from investors including union square ventures and amazon.com.





### Evaluation comparing with Llama3-70b
https://github.com/neuralmind-ai/visconde/blob/main/qasper_evaluator.py

In [49]:
from collections import Counter
import re


def normalize_answer(s):
    """
    Normalize an answer string the way the SQuAD v1.1 evaluation script does:
    lower-case it, strip ASCII punctuation, drop the articles a/an/the and
    collapse runs of whitespace into single spaces.
    """
    lowered = s.lower()

    # Delete every character listed in string.punctuation.
    without_punctuation = lowered.translate(
        str.maketrans("", "", string.punctuation)
    )

    # Articles are replaced by a space; the join/split below tidies up.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation)

    return " ".join(without_articles.split())


def token_f1_score(prediction, ground_truth):
    """
    Token-level F1 between a prediction and a reference answer, following the
    SQuAD v1.1 evaluation script.

    Both strings are normalized with normalize_answer and split on whitespace;
    the overlap is the multiset intersection of the two token lists. Returns 0
    when no token is shared.
    """
    predicted = normalize_answer(prediction).split()
    reference = normalize_answer(ground_truth).split()

    overlap = sum((Counter(predicted) & Counter(reference)).values())
    if not overlap:
        return 0

    precision = overlap / len(predicted)
    recall = overlap / len(reference)
    return 2 * precision * recall / (precision + recall)

In [50]:
def create_llm_prompt(question, k=5):
    """Build the retrieval-augmented prompt sent to the LLM for `question`.

    Args:
        question: Question text; also used as the retrieval query.
        k: Number of passages to retrieve from `contents_list` as context.

    Returns:
        The prompt string: an instruction line, the retrieved passages one per
        line under "Context:", then the question and an "Answer:" cue.
    """
    prompt_template = """\
Based on the following context, answer the following question. If the answer is not present in the context, please answer "none".
Context:\n{context}Question: {question}\nAnswer:"""

    context = SentenceTransformer_getContext(question, contents_list, k)

    # Drop blank lines but keep exactly one newline after every passage.
    # Deleting "\n\n" outright would glue adjacent passages together whenever
    # a passage itself ends with a newline.
    context_lines = [line for line in context.split("\n") if line.strip()]
    context = "".join(line + "\n" for line in context_lines)

    return prompt_template.format(context=context, question=question)

In [51]:
# Inspect the full prompt built for the second question.
print(create_llm_prompt(questions_to_ask[1]["question"]))

Based on the following context, answer the following question. If the answer is not present in the context, please answer "none".
Context:
first world war.over
he was a brevet colonel at the end of the war.
it was important during the english civil war and was the site of a prisoner of war camp during the first world war.
inter-war period.
the camps were abolished after world war ii.
Question: How long had the First World War been over when Messe was named aide-de-camp?
Answer:


In [52]:
import pandas as pd

# Ask the LLM every question and score its normalized answer against the
# normalized reference answer.
results = []

for item in tqdm(questions_to_ask, desc="Processing questions"):

    question = item.get("question")

    answer = normalize_answer(item.get("answer"))

    model_answer = normalize_answer(predict_groq(create_llm_prompt(question)))

    # The score is stored directly in the record instead of being bound to a
    # module-level name `f1_score`, which would overwrite the f1_score function
    # imported from sklearn.metrics at the top of the notebook.
    results.append(
        {
            "question": question,
            "answer": answer,
            "model_answer": model_answer,
            "f1_score": token_f1_score(model_answer, answer),
            "exact_match": int(model_answer == answer),
        }
    )

Processing questions:   0%|          | 0/150 [00:00<?, ?it/s]

In [53]:
# Collect the per-question records into a DataFrame.
results_df = pd.DataFrame(results)

In [54]:
# Show the first 50 evaluated questions.
results_df.head(50)

Unnamed: 0,question,answer,model_answer,f1_score,exact_match
0,What is Zeus know for in Greek mythology?,sky and thunder god,zeus is known for his erotic escapades,0.0,0
1,How long had the First World War been over whe...,5,none,0.0,0
2,How long had Angela Scoular been acting profes...,2,25 years,0.0,0
3,What is the capacity of the stadium where Brun...,26688,none,0.0,0
4,In which country was Wilhelm Müller born?,germany,germany,1.0,1
5,In which Italian region did Pesce studied medi...,liguria,liguria,1.0,1
6,"What albums were ranked higher than ""It Takes ...",none,none,1.0,1
7,When was the sports organization that the Turk...,1909,none,0.0,0
8,When was the port established at the Port Phil...,none,none,1.0,1
9,"At which tournament were more goals scored, 19...",none,none,1.0,1


In [55]:
# Summary statistics (count, mean, std, quartiles) for token F1 and exact match.
results_df[["f1_score", "exact_match"]].describe()

Unnamed: 0,f1_score,exact_match
count,150.0,150.0
mean,0.324852,0.273333
std,0.443616,0.447164
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,1.0,1.0
max,1.0,1.0


In [56]:
# Persist the per-question results to results.csv without the index column.
results_df.to_csv("results.csv", index=False)