In [2]:
import json
import os

import pandas as pd
import torch
from datasets import Dataset
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import pipeline

In [None]:
# A personal Hugging Face access token is needed to push the model to the Hub,
# and it is also needed when loading the model.
# Uncomment if you are using Google Colab:
# from google.colab import userdata
#
# login(token=userdata.get('HF_TOKEN'))

# Load variables from a local .env file (if one exists) so HF_TOKEN can be kept
# there. With python-dotenv's defaults, variables that are already set in the
# environment are not overridden, and a missing .env file is not an error.
load_dotenv()
secret_token = os.getenv("HF_TOKEN")
login(token=secret_token)

In [3]:
# Load the test set.
# The encoding is given explicitly: without it, open() uses the platform's
# locale encoding (e.g. cp1252 on Windows), which can corrupt or fail to decode
# non-ASCII characters in the JSON text.
with open('../data/test_TLQA.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Load the train set (same explicit UTF-8 decoding as above).
with open('../data/train_TLQA.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

def prepare_dataset(data):
    """Extract parallel question/answer lists from the loaded records.

    Args:
        data: Iterable of records. Each record is indexed with 'question'
            and with 'final_answers' (an iterable of strings).

    Returns:
        A dict with two equally long lists: "question" holds each record's
        question, "answer" holds that record's final answers joined
        with '; '.
    """
    pairs = [
        (record['question'], '; '.join(record['final_answers']))
        for record in data
    ]
    return {
        "question": [question for question, _ in pairs],
        "answer": [answer for _, answer in pairs],
    }

# Flatten both splits into {"question": [...], "answer": [...]} dictionaries.
train_dataset_dict = prepare_dataset(train_data)
test_dataset_dict = prepare_dataset(test_data)

# Wrap each split in a Hugging Face Dataset.
train_dataset = Dataset.from_dict(train_dataset_dict)
test_dataset = Dataset.from_dict(test_dataset_dict)

# Tabular view of the test split: one row per question with its joined answer.
df = pd.DataFrame(
    {column: test_dataset_dict[column] for column in ("question", "answer")}
)

In [6]:
from components.knn import KNN

# Value passed as k to the KNN helper; it is also embedded in the results
# filename written at the end of the notebook.
k_value = 3

# Fit the KNN helper on the training split: questions first, answers second.
train_questions = train_dataset_dict["question"]
train_answers = train_dataset_dict["answer"]

knn_classifier = KNN(k=k_value)
knn_classifier.fit(train_questions, train_answers)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Fine-tuned checkpoints on the Hugging Face Hub, keyed by the size label that
# is also embedded in the results filename.
MODEL_REPOS = {
    "large": "Ana091202/flan_t5_timeline_qa",
    "base": "Ana091202/flan_t5_base_timeline_qa",
}

# Select the checkpoint by label only. Deriving hub_repo from the label keeps
# the loaded model and the label in the output filename from drifting apart
# (previously the two had to be edited by hand in sync).
model = "large"  # or "base"
hub_repo = MODEL_REPOS[model]

tokenizer = AutoTokenizer.from_pretrained(hub_repo)
finetuned_model = T5ForConditionalGeneration.from_pretrained(hub_repo)

# Use the first CUDA device when one is available; otherwise fall back to the
# CPU (-1) instead of failing on machines without a GPU.
generation_device = 0 if torch.cuda.is_available() else -1

# NOTE(review): do_sample=True with temperature=1e-5 presumably approximates
# greedy decoding; do_sample=False would make generation deterministic.
# Confirm before changing, since it could alter previously reported results.
pipe = pipeline(
    "text2text-generation",
    model=finetuned_model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=True,
    temperature=1e-5,
    top_p=0.9,
    device=generation_device,
)

# One record per test question: the question, the generated response and the
# few-shot examples that were placed in its prompt.
results = []

# Prompt the model once for each question in the test set.
for prompt_question in df["question"]:
    # Select few-shot examples with the KNN helper; each returned item is
    # unpacked as a (question, answer) pair.
    closest_prompts = knn_classifier.query(prompt_question)
    few_shot_examples = "\n".join(
        f"Example:\n  Question: {example_question}\n  Answer: {example_answer}"
        for example_question, example_answer in closest_prompts
    )

    prompt = f"""
    Generate a timeline-based answer for the following question by listing all entities, events, or attributes associated with the specified subject and time range. Ensure the output is in a structured, ordered format that reflects the temporal sequence accurately.

    {few_shot_examples}

    Now, answer the following question:
    Question: {prompt_question}
    Answer:
    """

    # strip() only removes the leading/trailing whitespace of the whole prompt;
    # the indentation inside the literal is sent to the model unchanged.
    generated_response = pipe(prompt.strip())[0]['generated_text']

    results.append({
        "question": prompt_question,
        "response": generated_response,
        "examples": few_shot_examples
    })

results_df = pd.DataFrame(results)

# The filename records k and the model label so runs can be told apart.
results_df.to_csv(f"few_shot_knn_{k_value}_{model}_finetuned.csv", index=False)