In [2]:
import json
import os
import pandas as pd
from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import pipeline
from datasets import Dataset
from huggingface_hub import login
from dotenv import load_dotenv

In [None]:
# A personal Hugging Face access token is needed to push the model to the Hub,
# and it is also needed when loading the model back.
# When running on Google Colab, read the token from the notebook secrets instead:
#     from google.colab import userdata
#
#     login(token=userdata.get('HF_TOKEN'))
load_dotenv()  # populate the process environment from the local .env file
# NOTE(review): yields None when HF_TOKEN is not set, and None is then passed
# straight to login() - confirm that fallback is acceptable for unattended runs.
secret_token = os.environ.get("HF_TOKEN")
login(token=secret_token)

In [3]:
# Load the test set.
# JSON is UTF-8 by specification, so the encoding is pinned explicitly instead
# of relying on the platform default (which is not UTF-8 on every system and
# would mis-decode or reject non-ASCII characters in the data).
with open('../data/test_TLQA.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Load the train set (same encoding handling as above).
with open('../data/train_TLQA.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Get the necessary details from the input
def prepare_dataset(data):
    """Flatten raw records into parallel question / answer columns.

    Args:
        data: Iterable of mappings. Each one must provide a 'question' value
            and a 'final_answers' iterable of strings.

    Returns:
        A dict with the keys "question" and "answer". Both map to lists with
        one entry per input record, in input order; each answer entry is the
        record's 'final_answers' joined with '; '.
    """
    # Single pass over the input so one-shot iterators are handled as well.
    pairs = [
        (record['question'], '; '.join(record['final_answers']))
        for record in data
    ]
    return {
        "question": [question for question, _ in pairs],
        "answer": [answer for _, answer in pairs],
    }

# Convert each split into column form and wrap it as a Dataset.
test_dataset_dict = prepare_dataset(test_data)
test_dataset = Dataset.from_dict(test_dataset_dict)

train_dataset_dict = prepare_dataset(train_data)
train_dataset = Dataset.from_dict(train_dataset_dict)

# Tabular copy of the test split (columns: question, answer), built in one
# constructor call rather than by assigning columns to an empty frame.
df = pd.DataFrame({
    "question": test_dataset_dict["question"],
    "answer": test_dataset_dict["answer"],
})

In [None]:
# Fine-tuned checkpoint to evaluate. `model` is a plain size label; in the
# visible cells it is only used to name the output CSV, so update it together
# with `hub_repo` when switching checkpoints.
hub_repo = "Ana091202/flan_t5_timeline_qa"
# hub_repo = "Ana091202/flan_t5_base_timeline_qa"
model = "large"

tokenizer = AutoTokenizer.from_pretrained(hub_repo)
finetuned_model = T5ForConditionalGeneration.from_pretrained(hub_repo)

# Greedy decoding (do_sample=False) keeps generation deterministic. Sampling
# with a near-zero temperature only approximates argmax and still depends on
# unseeded RNG state, so results would not be guaranteed reproducible.
pipe = pipeline(
    "text2text-generation",
    model=finetuned_model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=False,
    # NOTE(review): device ordinal 0 assumes an accelerator is present;
    # a CPU-only machine would need device=-1 - confirm the target environment.
    device=0
)

results = []

# Zero-shot prompting: every test question is wrapped in the same fixed
# instruction, without any in-context examples.
for prompt_question in df["question"]:
    prompt = f"""
    Generate a timeline-based answer for the following question by listing all entities, events, or attributes associated with the specified subject and time range. Ensure the output is in a structured, ordered format that reflects the temporal sequence accurately.

    Question: {prompt_question}
    Answer:
    """

    # strip() removes only the leading/trailing whitespace of the template;
    # the indentation of the inner lines is sent to the model unchanged.
    generated_response = pipe(prompt.strip())[0]['generated_text']

    results.append({
        "question": prompt_question,
        "response": generated_response,
    })

# One row per test question: the question text and the raw generated answer.
results_df = pd.DataFrame(results)

results_df.to_csv(f"zero_shot_{model}_finetuned.csv", index=False)