In [None]:
import json
import os
import pandas as pd
from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import pipeline
from datasets import Dataset
from huggingface_hub import login
from tqdm import tqdm

In [None]:
# Authenticate against the Hugging Face Hub with a personal access token.
# The token is needed both for pushing the model and for loading it again.
# When running on Google Colab, uncomment the import below:
# from google.colab import userdata

secret_token = os.environ.get("HF_TOKEN")
login(token=secret_token)

In [None]:
# The JSON inputs are opened explicitly as UTF-8 so that parsing does not
# depend on the platform's default locale encoding.

# Load the test set
with open('../data/test_TLQA.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Load the train set
with open('../data/train_TLQA.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Load the contexts
with open('../data/preprocessed_contexts.json', 'r', encoding='utf-8') as f:
    context_data = json.load(f)

# Extract the question text and the joined reference answers from raw records
def prepare_dataset(data):
    """Turn a sequence of raw records into column-oriented question/answer lists.

    Args:
        data: Iterable of mappings, each providing a 'question' entry and a
            'final_answers' entry holding an iterable of strings.

    Returns:
        A dict with two equally long lists: "question" holds each record's
        question, "answer" holds that record's final answers joined by '; '.
    """
    # Single pass over the records so one-shot iterables are handled as well
    pairs = [
        (record['question'], '; '.join(record['final_answers']))
        for record in data
    ]
    return {
        "question": [question for question, _ in pairs],
        "answer": [answer for _, answer in pairs],
    }

# Flatten both splits into column dicts and wrap each one as a Dataset
train_dataset_dict = prepare_dataset(train_data)
test_dataset_dict = prepare_dataset(test_data)
train_dataset = Dataset.from_dict(train_dataset_dict)
test_dataset = Dataset.from_dict(test_dataset_dict)

# Evaluation frame: one row per test question with its reference answer and
# the entry that `context_data` holds for that question
df = pd.DataFrame(
    {
        "question": test_dataset_dict["question"],
        "answer": test_dataset_dict["answer"],
    }
)
df["context"] = df["question"].map(context_data)

In [None]:
from components.knn import KNN

# Value handed to the KNN helper as `k`; also recorded in the results filename
k_value = 3

# Build the KNN helper and fit it on the training questions and their answers
knn_classifier = KNN(k=k_value)
train_questions = train_dataset_dict["question"]
train_answers = train_dataset_dict["answer"]
knn_classifier.fit(train_questions, train_answers)

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from safetensors.torch import load_file

# Short tag for the model variant; reused when naming the results file
model = "base"
# Hub repository the tokenizer and the model are both loaded from
# (alternative repository: "Ana091202/flan_t5_timeline_qa")
hub_repo = "Ana091202/flan_t5_base_timeline_qa"

# Fetch the model and its matching tokenizer from the same repository
finetuned_model = T5ForConditionalGeneration.from_pretrained(hub_repo)
tokenizer = AutoTokenizer.from_pretrained(hub_repo)

In [None]:
# Put the fine-tuned model into evaluation mode before running inference
finetuned_model.eval()

In [None]:
# Create a summarization pipeline for the context.
# The model and the tokenizer must come from the same checkpoint: token ids
# produced by another model's tokenizer do not match this model's vocabulary.
model_name = "facebook/bart-large-cnn"
summary_tokenizer = AutoTokenizer.from_pretrained(model_name)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda:0")

# Create the pipeline using the initialized model and its own tokenizer
summarizer = pipeline("summarization", model=summarizer_model, tokenizer=summary_tokenizer, device=0)

In [None]:
# Text-to-text generation pipeline around the fine-tuned model.
# NOTE(review): sampling with temperature=1e-5 should behave almost like greedy
# decoding; consider do_sample=False if fully deterministic output is wanted.
pipe = pipeline(
    "text2text-generation",
    model=finetuned_model, max_new_tokens=256,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=1e-5,
    top_p=0.9,
    device=0
)

results = []

# Contexts with more tokens than this are summarized; the same value is passed
# to the summarizer as the maximum summary length
max_length_context = 90

# Maximum number of contexts used per question
k_context = 10

# Prompt for each instance in the test set
for i, prompt_question in enumerate(tqdm(df["question"], desc="Processing Questions")):
    # Select few shot examples using the knn classifier
    closest_prompts = knn_classifier.query(prompt_question)
    few_shot_examples = "\n".join(
        f"Example:\n  Question: {closest_prompts_question}\n  Answer: {closest_prompts_answers}" for closest_prompts_question, closest_prompts_answers in closest_prompts
    )

    # Questions without an entry in the context mapping come back as NaN from
    # `Series.map`; fall back to an empty context list instead of crashing
    # partway through the run.
    retrieved_contexts = df["context"].iloc[i]
    if retrieved_contexts is None or isinstance(retrieved_contexts, float):
        tqdm.write(f"No context found for question: {prompt_question}")
        retrieved_contexts = []

    top_k_contexts = retrieved_contexts[:k_context]
    top_k_contexts = [ctx.replace("\n", " ") for ctx in top_k_contexts]

    # Summarize each context individually
    summarized_contexts = []
    for ctx in top_k_contexts:
        # Count the tokens of the context; plain id lists are enough for this,
        # so no tensors are built
        token_length = len(tokenizer(ctx, truncation=False)["input_ids"])

        # Summarize if the context is too long
        if token_length > max_length_context:
            summarized_content = summarizer(ctx, max_length=max_length_context, min_length=30, truncation=True)[0]["summary_text"]
            summarized_contexts.append(summarized_content)
        else:
            summarized_contexts.append(ctx)

    # Combine summarized contexts
    combined_summaries = " ".join(summarized_contexts)


    prompt = f"""
    Generate a timeline-based answer for the following question by listing all entities, events, or attributes associated with the specified subject and time range. Ensure the output is in a structured, ordered format that reflects the temporal sequence accurately.

    You are given additional context about the query. Use the context as additional information to answer the query.

    Context:

    {combined_summaries}

    {few_shot_examples}

    Now, answer the following question:
    Question: {prompt_question}
    Answer:
    """

    generated_response = pipe(prompt.strip())[0]['generated_text']

    # Keep the unsummarized contexts and the few-shot block next to the answer
    # so each row of the results can be inspected on its own
    results.append({
        "question": prompt_question,
        "response": generated_response,
        "context": top_k_contexts,
        "examples": few_shot_examples
    })

results_df = pd.DataFrame(results)

In [None]:
# Write the results to CSV; the filename records the context count, the KNN
# `k` value and the model tag used for this run
output_path = f"../results/rag_top_{k_context}_few_shot_{k_value}_{model}_finetuned.csv"
results_df.to_csv(output_path, index=False)