# 1. Analyzing the dataset

## 1.0 Imports

In [18]:
import pandas as pd

## 1.1 Loading the Data

### 1.1.1 COVIDQA

In [19]:
# Parquet shard of every COVIDQA split, relative to the RAGBench dataset root.
splits = {
    split_name: f"covidqa/{split_name}-00000-of-00001.parquet"
    for split_name in ("train", "test", "validation")
}
# Only the training split is read; the hf:// URL is resolved by pandas at load time.
df = pd.read_parquet(f"hf://datasets/rungalileo/ragbench/{splits['train']}")

### 1.1.2 CUAD

In [20]:
# splits = {'train': 'cuad/train-00000-of-00001.parquet', 'validation': 'cuad/validation-00000-of-00001.parquet', 'test': 'cuad/test-00000-of-00001.parquet'}
# df = pd.read_parquet("hf://datasets/rungalileo/ragbench/" + splits["train"])

### 1.1.3 FINQA

In [21]:
# splits = {'train': 'finqa/train-00000-of-00001.parquet', 'validation': 'finqa/validation-00000-of-00001.parquet', 'test': 'finqa/test-00000-of-00001.parquet'}
# df = pd.read_parquet("hf://datasets/rungalileo/ragbench/" + splits["train"])

## 1.2 Exploring

In [None]:
# Preview the first rows of the loaded split (plain-text rendering via print).
print(df.head())

In [None]:
# Print example question and calculate average length of questions.
# .iloc[0] picks the first row by position, so it works whatever the index labels are.
print(f"Example question: {df['question'].iloc[0]}")
average_question_length = df['question'].str.len().mean()
print(f"Average length of questions: {average_question_length}")

# Print example answer and calculate average length of answers
print(f"Example answer: {df['response'].iloc[0]}")
average_answer_length = df['response'].str.len().mean()
print(f"Average length of answers: {average_answer_length}")

# Print example documents and calculate average length of documents
print(f"Example documents: {df['documents'].iloc[0]}")

# Flatten the documents of EVERY row so the average describes the whole split,
# consistent with the question/answer averages above (previously only row 0 was used).
document_lengths = [len(doc) for docs in df['documents'] for doc in docs]
# Guard against a split without any document instead of dividing by zero.
average_document_length = (
    sum(document_lengths) / len(document_lengths) if document_lengths else float("nan")
)
print(f"Average length of documents: {average_document_length}")

# Calculate the average number of documents per row
average_number_of_documents = df['documents'].apply(len).mean()
print(f"Average number of documents per row: {average_number_of_documents}")


In [None]:
# List the column labels available in the loaded split.
print(df.columns)

# 2. Processing

## 2.1 Summarizing documents

### 2.1.1 Loading Summarizer

In [None]:
# abstract
from transformers import pipeline

# Load the summarization model: a transformers "summarization" pipeline backed by
# the facebook/bart-large-cnn checkpoint. It is created once here and reused by
# generate_abstractive_summaries for every document.
abstractive_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Generate abstractive summaries
def generate_abstractive_summaries(documents, max_length=100, min_length=25):
    """
    Summarize every document of one row with the abstractive pipeline.

    Args:
        documents: Iterable of document strings (one dataset row).
        max_length: Upper bound passed to the pipeline for the generated summary.
        min_length: Lower bound passed to the pipeline for the generated summary.

    Returns:
        list[str]: One summary per input document, in the same order.
    """
    return [
        abstractive_summarizer(
            doc,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            # Inputs longer than the model's maximum input size must be cut,
            # otherwise the pipeline fails on long documents.
            truncation=True,
        )[0]['summary_text']
        for doc in documents
    ]

In [None]:
# extractive
from summarizer import Summarizer

# Load the extractive summarization model with the library defaults.
# It is created once here and reused by generate_extractive_summaries.
extractive_summarizer = Summarizer()

# Generate extractive summaries
def generate_extractive_summaries(documents):
    """
    Summarize every document of one row with the extractive summarizer.

    Each document is passed to ``extractive_summarizer`` with ``ratio=0.2``.
    Returns a list with one summary per document, in input order.
    """
    summaries = []
    for text in documents:
        condensed = extractive_summarizer(text, ratio=0.2)  # 20% of the text
        summaries.append(condensed)
    return summaries

### 2.1.2 Summarizing

In [None]:
# Every row holds a list of documents; each summarizer maps that list to a list
# of per-document summaries stored in a new column.
source_documents = df['documents']

# Abstractive summaries first ...
df['abstractive_summary'] = source_documents.apply(generate_abstractive_summaries)

# ... then the extractive ones.
df['extractive_summary'] = source_documents.apply(generate_extractive_summaries)

## 2.2 Comparing results

In [None]:
# Show the first row under each representation, one labelled line per column.
for label, column in (
    ("Raw", "documents"),
    ("Extractive", "extractive_summary"),
    ("Abstractive", "abstractive_summary"),
):
    print(f"{label}: {df[column][0]}")

# 3. Modeling

## 3.0 Imports

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import numpy as np
import faiss

## 3.1 Prepare data for modeling

In [None]:
# Prepare data for fine-tuning
def prepare_retrieval_training_data(df, input_column, target_column):
    """
    Build the positive (question, context) pairs used for retrieval fine-tuning.

    For every row, the entries of ``df[input_column]`` are joined with single
    spaces into one context string and paired with the row's question in an
    ``InputExample`` labelled 1.0.

    ``target_column`` is accepted for interface compatibility but is not read.
    Returns the list of examples in row order.
    """
    examples = []
    for question, doc_list in zip(df['question'], df[input_column]):
        merged_context = " ".join(doc_list)
        examples.append(InputExample(texts=[question, merged_context], label=1.0))
    return examples

# One training set per document representation: raw text, abstractive summaries,
# extractive summaries. The question column is always the query side.
raw_data = prepare_retrieval_training_data(
    df, input_column="documents", target_column="response"
)
abstractive_data = prepare_retrieval_training_data(
    df, input_column="abstractive_summary", target_column="response"
)
extractive_data = prepare_retrieval_training_data(
    df, input_column="extractive_summary", target_column="response"
)

## 3.2 Train model

In [None]:
# Fine-tune the model for retrieval
def fine_tune_retriever(data, model_name, output_path, epochs=3, batch_size=16, warmup_steps=100):
    """
    Fine-tune the retriever using MultipleNegativesRankingLoss.

    Args:
        data: Sized collection of training examples (the InputExample pairs
            built by the data-preparation step).
        model_name: Name or path of the SentenceTransformer checkpoint to start from.
        output_path: Directory passed to ``fit`` as its output location.
        epochs: Number of training epochs (default 3).
        batch_size: Batch size of the shuffled training DataLoader (default 16).
        warmup_steps: Warmup steps passed to ``fit`` (default 100).

    Returns:
        The SentenceTransformer instance after training.

    Raises:
        ValueError: If ``data`` is empty.
    """
    # Fail early with a clear message, before the checkpoint is loaded.
    if len(data) == 0:
        raise ValueError("Cannot fine-tune a retriever on an empty training set.")

    model = SentenceTransformer(model_name)
    train_dataloader = DataLoader(data, shuffle=True, batch_size=batch_size)
    train_loss = losses.MultipleNegativesRankingLoss(model)

    # Train
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=warmup_steps,
        output_path=output_path
    )
    return model

# One retriever per document representation, all starting from the same base
# checkpoint (could use multi-qa-mpnet-base-dot-v1)
raw_retriever = fine_tune_retriever(
    data=raw_data,
    model_name="all-MiniLM-L6-v2",
    output_path="fine_tuned_retriever_raw",
)
abstractive_retriever = fine_tune_retriever(
    data=abstractive_data,
    model_name="all-MiniLM-L6-v2",
    output_path="fine_tuned_retriever_abstractive",
)
extractive_retriever = fine_tune_retriever(
    data=extractive_data,
    model_name="all-MiniLM-L6-v2",
    output_path="fine_tuned_retriever_extractive",
)

## 3.3 Evaluate Performance

### 3.3.1 Retrieval

In [None]:
def evaluate_retriever_accuracy(retriever, df, k_values, input_column):
    """
    Evaluates retrieval accuracy using Recall@k for different values of k.

    Every row contributes one query (its question) and one corpus entry (its
    documents joined with spaces). A query counts as correct at k when the text
    of its own corpus entry appears among the k nearest corpus entries.

    Args:
        retriever: Object with an ``encode(list_of_texts)`` method returning one
            embedding per text.
        df: Table with a ``question`` column and ``input_column``.
        k_values: Iterable of cut-offs to report.
        input_column: Column holding, per row, the list of document strings.

    Returns:
        dict: ``{k: recall_at_k}``; all zeros when there are no queries.
    """
    k_values = list(k_values)
    questions = list(df['question'])
    documents = [" ".join(docs) for docs in df[input_column]]
    total_queries = len(questions)
    correct_retrievals = {k: 0 for k in k_values}
    if total_queries == 0 or not k_values:
        # Nothing to score; avoid dividing by zero.
        return {k: 0.0 for k in correct_retrievals}

    # Encode the corpus and the queries once and build a single index; doing
    # this inside the query loop repeats identical work for every query.
    document_embeddings = np.array(retriever.encode(documents), dtype="float32")
    query_embeddings = np.array(retriever.encode(questions), dtype="float32")

    index = faiss.IndexFlatL2(document_embeddings.shape[1])
    index.add(document_embeddings)

    # Never ask for more neighbours than the corpus holds: missing neighbours
    # are reported as -1, which would silently index the last document.
    top_k = min(max(k_values), len(documents))
    _, indices = index.search(query_embeddings, top_k)

    for gold_document, ranked in zip(documents, indices):
        retrieved_docs = [documents[idx] for idx in ranked if idx >= 0]
        for k in correct_retrievals:
            # Compare joined text with joined text. Comparing the row's
            # individual documents against joined strings can only match
            # single-document rows.
            if gold_document in retrieved_docs[:k]:
                correct_retrievals[k] += 1

    recall_at_k = {k: correct_retrievals[k] / total_queries for k in correct_retrievals}
    return recall_at_k

# Score every fine-tuned retriever on the column it was trained on,
# using the same cut-offs for all three.
k_values = [1, 5, 10]
print(
    "Raw Data Retriever Accuracy:",
    evaluate_retriever_accuracy(
        retriever=raw_retriever, df=df, k_values=k_values, input_column="documents"
    ),
)
print(
    "Abstractive Retriever Accuracy:",
    evaluate_retriever_accuracy(
        retriever=abstractive_retriever, df=df, k_values=k_values, input_column="abstractive_summary"
    ),
)
print(
    "Extractive Retriever Accuracy:",
    evaluate_retriever_accuracy(
        retriever=extractive_retriever, df=df, k_values=k_values, input_column="extractive_summary"
    ),
)


### 3.3.2 Generation

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_metric

# Load the model and tokenizer: both come from the same pretrained "t5-small"
# checkpoint and are passed to evaluate_t5_model as the answer generator.
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Prepare evaluation data
def prepare_t5_inputs(df, input_column):
    """
    Build one T5 prompt per row together with its reference answer.

    Args:
        df: Table with ``question``, ``response`` and ``input_column``.
        input_column: Column holding the context, either a single string or a
            sequence of document strings per row.

    Returns:
        list[dict]: Items with ``input_text`` ("question: ... context: ...")
        and ``reference`` (the row's response), in row order.
    """
    samples = []
    for query, context, answer in zip(df['question'], df[input_column], df['response']):
        # Join the documents into plain text. Interpolating the sequence
        # directly would put its repr (brackets and quotes) into the prompt.
        context_text = context if isinstance(context, str) else " ".join(context)
        samples.append(
            {"input_text": f"question: {query} context: {context_text}", "reference": answer}
        )
    return samples

# Evaluate answer generation
def evaluate_t5_model(data, t5_model, t5_tokenizer):
    """
    Evaluates the T5 model's answer quality using BLEU and ROUGE scores.

    Args:
        data: Iterable of dicts with ``input_text`` (the prompt) and
            ``reference`` (the expected answer).
        t5_model: Model exposing ``generate``.
        t5_tokenizer: Tokenizer used to encode prompts and decode outputs.

    Returns:
        tuple: ``(bleu_score, rouge_score)`` as returned by the metrics' ``compute``.
    """
    # NOTE(review): load_metric is deprecated in recent `datasets` releases in
    # favour of the separate `evaluate` package; kept here to avoid adding a dependency.
    bleu_metric = load_metric("bleu")
    rouge_metric = load_metric("rouge")
    predictions = []
    references = []

    for sample in data:
        # A single sequence needs no padding up to 512 tokens; truncation still
        # bounds the prompt length.
        inputs = t5_tokenizer(sample["input_text"], return_tensors="pt", truncation=True, max_length=512)
        outputs = t5_model.generate(
            input_ids=inputs["input_ids"],
            # Pass the mask explicitly so generation never attends to padding.
            attention_mask=inputs["attention_mask"],
            max_length=128,
            min_length=10,
        )
        predictions.append(t5_tokenizer.decode(outputs[0], skip_special_tokens=True))
        references.append(sample["reference"])

    # BLEU works on token sequences: every prediction is a list of tokens and
    # every reference set is a list of token lists. Whitespace tokenization is used.
    bleu_score = bleu_metric.compute(
        predictions=[prediction.split() for prediction in predictions],
        references=[[reference.split()] for reference in references],
    )
    # ROUGE takes the raw strings.
    rouge_score = rouge_metric.compute(predictions=predictions, references=references)

    return bleu_score, rouge_score

# Build the T5 prompts for each document representation ...
raw_t5_data = prepare_t5_inputs(df, input_column="documents")
abstractive_t5_data = prepare_t5_inputs(df, input_column="abstractive_summary")
extractive_t5_data = prepare_t5_inputs(df, input_column="extractive_summary")

# ... and score the same T5 model on each of them.
print(
    "Raw Data T5 Evaluation:",
    evaluate_t5_model(data=raw_t5_data, t5_model=t5_model, t5_tokenizer=t5_tokenizer),
)
print(
    "Abstractive T5 Evaluation:",
    evaluate_t5_model(data=abstractive_t5_data, t5_model=t5_model, t5_tokenizer=t5_tokenizer),
)
print(
    "Extractive T5 Evaluation:",
    evaluate_t5_model(data=extractive_t5_data, t5_model=t5_model, t5_tokenizer=t5_tokenizer),
)
