In [None]:
pip install kagglehub

In [None]:
pip install transformers datasets sentence-transformers langchain chromadb

In [None]:
pip install sentence-transformers

In [None]:
pip install --upgrade "numpy>=2.0" "scipy>=1.14" scikit-learn transformers sentence-transformers chromadb


In [None]:
pip freeze > requirements.txt


In [None]:
import kagglehub
import json
import os
import numpy as np
import pandas as pd
import json
import re
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm  

In [None]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

In [None]:
# Download the QASPER dataset from Kaggle via KaggleHub. The returned value is the
# local directory path that the CSV-loading cell below reads from.
dataset_path = kagglehub.dataset_download("thedevastator/qasper-nlp-questions-and-evidence")
print(f"Path to QASPER dataset: {dataset_path}")


In [None]:
# Read the three Kaggle QASPER splits into DataFrames, in the order
# test -> train -> validation. These frames are not referenced again in the
# cells visible here; the notebook switches to the Hugging Face copy below.
test, train, validation = (
    pd.read_csv(f"{dataset_path}/{split}.csv")
    for split in ("test", "train", "validation")
)



We had issues with the Kaggle CSV files for the QASPER dataset. We tried parsing them manually as well as parsing the nested objects, but both approaches still raised errors. This is probably because each nested object was converted to a string, and that string contains object representations such as 'Array()' that cause JSON parse errors.

We therefore use the same QASPER data via Hugging Face, which provides straightforward JSON objects to parse.

### Load QASPER from Hugging Face

In [None]:
# Load the "train" split of the allenai/qasper dataset through the `datasets` library.
qasper_ds = load_dataset("allenai/qasper", split="train")

In [None]:
# Show the loaded dataset object as the cell output.
qasper_ds

In [None]:
# Flatten the nested QASPER records into one row per (question, annotator answer).
rows = []
for paper in qasper_ds:
    # full_text stores section names and paragraph lists as parallel columns;
    # render each section as its name followed by its paragraphs, one per line.
    body = paper["full_text"]
    sections = [
        f"{heading}\n" + "\n".join(paragraphs)
        for heading, paragraphs in zip(body["section_name"], body["paragraphs"])
    ]

    # Fields repeated on every row that belongs to this paper.
    paper_fields = {
        "paper_id"        : paper["id"],
        "title"           : paper["title"],
        "abstract"        : paper["abstract"],
        "full_text"       : "\n\n".join(sections),
    }

    # qas is a dict of parallel lists, indexed by question position.
    qas = paper["qas"]
    for idx, question_text in enumerate(qas["question"]):
        question_fields = {
            "question_id"     : qas["question_id"][idx],
            "question"        : question_text,
            "nlp_background"  : qas["nlp_background"][idx],
            "topic_background": qas["topic_background"][idx],
            "paper_read"      : qas["paper_read"][idx],
            "search_query"    : qas["search_query"][idx],
            "question_writer" : qas["question_writer"][idx],
        }

        # Each question's answers are again parallel lists (answer / annotation / worker).
        answers_block = qas["answers"][idx]
        answer_triples = zip(
            answers_block["answer"],
            answers_block["annotation_id"],
            answers_block["worker_id"],
        )
        for ans, ann_id, worker_id in answer_triples:
            rows.append({
                **paper_fields,
                **question_fields,
                "annotation_id"   : ann_id,
                "worker_id"       : worker_id,
                "unanswerable"    : ans["unanswerable"],
                "yes_no"          : ans["yes_no"],
                "free_form_answer": ans["free_form_answer"],
                # List-valued fields are collapsed into "; "-separated strings.
                "extractive_spans": "; ".join(ans["extractive_spans"]),
                "evidence"        : "; ".join(ans["evidence"]),
                "highlighted_evidence": "; ".join(ans["highlighted_evidence"]),
            })

qasper_df = pd.DataFrame(rows)

In [None]:
# Preview the first rows of the flattened QASPER frame.
qasper_df.head()

In [None]:
# Row count before the free-form-answer filter below is applied.
len(qasper_df)

In [None]:
# Keep only the rows whose free_form_answer is not the empty string.
# NOTE(review): rows answered solely through extractive spans or yes/no presumably
# also have an empty free_form_answer and are dropped here too — confirm this is intended.
qasper_df = qasper_df.loc[qasper_df["free_form_answer"].ne("")]

In [None]:
# Row count after dropping rows with an empty free_form_answer.
len(qasper_df)

In [None]:
# Preview the filtered QASPER frame.
qasper_df.head()

In [None]:
# Persist the filtered QASPER frame to the working directory (index column omitted).
qasper_df.to_csv('processed_qasper_data.csv', index=False)

### Loading the HotpotQA Dataset from KaggleHub

In [None]:
# Download the HotpotQA dataset from Kaggle via KaggleHub. `path` is the local
# directory that the JSON-loading cell below reads from.
path = kagglehub.dataset_download("jeromeblanchet/hotpotqa-question-answering-dataset")
print(f"Path to HotpotQA dataset: {path}")


In [None]:
# Load the HotpotQA dev (distractor) file from the downloaded Kaggle directory.
import json

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend on
# the platform's locale default (which is not UTF-8 on every system).
with open(f"{path}/hotpot_dev_distractor_v1.json", "r", encoding="utf-8") as f:
    hotpot_data = json.load(f)

# Quick sanity check: number of examples and the field names of the first one.
print(len(hotpot_data))
print(hotpot_data[0].keys())

In [None]:
import pandas as pd

# Project every HotpotQA example onto five fields, exposing "_id" as "id".
rows = [
    {
        "id": example["_id"],
        "question": example["question"],
        "answer": example["answer"],
        "context": example["context"],
        "supporting_facts": example["supporting_facts"],
    }
    for example in hotpot_data
]

hotpot_df = pd.DataFrame(rows)

In [None]:
# Preview the first rows of the HotpotQA frame.
hotpot_df.head()

In [None]:
# Persist the HotpotQA frame. "context" and "supporting_facts" are copied straight
# from the parsed JSON and hold nested lists; writing them as-is would store Python
# reprs that json.loads cannot read back. Serialise those two columns as JSON text
# first (hotpot_df itself is left untouched; only the written copy is converted).
hotpot_df.assign(
    context=hotpot_df["context"].map(lambda value: json.dumps(value, ensure_ascii=False)),
    supporting_facts=hotpot_df["supporting_facts"].map(
        lambda value: json.dumps(value, ensure_ascii=False)
    ),
).to_csv('processed_hotpot_df.csv', index=False)

## Step 1: Summarization

### We will store the context in ChromaDB with chunking; for this task we use a chunk size of 256 tokens

In [None]:
def ingest_qasper_to_chroma(qasper_df, chroma_collection, character_splitter, token_splitter):
    """Chunk every QASPER paper and add the chunks to a Chroma collection.

    The frame is grouped by ``paper_id``. The ``full_text`` of each group's first
    row is split with ``character_splitter`` and every resulting piece is split
    again with ``token_splitter``. The final chunks are added under ids of the form
    ``"<paper_id>_<n>"`` with metadata holding the paper id and a comma-separated
    list of that paper's question ids.

    Args:
        qasper_df: DataFrame with ``paper_id``, ``full_text`` and ``question_id`` columns.
        chroma_collection: object exposing ``add(documents=..., ids=..., metadatas=...)``.
        character_splitter: object exposing ``split_text(text)`` returning a list of strings.
        token_splitter: object exposing ``split_text(text)`` returning a list of strings.

    Returns:
        None. Progress and skipped papers are reported on stdout.
    """
    for paper_id, group in tqdm(qasper_df.groupby("paper_id")):
        full_text = str(group.iloc[0]["full_text"])

        # Two-stage split: coarse character chunks first, then token-bounded chunks.
        token_chunks = [
            token_chunk
            for char_chunk in character_splitter.split_text(full_text)
            for token_chunk in token_splitter.split_text(char_chunk)
        ]

        if not token_chunks:
            print(f"Skipping paper {paper_id}: no chunks produced.")
            continue

        # The frame built in this notebook has one row per annotator answer, so a
        # question id can repeat within a paper. dict.fromkeys removes the repeats
        # while keeping first-seen order. The joined string is computed once per
        # paper instead of once per chunk.
        question_ids = ",".join(dict.fromkeys(map(str, group["question_id"])))

        ids = [f"{paper_id}_{i}" for i in range(len(token_chunks))]
        metadatas = [
            {
                "paper_id": paper_id,
                "question_ids": question_ids,
            }
            for _ in token_chunks
        ]

        chroma_collection.add(
            documents=token_chunks,
            ids=ids,
            metadatas=metadatas
        )

    print("All papers processed and stored in Chroma.")


In [None]:
def ingest_hotpot_to_chroma(hotpot_df, chroma_collection, character_splitter, token_splitter):
    """Chunk the context of every HotpotQA question and add it to a Chroma collection.

    The frame is grouped by ``id``. The ``context`` of each group's first row is
    iterated as ``(title, paragraphs)`` pairs and rendered as text, split with
    ``character_splitter`` and then ``token_splitter``, and added under ids of the
    form ``"<id>_<n>"`` with ``{"hotpot_id": <id>}`` as metadata for every chunk.

    Args:
        hotpot_df: DataFrame with ``id`` and ``context`` columns.
        chroma_collection: object exposing ``add(documents=..., ids=..., metadatas=...)``.
        character_splitter: object exposing ``split_text(text)`` returning a list of strings.
        token_splitter: object exposing ``split_text(text)`` returning a list of strings.

    Returns:
        None. Progress and skipped questions are reported on stdout.
    """
    grouped = hotpot_df.groupby("id")
    for q_id, group in tqdm(grouped, desc="Processing Hotpot QA"):
        # One block per context entry: the title line, then its paragraphs line by line.
        full_text = "\n\n".join(
            f"{title}\n" + "\n".join(paragraphs)
            for title, paragraphs in group.iloc[0]["context"]
        )

        # Coarse character split first, then the token-bounded split of each piece.
        token_chunks = [
            piece
            for coarse in character_splitter.split_text(full_text)
            for piece in token_splitter.split_text(coarse)
        ]

        if len(token_chunks) == 0:
            print(f"Skipping question {q_id}: no context found produced.")
            continue

        ids = [f"{q_id}_{position}" for position, _ in enumerate(token_chunks)]
        metadatas = [{"hotpot_id": q_id} for _ in ids]

        chroma_collection.add(documents=token_chunks, ids=ids, metadatas=metadatas)

    print("All HotpotQA questions processed and stored in Chroma.")


### We have stored all the data for the QASPER dataset, and we now move to the Load Models notebook for answer generation

In [None]:
# Dataset identifiers; run_data_loader_pipeline compares its DATASET_TYPE argument
# against these to pick the ingest routine.
DATASET_HOTPOT = "HotpotQA"
DATASET_QASPER = "Qasper"

# Token-level split: passed as tokens_per_chunk / chunk_overlap to the token splitter.
TOKEN_CHUNK_SIZE = 256
TOKEN_CHUNK_OVERLAP = 10

# Character-level pre-split: passed as chunk_size / chunk_overlap to the character splitter.
CHAR_CHUNK_SIZE = 1000
CHAR_CHUNK_OVERLAP = 10

In [None]:
def run_data_loader_pipeline(DATASET_TYPE, collection, dataframe):
    """Build the text splitters and ingest ``dataframe`` into ``collection``.

    Args:
        DATASET_TYPE: ``DATASET_QASPER`` or ``DATASET_HOTPOT``; selects the ingest routine.
        collection: Chroma collection handed through to the ingest routine.
        dataframe: processed DataFrame for the selected dataset.

    Returns:
        None. An unrecognised ``DATASET_TYPE`` prints "Invalid Dataset Type" and
        nothing is ingested.
    """
    # Resolve the ingest routine first so an invalid dataset type is rejected
    # before any splitter is constructed.
    if DATASET_TYPE == DATASET_QASPER:
        ingest = ingest_qasper_to_chroma
    elif DATASET_TYPE == DATASET_HOTPOT:
        ingest = ingest_hotpot_to_chroma
    else:
        print("Invalid Dataset Type")
        return

    # Coarse pass: split on paragraph, line, sentence and word boundaries, in that order.
    character_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=CHAR_CHUNK_SIZE,
        chunk_overlap=CHAR_CHUNK_OVERLAP
    )

    # Fine pass: bound every chunk by a token budget.
    # NOTE(review): no model_name is passed, so the splitter counts tokens with its
    # library default rather than the embedding model configured for the collection —
    # confirm the token budget still matches the embedding model's limit.
    token_splitter = SentenceTransformersTokenTextSplitter(
        chunk_overlap=TOKEN_CHUNK_OVERLAP,
        tokens_per_chunk=TOKEN_CHUNK_SIZE
    )

    ingest(dataframe, collection, character_splitter, token_splitter)

### Execute the data ingestion for Datasets

In [None]:
def create_chroma_collections(db_path: str, configs: dict) -> dict:
    """Create one fresh Chroma collection per dataset config.

    Any existing collection with the same name is deleted first, so calling this
    discards whatever was previously stored under those collection names.

    Args:
        db_path: directory passed to ``chromadb.PersistentClient``.
        configs: mapping of dataset name to a dict with a ``"collection"`` entry
            (collection name) and a ``"model"`` entry (model name given to
            ``SentenceTransformerEmbeddingFunction``).

    Returns:
        Dict mapping each dataset name to its newly created collection.

    Raises:
        KeyError: if a config lacks ``"collection"`` or ``"model"``.
    """
    client = chromadb.PersistentClient(path=db_path)
    collections = {}

    for dataset_name, cfg in configs.items():
        coll_name  = cfg["collection"]
        model_name = cfg["model"]

        # Deleting is best-effort: on a first run there is nothing to delete. The
        # failure reason is printed rather than silently discarded so that an
        # unexpected error (not just "collection does not exist") stays visible.
        try:
            client.delete_collection(name=coll_name)
        except Exception as exc:
            print(f"No existing collection '{coll_name}' removed ({type(exc).__name__}: {exc})")
        else:
            print(f"Deleted old collection: {coll_name}")

        # create embedding function for this dataset
        embedding_fn = SentenceTransformerEmbeddingFunction(model_name=model_name)

        # create and store collection
        collections[dataset_name] = client.create_collection(
            name=coll_name,
            embedding_function=embedding_fn
        )
        print(f"Created collection '{coll_name}' with model '{model_name}'")

    return collections


In [None]:
# Per-dataset Chroma settings: the target collection name and the model name used
# for its embedding function. Keys reuse the DATASET_* constants so the config, the
# collection lookups and run_data_loader_pipeline cannot drift apart on spelling.
DATASET_CONFIGS = {
    DATASET_HOTPOT: {
        "collection": "hpqa_data_collection",
        "model": "all-MiniLM-L6-v2"
    },
    DATASET_QASPER: {
        "collection": "qasper_data_collection",
        "model": "all-MiniLM-L6-v2"
    },
    # Team: Add configs here
}

# Recreates the collections under ./ChromaDb; collections that already exist with
# these names are deleted first, so re-running this cell re-ingests from scratch.
collections = create_chroma_collections("./ChromaDb", DATASET_CONFIGS)

hotpot_collection = collections[DATASET_HOTPOT]
qasper_collection = collections[DATASET_QASPER]

# Chunk each dataset and store it in its collection.
run_data_loader_pipeline(DATASET_HOTPOT, hotpot_collection, hotpot_df)
run_data_loader_pipeline(DATASET_QASPER, qasper_collection, qasper_df)