In [29]:
import numpy as np
from typing import List, Tuple, Dict
import openai
import os
from dotenv import load_dotenv
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
import pickle

In [None]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI credentials come from — confirm.
load_dotenv()

In [31]:
# Notebook-wide OpenAI client; create_summaries and generate_response both call it.
# NOTE(review): no api_key is passed, so it presumably reads credentials from the environment — confirm.
client = openai.OpenAI()

In [None]:
# Sentence-embedding model shared by every embedding step in this notebook
# (document summaries, document chunks and the queries themselves).
model = SentenceTransformer('all-MiniLM-L6-v2')

In [33]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The page texts joined with no separator. A page that yields no
        text contributes an empty string.
    """
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # `or ""` keeps a page whose extraction yields None from raising a
        # TypeError during concatenation; join() also avoids rebuilding the
        # accumulated string once per page.
        return "".join(page.extract_text() or "" for page in reader.pages)

In [34]:
def load_documents(folder_path):
    """Extract the text of every PDF file directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict: Maps each PDF's file name to its extracted text, inserted in
        sorted file-name order.
    """
    documents = {}
    # Sorted so the document order does not depend on the filesystem's
    # listing order (later steps map index positions back to dict order).
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        # Skip sub-directories and non-PDF entries (.DS_Store, checkpoint
        # folders, ...) that would make the PDF reader raise.
        if not os.path.isfile(file_path) or not file.lower().endswith('.pdf'):
            continue
        documents[file] = extract_text_from_pdf(file_path)
    return documents

In [48]:
def create_summaries(docs):
    """Ask the chat model for a summary of each document.

    Args:
        docs: Mapping of document name to full document text.

    Returns:
        dict: Maps each document name to the message content of the model's
        first choice, in the iteration order of `docs`.
    """
    system_prompt = "You are a helpful assistant that generates a summary of an academic calendar, highlighting mainly the academic year and semester. Ensure that summary is concise and informative, since the metadata is extremely important for further tasks."

    def summarize(text):
        # One chat-completion request per document; the whole text is sent
        # as the user message.
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
        )
        return completion.choices[0].message.content

    return {name: summarize(text) for name, text in docs.items()}

In [36]:
def chunk_documents(docs, chunk_size=1000):
    """Cut each document into consecutive fixed-size character chunks.

    Args:
        docs: Mapping of file name to document text.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        dict: Maps each file name to its list of chunks. The last chunk may
        be shorter than `chunk_size`; an empty text yields an empty list.
    """
    def split(text):
        # Walk the text in strides of chunk_size; chunks never overlap.
        pieces = []
        for start in range(0, len(text), chunk_size):
            pieces.append(text[start:start + chunk_size])
        return pieces

    return {name: split(text) for name, text in docs.items()}

In [37]:
def create_embeddings(docs):
    """Encode a list of texts with the notebook's shared embedding model.

    Args:
        docs: The texts to embed.

    Returns:
        Whatever `model.encode` returns for `docs`.
        NOTE(review): presumably a 2-D array with one row per text, since
        the result is handed to create_faiss_index, which reads `.shape[1]`
        — confirm.
    """
    embeddings = model.encode(docs)
    return embeddings

In [38]:
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index holding the given embedding rows.

    Args:
        embeddings: 2-D array-like with one embedding per row; the second
            dimension sets the index dimensionality.

    Returns:
        A `faiss.IndexFlatL2` containing every row, in row order.
    """
    # FAISS only accepts float32, C-contiguous matrices. For input that is
    # already in that form this is a no-op (no copy is made).
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

In [39]:
def level1_rag(query, summaries, index, top_k):
    """Return the names of the documents whose summaries best match a query.

    Args:
        query: The question text.
        summaries: Mapping of document name to summary. Position i in the
            index is taken to be the i-th key of this mapping.
        index: Search index over the summary embeddings.
        top_k: Maximum number of documents to return.

    Returns:
        list: Up to `top_k` document names, best match first. Fewer names
        are returned when fewer summaries exist.
    """
    # Build the position -> name lookup once instead of once per hit.
    doc_names = list(summaries)
    if not doc_names or top_k <= 0:
        return []

    query_embedding = model.encode([query])

    # Finds the top_k most similar documents, using the summary embeddings.
    # Never ask for more hits than there are documents: FAISS pads missing
    # results with -1, which as a Python list index would silently select
    # the last document.
    _, indices = index.search(query_embedding, min(top_k, len(doc_names)))

    # Drop any remaining padding / out-of-range ids before the lookup.
    return [doc_names[i] for i in indices[0] if 0 <= i < len(doc_names)]

In [40]:
def level2_rag(query, chunks, index, top_k):
    """Return the chunks that best match a query.

    Args:
        query: The question text.
        chunks: Sequence of text chunks; position i in the index is taken
            to be `chunks[i]`.
        index: Search index over the chunk embeddings.
        top_k: Maximum number of chunks to return.

    Returns:
        list: Up to `top_k` `(chunk_position, chunk_text)` tuples, best
        match first. Fewer are returned when fewer chunks exist.
    """
    n_chunks = len(chunks)
    if n_chunks == 0 or top_k <= 0:
        return []

    query_embedding = model.encode([query])

    # Finds the top_k most similar chunks, using the chunk embeddings.
    # Clamp k to the number of chunks: FAISS pads missing results with -1,
    # and chunks[-1] would silently return the last chunk again.
    _, indices = index.search(query_embedding, min(top_k, n_chunks))

    # Drop any remaining padding / out-of-range ids before the lookup.
    return [(int(idx), chunks[int(idx)]) for idx in indices[0] if 0 <= idx < n_chunks]

In [41]:
def generate_response(query, context):
    """Answer a question with the chat model, using retrieved text as context.

    Args:
        query: The user's question.
        context: Iterable of text passages; they are joined with newlines
            to form the context section of the prompt.

    Returns:
        The message content of the first choice in the model's reply.
    """
    context_text = "\n".join(context)

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that answers questions about academic calendars based on the given context.",
    }
    user_message = {
        "role": "user",
        "content": f"Context:\n{context_text}\n\nQuestion: {query}\nPlease answer the question based on the given context from the academic calendars.",
    }

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [49]:
def hierarchical_rag(query, documents, summaries, summary_indices, chunk_indices, chunk_size,
                     top_k_docs=2, top_k_chunks=3):
    """Answer a query with two-level retrieval: pick documents, then chunks.

    Level 1 selects the best-matching documents via their summaries; level 2
    selects the best-matching chunks inside each selected document. The
    selected chunks are passed to the chat model as context.

    Args:
        query: The question text.
        documents: Mapping of file name to full document text.
        summaries: Mapping of file name to summary (same key order as the
            summary index was built from).
        summary_indices: Search index over the summary embeddings.
        chunk_indices: Mapping of file name to that document's chunk index.
        chunk_size: Chunk length in characters. Must equal the size used
            when `chunk_indices` was built, because chunks are regenerated
            here and looked up by index position.
        top_k_docs: Number of documents kept at level 1 (default 2).
        top_k_chunks: Number of chunks kept per document at level 2
            (default 3).

    Returns:
        The model's answer text, as returned by generate_response.

    Side effects:
        Prints the list of documents selected at level 1.
    """
    top_doc_filenames = level1_rag(query, summaries, summary_indices, top_k_docs)
    print(top_doc_filenames)

    all_top_chunks = []
    for filename in top_doc_filenames:
        # Regenerate this document's chunks so index positions can be
        # mapped back to text.
        chunks = chunk_documents({filename: documents[filename]}, chunk_size)[filename]
        top_chunks = level2_rag(query, chunks, chunk_indices[filename], top_k_chunks)
        all_top_chunks.extend(chunk for _, chunk in top_chunks)

    return generate_response(query, all_top_chunks)

In [50]:
# Folder whose files are read as PDFs.
# NOTE(review): relative path, so it presumably resolves against the kernel's
# working directory — confirm.
folder_path = "academic_calendars"
# Maps each file name in the folder to the text extracted from that file.
documents = load_documents(folder_path)

In [51]:
# Level-1 data: one model-generated summary per document, embedded and
# indexed so a query can be matched against whole documents.
# create_summaries issues one chat-completion request per document, so
# re-running this cell repeats those requests.
summaries = create_summaries(documents)
# Embeddings are built from summaries.values(), so row i corresponds to the
# i-th key of `summaries`; level1_rag relies on that ordering.
summary_embeddings = create_embeddings(list(summaries.values()))
summary_index = create_faiss_index(summary_embeddings)

In [52]:
# Level-2 data: a separate index per document, built over that document's
# 1000-character chunks and keyed by file name.
chunk_indices = {}
for filename, content in documents.items():
    # The chunk size used here (1000) has to equal the chunk_size later
    # passed to hierarchical_rag, which re-chunks the text and maps index
    # positions back to chunks.
    chunks = chunk_documents({filename: content}, 1000)[filename]
    chunk_embeddings = create_embeddings(chunks)
    chunk_indices[filename] = create_faiss_index(chunk_embeddings)

In [None]:
# Single example query run through the full two-level pipeline.
query = "When was NUS Well-Being Day in Semester 1 of academic year 2022-2023?"
# The last argument is the chunk size; it matches the 1000 used when
# chunk_indices was built. The answer is stored in `result`; this cell does
# not display it (hierarchical_rag only prints the selected documents).
result = hierarchical_rag(query, documents, summaries, summary_index, chunk_indices, 1000)

In [55]:
# Testing with a variety of queries
# Every entry needs its own trailing comma: adjacent string literals without
# one are silently concatenated into a single query.
queries = [
    "When was NUS Well-Being Day in Semester 1 of academic year 2022-2023?",
    "When does 'F' grade come into effect in Semester 2 academic year 2024-2025?",
    "How many holidays are there in Semester 2 of academic year 2023-2024?",
    "When does Special Term 1 start in academic year 2021-2022?",
]

In [None]:
# Run every test query through the hierarchical pipeline and print each
# answer. Note that the loop rebinds the notebook-level names `query` and
# `result`, so after this cell they hold the last query and its answer.
for query in queries:
    result = hierarchical_rag(query, documents, summaries, summary_index, chunk_indices, 1000)
    print("Query:", query)
    print("Response:", result)

In [61]:
# Comparing with a vanilla RAG (only one level, no summaries are used).
# Retrieval runs over the chunks of all documents in one flat index.
# Embedding each whole document instead would only represent its opening
# passage (the embedding model truncates long inputs), would hand entire
# documents to the LLM as context, and would ask for 5 hits from an index
# that may hold fewer than 5 entries.
all_documents = list(documents.values())
all_chunks = [
    chunk
    for doc_chunks in chunk_documents(documents, 1000).values()
    for chunk in doc_chunks
]
all_embeddings = create_embeddings(all_chunks)
all_index = create_faiss_index(all_embeddings)
query = "When was NUS Well-Being Day in Semester 1 of academic year 2022-2023?"
top_k_chunks = level2_rag(query, all_chunks, all_index, 5)
response = generate_response(query, [chunk for _, chunk in top_k_chunks])

In [None]:
# Testing the level 1 RAG on its own: which documents does the summary index
# select for this query?
query = "When was NUS Well-Being Day in Semester 1 of academic year 2022-2023?"
# Requests up to 5 documents. The result is stored in `top_docs`; this cell
# does not display it.
# NOTE(review): if fewer than 5 summaries are indexed, check how level1_rag
# handles the shortfall.
top_docs = level1_rag(query, summaries, summary_index, 5)