Ingestion – load the input data and split it into chunks

In [1]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader, PyPDFLoader
import pickle  # For saving chunks if needed

# Ingestion: read every supported file in data_dir, split its text into
# overlapping chunks, and persist the chunks plus an id -> file-name map.

# Path to your data folder (relative, so it resolves against the working directory)
data_dir = "data"

all_chunks = []   # one {'text': ..., 'source_id': ...} dict per chunk
source_map = {}   # source_id -> file name, used later to cite sources by name

# Only .txt and .pdf files are supported. os.listdir() returns entries in
# arbitrary order, so sort to keep the source ids stable from run to run.
files = sorted(f for f in os.listdir(data_dir) if f.lower().endswith(('.txt', '.pdf')))

# The splitter holds only configuration, so build it once instead of per file.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,   # maximum chunk length
    chunk_overlap=50  # overlap for context continuity between neighbouring chunks
)

# `source_id` rather than `id`: a top-level loop variable called `id` would
# shadow the builtin id() for every later cell in the kernel.
for source_id, file_name in enumerate(files, start=1):
    file_path = os.path.join(data_dir, file_name)

    # Load according to the file extension (the filter above admits only these two)
    if file_name.lower().endswith(".txt"):
        loader = TextLoader(file_path, encoding="utf-8")
    else:
        loader = PyPDFLoader(file_path)

    docs = loader.load()
    # Join all pages, then collapse every whitespace run (newlines included)
    # into a single space.
    text = " ".join(" ".join(doc.page_content for doc in docs).split())

    file_chunks = splitter.split_text(text)

    # Register the source only after it loaded and split successfully.
    source_map[source_id] = file_name
    all_chunks.extend({'text': chunk, 'source_id': source_id} for chunk in file_chunks)

    print(f"Processed {file_name[:10]} as source_id {source_id}, added {len(file_chunks)} chunks")


# Final count
print(f"Total chunks created: {len(all_chunks)}")

# Save data so later cells can reload it without re-running ingestion
with open("chunks.pkl", "wb") as f:
    pickle.dump(all_chunks, f)
with open("source_map.pkl", "wb") as f:
    pickle.dump(source_map, f)

print("Data retrieval is complete")

  from .autonotebook import tqdm as notebook_tqdm


Processed 1.txt as source_id 1, added 238 chunks
Processed 10.txt as source_id 2, added 13 chunks
Processed 2.txt as source_id 3, added 11 chunks
Processed 3.txt as source_id 4, added 60 chunks
Processed 4.txt as source_id 5, added 63 chunks
Processed 5.txt as source_id 6, added 13 chunks
Processed 6.txt as source_id 7, added 6 chunks
Processed 7.txt as source_id 8, added 8 chunks
Processed 8.txt as source_id 9, added 9 chunks
Processed 9.txt as source_id 10, added 5 chunks
Total chunks created: 426
Data retrieval is complete


Hugging Face model: openai/gpt-oss-20b — intended for lower latency and for local or specialized use cases (21B parameters, of which 3.6B are active)

In [2]:
# from transformers import pipeline, set_seed

# generator = pipeline('question-answering', model='gpt2')
# set_seed(2025)

from huggingface_hub import InferenceClient


# Module-level inference client bound to the openai/gpt-oss-20b model.
# compare_ragged_outputs (defined in a later cell) calls client.chat_completion
# on this object, so this cell must run before the comparison cell.
# NOTE(review): no token is passed here — presumably credentials are picked up
# from the environment; confirm before running on a fresh machine.
client = InferenceClient(model="openai/gpt-oss-20b")


Load processed text

In [3]:
import pickle
# Reload the chunk list written to chunks.pkl by the ingestion cell, so the
# cells below can run without repeating ingestion.
# NOTE: pickle.load can execute arbitrary code from the file — only load a
# chunks.pkl that you produced yourself.
with open("chunks.pkl", "rb") as f:
    all_chunks = pickle.load(f)
print(f"Loaded {len(all_chunks)} chunks")  # the recorded run reports 426 chunks

Loaded 426 chunks


Embed the chunks above

In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np
# Load the sentence-embedding model once. `model` is kept as an alias of the
# same object for backward compatibility, so the weights are not loaded twice.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
model = embed_model

# all_chunks holds dicts ({'text': ..., 'source_id': ...}), so hand encode()
# the text strings explicitly instead of relying on how it treats dict inputs.
chunk_texts = [chunk['text'] for chunk in all_chunks]
embeddings = embed_model.encode(chunk_texts)
embeddings = np.asarray(embeddings, dtype='float32')  # FAISS needs float32
print(f"Embeddings shape: {embeddings.shape}")  # (number of chunks, embedding dimension)

Embeddings shape: (426, 384)


Using FAISS for fast vector similarity search

In [5]:
import faiss

# Build the vector index over the chunk embeddings computed in the previous cell.
dimension = embeddings.shape[1]  # embedding width; 384 in the recorded run
index = faiss.IndexFlatL2(dimension)  # Basic flat L2 index (exact search, good for small data)
index.add(embeddings)  # Add the chunk vectors to the index
print(f"Index built with {index.ntotal} vectors")  # Should match the chunk count

Index built with 426 vectors


Chat completion and comparison between the two pipelines (plain LLM vs. RAG)

In [None]:
import pandas as pd
import time
from pipelines import * 
import re
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pickle  


def augment_prompt_no_query(retrieved_chunks):
    """Build the RAG system prompt from the retrieved chunks.

    The user query is deliberately not included here; the caller sends it as a
    separate user message.

    Args:
        retrieved_chunks: iterable of tuples whose first element is a chunk
            dict with 'text' and 'source_id' keys. Any further tuple elements
            are ignored.

    Returns:
        str: the answering instructions followed by the context, in which
        every chunk is prefixed with a "[Source ID: <id>]" tag.
    """
    context = "\n\n".join(
        f"[Source ID: {chunk_tuple[0]['source_id']}]\n{chunk_tuple[0]['text']}"
        for chunk_tuple in retrieved_chunks
    )
    # The fallback sentence is quoted on its own: previously the closing quote
    # sat after the "You MUST ..." instruction, making that instruction part of
    # the phrase the model was told to say. Building the text from plain
    # strings also keeps source-code indentation out of the prompt.
    instructions = (
        "Use the following context to answer the question factually. "
        "If the context doesn't cover it, say \"I don't have info on that.\" "
        "You MUST always provide a response—do not return nothing or an empty answer."
    )
    return f"{instructions}\n\nContext: {context}"  # this is prompt engineering

 # Use embedding model for Post-Generation Similarity Matching in conjunction with traditional regex searching
def attribute_sources(ragged_output, retrieved_chunks, embed_model, threshold=0.7):
    """Attribute a generated answer to the retrieved chunks it most resembles.

    The answer is split into sentences on '.', each sentence is embedded, and
    its cosine similarity to every retrieved chunk is computed. A chunk's
    source id is recorded when it is the best match for some sentence and that
    similarity is at least `threshold`.

    Args:
        ragged_output: the generated answer text (may be empty or None).
        retrieved_chunks: sequence of tuples whose first element is a chunk
            dict with 'text' and 'source_id' keys.
        embed_model: object exposing encode(list_of_str) that returns one
            embedding per input string.
        threshold: minimum cosine similarity for a match to count.

    Returns:
        list: sorted, de-duplicated source ids; empty when there is no answer
        text, no sentence, or no retrieved chunk.
    """
    # Without an answer or without chunks there is nothing to match; returning
    # early also avoids an argmax over an empty similarity row.
    if not ragged_output or not retrieved_chunks:
        return []

    # NOTE: a plain split on '.' also breaks on decimals and abbreviations.
    sentences = [s.strip() for s in ragged_output.split('.') if s.strip()]
    if not sentences:
        return []

    # Embed response sentences and retrieved chunk texts
    sent_embeds = embed_model.encode(sentences)
    chunk_texts = [chunk_tuple[0]['text'] for chunk_tuple in retrieved_chunks]
    chunk_embeds = embed_model.encode(chunk_texts)

    used_source_ids = set()
    for sent_embed in sent_embeds:
        similarities = util.cos_sim(sent_embed, chunk_embeds)[0]
        # Plain int/float keep list indexing and the comparison independent of
        # the array/tensor type that cos_sim returns.
        best_idx = int(np.argmax(similarities))
        if float(similarities[best_idx]) >= threshold:
            used_source_ids.add(retrieved_chunks[best_idx][0]['source_id'])

    return sorted(used_source_ids)

def compare_ragged_outputs(queries, embed_model, index, all_chunks, max_length=300):
    """Answer each query with a plain LLM call and with RAG, and tabulate both.

    For every query the function
      1. sends the query alone to the module-level `client`,
      2. retrieves chunks with `retrieve_chunks`, builds a system prompt with
         augment_prompt_no_query, and sends that plus the query to `client`,
      3. works out which sources the RAG answer used (inline "[Source ID: n]"
         citations plus embedding-similarity attribution), falling back to all
         retrieved sources when neither finds any, and
      4. records both answers and their wall-clock latencies.

    Args:
        queries: iterable of query strings.
        embed_model: embedding model passed to retrieve_chunks and
            attribute_sources.
        index: vector index forwarded to retrieve_chunks.
        all_chunks: chunk list forwarded to retrieve_chunks.
        max_length: max_tokens limit applied to both chat completions.

    Returns:
        pandas.DataFrame with columns query, plain_answer, rag_answer,
        plain_latency and rag_latency (latencies in seconds).

    Notes:
        Relies on the module-level `client` and on `retrieve_chunks`, which is
        not defined in this notebook — presumably it comes from the
        `pipelines` star import; confirm. Reads source_map.pkl from the
        working directory.
    """
    # NOTE: pickle.load can execute arbitrary code from the file — only load a
    # source_map.pkl written by this notebook's ingestion cell.
    with open("source_map.pkl", "rb") as f:
        SOURCE_MAP = pickle.load(f)

    results = []
    for query in queries:
        # ---- Plain LLM ----
        start_time = time.time()
        plain_messages = [{"role": "system", "content": "You MUST always provide a response—do not return nothing or an empty answer"}, {"role": "user", "content": query}]
        # Honour the caller's max_length instead of a hard-coded 300.
        plain_response = client.chat_completion(plain_messages, max_tokens=max_length)
        plain_output = plain_response.choices[0].message.content
        plain_time = time.time() - start_time

        if plain_output is None:  # the message content can come back as None
            plain_output = ""  # Treat as empty string
            print(f"Warning: LLM returned None for query '{query}'—handling as empty.")

        # ---- RAG ----
        rag_start = time.time()
        retrieved = retrieve_chunks(query, embed_model, index, all_chunks)
        rag_messages = [{"role": "system", "content": augment_prompt_no_query(retrieved)}, {"role": "user", "content": query}]  # System first
        rag_response = client.chat_completion(rag_messages, max_tokens=max_length)
        ragged_output = rag_response.choices[0].message.content

        if ragged_output is None:
            ragged_output = ""  # Treat as empty string
            print(f"Warning: RAG model returned None for query '{query}'—handling as empty.")

        # Step 1: inline citations. The capture group makes findall return
        # only the digits; without it the whole "[Source ID: n]" string came
        # back and int() raised ValueError as soon as the model cited a source.
        used_source_ids_regex = {int(sid) for sid in re.findall(r'\[Source ID: (\d+)\]', ragged_output)}

        # Step 2: similarity matching as the primary method
        used_source_ids_sim = {int(sid) for sid in attribute_sources(ragged_output, retrieved, embed_model)}

        # Combine both signals and map the ids to file names
        used_source_ids = used_source_ids_sim | used_source_ids_regex
        used_sources = {SOURCE_MAP[sid] for sid in used_source_ids if sid in SOURCE_MAP}

        # If no citations were found, fall back to all retrieved sources
        if not used_sources:
            used_source_ids_all = set(chunk_tuple[0]['source_id'] for chunk_tuple in retrieved)
            used_sources = {SOURCE_MAP[sid] for sid in used_source_ids_all if sid in SOURCE_MAP}

        # Sort and format
        sources_list = sorted(used_sources)
        sources_str = "Sources: " + ", ".join(sources_list) if sources_list else "No sources used."

        # Append the source line to the answer
        full_output = ragged_output + "\n\n" + sources_str if ragged_output else "No output generated."

        # RAG latency covers retrieval, generation and source attribution
        rag_time = time.time() - rag_start

        results.append({
            'query': query,
            'plain_answer': plain_output,
            'rag_answer': full_output,
            'plain_latency': plain_time,
            'rag_latency': rag_time
        })
        print(f"Processed: {query} | Plain: {plain_time:.2f}s | RAG: {rag_time:.2f}s")

    # Save to CSV if you want
    df = pd.DataFrame(results)
    # df.to_csv('comparison_results.csv', index=False)
    # print("Results saved to comparison_results.csv")
    return df

In [None]:
# Smoke test
# Run three quick queries through both pipelines and display the comparison table.
# The query strings are sent verbatim to retrieval and to the LLM.
test_queries = ["What happened to Bidens' wife?", "Where was Joe Biden born?", "Where did Joe Biden graduated?"]
df = compare_ragged_outputs(test_queries, embed_model, index, all_chunks, max_length=100)
df
# Author's observation: the RAG answers stay within the retrieved context and
# cite their sources, while the plain LLM hallucinates more.

Processed: What happened to Bidens' wife? | Plain: 2.85s | RAG: 1.67s
Processed: Where was Joe Biden born? | Plain: 1.00s | RAG: 1.13s
Processed: Where did Joe Biden graduated? | Plain: 2.31s | RAG: 2.22s


Unnamed: 0,query,plain_answer,rag_answer,plain_latency,rag_latency
0,What happened to Bidens' wife?,There’s no credible evidence that anything bad...,I don't have info on that.\n\nSources: 1.txt,2.852008,1.666628
1,Where was Joe Biden born?,"Joe Biden was born in Scranton, **Pennsylvania...","Joe Biden was born in Scranton, Pennsylvania.\...",1.001261,1.129236
2,Where did Joe Biden graduated?,,Joe Biden graduated from the University of Del...,2.311671,2.221757
