Ingestion - Load the input data and split it into chunks

In [2]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
import pickle  # For saving chunks if needed

# Folder that holds the numbered source files (relative to the working directory)
data_dir = "data"

# Collected chunks from every file, each stored as {'text': ..., 'source': ...}
all_chunks = []

# The splitter configuration is identical for every file, so build it once
# instead of once per loop iteration.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Maximum chunk length (characters with the default length function)
    chunk_overlap=50,  # Overlap between neighbouring chunks for context continuity
)

# Loop over files 1.txt to 10.txt (missing ones are reported and skipped)
for i in range(1, 11):
    file_path = os.path.join(data_dir, f"{i}.txt")
    if not os.path.exists(file_path):
        print(f"Skipping {file_path} (not found)")
        continue

    print(f"Loading {file_path}...")
    loader = TextLoader(file_path, encoding="utf-8")  # Handles standard text
    docs = loader.load()
    text = " ".join(doc.page_content for doc in docs)  # Just the text

    # Clean: str.split() with no arguments splits on every whitespace run,
    # newlines included, so this one step also flattens the text to one line.
    text = " ".join(text.split())

    # Chunk this file's text and remember which file each chunk came from
    file_chunks = splitter.split_text(text)
    all_chunks.extend({'text': chunk, 'source': f"{i}.txt"} for chunk in file_chunks)
    print(f"Added {len(file_chunks)} chunks from {i}.txt")

# Final count
print(f"Total chunks created: {len(all_chunks)}")

# Persist the chunks so later cells can reload them without re-reading the files
with open("chunks.pkl", "wb") as f:
    pickle.dump(all_chunks, f)
print("Chunks saved to chunks.pkl")

Loading data\1.txt...
Added 238 chunks from 1.txt
Loading data\2.txt...
Added 11 chunks from 2.txt
Loading data\3.txt...
Added 60 chunks from 3.txt
Loading data\4.txt...
Added 63 chunks from 4.txt
Loading data\5.txt...
Added 13 chunks from 5.txt
Loading data\6.txt...
Added 6 chunks from 6.txt
Loading data\7.txt...
Added 8 chunks from 7.txt
Loading data\8.txt...
Added 9 chunks from 8.txt
Loading data\9.txt...
Added 5 chunks from 9.txt
Loading data\10.txt...
Added 13 chunks from 10.txt
Total chunks created: 426
Chunks saved to chunks.pkl


Loading LLM model: Qwen/Qwen2.5-7B-Instruct-1M - better for long context

In [3]:
# from transformers import pipeline, set_seed

# generator = pipeline('question-answering', model='gpt2')
# set_seed(2025)

from huggingface_hub import InferenceClient


# Shared chat client for the hosted model; later cells call client.chat_completion(...).
client = InferenceClient("Qwen/Qwen2.5-7B-Instruct-1M")


  from .autonotebook import tqdm as notebook_tqdm


Load processed text

In [4]:
import pickle
# Reload the chunk list that the ingestion cell wrote to chunks.pkl.
# NOTE: pickle.load can execute arbitrary code - only load pickle files you created yourself.
with open("chunks.pkl", "rb") as f:
    all_chunks = pickle.load(f)
print(f"Loaded {len(all_chunks)} chunks")  # Should equal the total printed by the ingestion cell

Loaded 426 chunks


Embed the chunks above

In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np
# Load the embedding model once; the original cell loaded the same model twice.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
model = embed_model  # Alias kept so any code that refers to `model` keeps working

# Each chunk is a {'text': ..., 'source': ...} dict, so hand encode() the text
# strings explicitly rather than relying on how the library treats dict inputs.
chunk_texts = [chunk['text'] for chunk in all_chunks]
embeddings = embed_model.encode(chunk_texts)
embeddings = np.array(embeddings).astype('float32')  # FAISS needs float32
print(f"Embeddings shape: {embeddings.shape}")  # (number of chunks, embedding dimension)

Embeddings shape: (426, 384)


Using FAISS for fast vector similarity search

In [6]:
import faiss

dimension = embeddings.shape[1]  # Width of each embedding vector (second axis of the array)
index = faiss.IndexFlatL2(dimension)  # Basic flat index (exact search, good for small data)
index.add(embeddings)  # type: ignore  # Add the chunk embeddings to the index
print(f"Index built with {index.ntotal} vectors")  # Should match the chunk count

Index built with 426 vectors


Chat completion and comparison between the two pipelines (plain LLM vs. RAG)

In [7]:
#Getting response
import pandas as pd
import time
from pipelines import *  # Assuming this imports retrieve_chunks, etc.
import re
from huggingface_hub import InferenceClient  # Add if not in pipelines

# `client` must already be defined by the "Loading LLM model" cell above, e.g.
# client = InferenceClient("Qwen/Qwen2.5-7B-Instruct-1M")

# Define augment_prompt_no_query based on the idea
def augment_prompt_no_query(retrieved_chunks):
    """Build the context-bearing prompt; the user's question is not included.

    Args:
        retrieved_chunks: Iterable of sequences whose first item is a dict
            with 'source' and 'text' keys.

    Returns:
        str: The answering instructions followed by "Context: " and the
        source-tagged chunk texts, separated by blank lines.
    """
    # Tag every chunk with its origin so the model can cite it inline.
    context = "\n\n".join(
        f"[Source: {entry[0]['source']}]\n{entry[0]['text']}"
        for entry in retrieved_chunks
    )
    # The whitespace-only third line is part of the original prompt text.
    instructions = (
        'Use the following context to answer the question factually. '
        'If the context doesn\'t cover it, say "I don\'t have info on that."\n'
        "When referencing information from the context, include the "
        "[Source: x.txt] inline in your answer where it's used.\n"
        "    \n"
    )
    return instructions + "Context: " + context

def compare_ragged_outputs(queries, embed_model, index, all_chunks, max_length=300):
    """Answer each query with the plain LLM and with RAG, and tabulate both.

    Relies on the module-level ``client`` (its ``chat_completion`` method) and
    on ``retrieve_chunks`` and ``augment_prompt_no_query`` being in scope.
    Errors raised by the client are not caught and propagate to the caller.

    Args:
        queries: Iterable of question strings.
        embed_model: Passed through unchanged to ``retrieve_chunks``.
        index: Passed through unchanged to ``retrieve_chunks``.
        all_chunks: Passed through unchanged to ``retrieve_chunks``.
        max_length: Token budget sent as ``max_tokens`` with every completion
            request (previously accepted but ignored).

    Returns:
        pandas.DataFrame: One row per query with the columns ``query``,
        ``plain_answer``, ``rag_answer``, ``plain_latency`` and
        ``rag_latency`` (latencies in seconds).
    """
    # Matches inline citations such as "[Source: 3.txt]". The unescaped
    # parentheses form a capture group, so findall() yields just the file name.
    citation_pattern = re.compile(r'\[Source: (\d+\.txt)\]')

    results = []
    for query in queries:
        start_time = time.time()

        # Plain LLM: the question alone, with no retrieved context
        plain_messages = [{"role": "user", "content": query}]
        plain_response = client.chat_completion(plain_messages, max_tokens=max_length)
        plain_output = plain_response.choices[0].message.content  # Extract content
        plain_time = time.time() - start_time

        # RAG: the retrieved context goes in the system message, the question follows
        rag_start = time.time()
        retrieved = retrieve_chunks(query, embed_model, index, all_chunks)
        rag_messages = [
            {"role": "system", "content": augment_prompt_no_query(retrieved)},
            {"role": "user", "content": query},
        ]
        rag_response = client.chat_completion(rag_messages, max_tokens=max_length)
        # Normalise a missing/empty answer so the concatenation below cannot fail.
        ragged_output = rag_response.choices[0].message.content or ""

        # Sources the model actually cited in its answer
        used_sources = set(citation_pattern.findall(ragged_output))

        # If no citations parsed, fall back to all retrieved sources
        if not used_sources:
            used_sources = {chunk_tuple[0]['source'] for chunk_tuple in retrieved}

        # Sort and format
        if used_sources:
            sources_str = "Sources: " + ", ".join(sorted(used_sources))
        else:
            sources_str = "No sources used."

        # Append the source list to the RAG answer
        full_output = ragged_output + "\n\n" + sources_str

        rag_time = time.time() - rag_start

        results.append({
            'query': query,
            'plain_answer': plain_output,
            'rag_answer': full_output,
            'plain_latency': plain_time,
            'rag_latency': rag_time
        })
        print(f"Processed: {query} | Plain: {plain_time:.2f}s | RAG: {rag_time:.2f}s")

    # Saving is left to the caller, e.g. df.to_csv('comparison_results.csv', index=False)
    df = pd.DataFrame(results)
    return df



In [8]:
# TEST
# Smoke-test the comparison with three quick queries first
test_queries = ["Tell me what happened to Joe Biden's , the 46th president of the USA, wife", "Tell me Where was Joe Biden, the 46th president of the USA, born", "Tell me where did Joe Biden, the 46th president of the USA, graduated"]
df = compare_ragged_outputs(test_queries, embed_model, index, all_chunks, max_length=100)
df
# RAG shows no hallucination and cites its sources. Plain LLM hallucinates more.
# NOTE(review): the recorded output of this cell is a 504 Gateway Time-out, so re-run to confirm this observation.

HfHubHTTPError: 504 Server Error: Gateway Time-out for url: https://router.huggingface.co/featherless-ai/v1/chat/completions