In [28]:
import re

file_path = 'D:/IBA/8th Semester/Musab_Bilal_RAG/src/Pipeline/prog-ann.txt'
# Read the content of the original file
with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
    content = file.read()

# Replace all tab characters with spaces
content = content.replace('\t', ' ')

# Shorten multiple spaces to a single space
content = re.sub(' +', ' ', content)

# Define the path for the new file
new_file_path = 'D:/IBA/8th Semester/Musab_Bilal_RAG/src/Pipeline/prog-llamaparse_spaces.txt'

# Write the modified content to the new file
with open(new_file_path, 'w', encoding='utf-8') as new_file:
    new_file.write(content)

In [29]:
import re
import tiktoken

def load_text(file_path):
    """Loads text from a .txt file and normalizes whitespace."""
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    
    text = re.sub(r"\n{3,}", "\n\n", text.strip())  # Normalize excessive newlines
    text = re.sub(r" +", " ", text)  # Replace multiple spaces with a single space
    return text

def split_into_sentences(text):
    """Splits text into sentences while keeping boundaries intact."""
    return re.split(r"(?<=[.!?])\s+", text)

def merge_small_chunks(chunks, min_tokens, tokenizer):
    """Merges small chunks intelligently while maintaining coherence."""
    merged_chunks = []
    current_chunk = ""

    for chunk in chunks:
        token_count = len(tokenizer.encode(current_chunk + " " + chunk))

        if token_count < min_tokens and current_chunk:
            current_chunk += " " + chunk
        else:
            if current_chunk:
                merged_chunks.append(current_chunk.strip())
            current_chunk = chunk

    if current_chunk:
        merged_chunks.append(current_chunk.strip())

    return merged_chunks

def enforce_max_chunk_size(chunks, max_tokens, tokenizer, overlap_ratio=0.15):
    """Ensures chunks do not exceed max token size while preserving coherence."""
    final_chunks = []

    for chunk in chunks:
        tokens = tokenizer.encode(chunk)

        if len(tokens) <= max_tokens:
            final_chunks.append(chunk)
        else:
            sentences = split_into_sentences(chunk)
            temp_chunk = ""

            for sentence in sentences:
                temp_chunk_tokens = len(tokenizer.encode(temp_chunk + " " + sentence))

                if temp_chunk_tokens <= max_tokens:
                    temp_chunk += " " + sentence
                else:
                    final_chunks.append(temp_chunk.strip())
                    overlap_tokens = int(len(tokenizer.encode(temp_chunk)) * overlap_ratio)
                    temp_chunk = tokenizer.decode(tokenizer.encode(temp_chunk)[-overlap_tokens:]) + " " + sentence

            if temp_chunk:
                final_chunks.append(temp_chunk.strip())

    return final_chunks

def process_text(file_path, min_tokens=200, max_tokens=250, overlap_ratio=0.15):
    """Processes text into optimized chunks based on token constraints."""
    text = load_text(file_path)
    tokenizer = tiktoken.get_encoding("cl100k_base")

    sentences = split_into_sentences(text)
    merged_chunks = merge_small_chunks(sentences, min_tokens, tokenizer)
    final_chunks = enforce_max_chunk_size(merged_chunks, max_tokens, tokenizer, overlap_ratio)
    
    return final_chunks

# Usage
file_path = 'D:/IBA/8th Semester/Musab_Bilal_RAG/src/Pipeline/prog-ann.txt'
chunks = process_text(file_path)
print(len(chunks), "chunks generated.")


336 chunks generated.


In [30]:
tokenizer = tiktoken.get_encoding("cl100k_base")

for chunk in chunks:
    if len(chunk) > 1000:
        print(f"Chunk size: {len(chunk)} characters, {len(tokenizer.encode(chunk))} tokens")


Chunk size: 1506 characters, 443 tokens
Chunk size: 1008 characters, 193 tokens
Chunk size: 2229 characters, 695 tokens
Chunk size: 1037 characters, 188 tokens
Chunk size: 1060 characters, 198 tokens
Chunk size: 1032 characters, 199 tokens
Chunk size: 1264 characters, 360 tokens
Chunk size: 1016 characters, 194 tokens
Chunk size: 1042 characters, 198 tokens
Chunk size: 1010 characters, 184 tokens
Chunk size: 1030 characters, 193 tokens
Chunk size: 1565 characters, 392 tokens
Chunk size: 1138 characters, 319 tokens
Chunk size: 1264 characters, 311 tokens
Chunk size: 1505 characters, 384 tokens
Chunk size: 1084 characters, 195 tokens
Chunk size: 1016 characters, 183 tokens
Chunk size: 3358 characters, 897 tokens
Chunk size: 1037 characters, 197 tokens
Chunk size: 2880 characters, 753 tokens
Chunk size: 1193 characters, 198 tokens
Chunk size: 1067 characters, 197 tokens
Chunk size: 1052 characters, 179 tokens
Chunk size: 1071 characters, 252 tokens
Chunk size: 1047 characters, 199 tokens


In [31]:

from nomic import embed
import numpy as np

output = embed.text(
    texts=chunks,
    model='nomic-embed-text-v1.5',
    task_type='search_document',
)

embeddings = np.array(output['embeddings'])
print(embeddings[0].shape)  # prints: (768,)

(768,)


In [32]:
vectors = []
for i, sentence in enumerate(chunks):
    vectors.append({
        "id": f"txt{i}",
        "values": embeddings[i].tolist(),
        "metadata": {
            "text": sentence
        }
    })

In [33]:
vectors[1]["metadata"]["text"]


'IBA Institute of\n Business Administr ation\n Karachi\nLeadership and Ideas for Tomorrow\nPROGRA M ANNOU NCEMENT\n2024-25\n 2\n V\n---\nPROGRAM\n ANNOUNCEMENT 2024-25\n---\n PROGRAM 03 Table of Conter ts\n ANNOUNCEMENT 2024-25\n\nMessage from the Executive Director 04 School of Economics & Social Sciences (SESS) 49\nMessage from the Registrar 05 Programs on Offer 51\nAcademic calendar 2024-2025 08 BS (Economics) 54\nIBA in Numbers 10 BS (Economics and Mathematics) 59\nAcademic programs 11 BS (Social Sciences and Liberal Arts) 64\nDeans and Chairpersons 14 MS (Development Studies) 75\nFee structure 15 MS (Economics) 77\nFinancial assistance program 16 MS (Journalism) 79\nFacilities at IBA 17 PhD (Economics) 80\nStudent services 20\nOfﬁce of Student Affairs 22 School of Mathematics & Computer Science (SMCS) 82\nActivities studio 25 Programs on Offer 83\n BS (Computer Science) 86\nSchools BS (Mathematics) 92\n MS (Computer Science) 95\nSchool of Business Studies (SBS) 28 MS (Data Science

In [34]:
from pinecone.grpc import PineconeGRPC as Pinecone
from dotenv import load_dotenv
import os
load_dotenv()


pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
# print("Pinecone Indices: ", pc.list_indexes())

index = pc.Index(name="musab-bilal-rag")
index.describe_index_stats()

index.upsert  ( 
    namespace="prog-ann",
    batch_size=32,
    vectors=vectors
    # vectors=embedding,
)

Upserted vectors: 100%|██████████| 336/336 [00:06<00:00, 55.24it/s]


upserted_count: 336