In [None]:
# Third-party packages this notebook imports; uncomment to install them in a fresh environment.
# !pip install pdfplumber
# !pip install langchain-text-splitters
# !pip install sentence_transformers
# !pip install faiss-cpu
# !pip install groq


Conversion to plain text

In [None]:
import os
import pdfplumber

# Destination directory for the extracted text; created on demand.
output_folder = "textconversion"
os.makedirs(output_folder, exist_ok=True)

# All pages are appended to this single UTF-8 text file.
output_path = os.path.join(output_folder, "output.txt")

# Walk the PDF page by page and write out whatever text each page yields.
with pdfplumber.open("Company-Policy-and-Procedure-June-1.18-V6.0.pdf") as pdf:
    with open(output_path, "w", encoding="utf-8") as f:
        for page in pdf.pages:
            t = page.extract_text()
            if not t:
                # Nothing extracted for this page; skip it.
                continue
            f.write(t + '\n')


In [None]:
# Read the extracted text back from the same relative location the conversion
# cell writes to (textconversion/output.txt). The previous hard-coded
# "/content/..." path only resolved when the working directory was /content.
with open(os.path.join("textconversion", "output.txt"), "r", encoding="utf-8") as document:
    text = document.read()

In [None]:
# Sanity check: size of the loaded document text, in characters.
total_chars = len(text)
print(total_chars)

Text Splitting

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the document into chunks of at most 500 units with a 50-unit overlap
# between neighbours (units are characters under the splitter's default
# length function).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
texts = text_splitter.split_text(text)



Conversion of Chunks to Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Embed every chunk with the all-MiniLM-L6-v2 sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')
vectors = model.encode(texts)

# Persist the embeddings to vectors/embeddings.npy so they can be reloaded later.
vector_folder = "vectors"
output_path = os.path.join(vector_folder, "embeddings.npy")
os.makedirs(vector_folder, exist_ok=True)
np.save(output_path, vectors)




Visualize chunks

In [None]:
# Print every chunk followed by its whitespace-delimited word count.
print("\n🔍 Sample Chunks and Their Lengths:\n")
separator = '-' * 40
for number, chunk in enumerate(texts, start=1):
    word_count = len(chunk.split())
    print(f"Chunk {number}:")
    print(chunk)
    print(f"Token Length: {word_count} words\n{separator}")


Store Embeddings

In [None]:
import faiss

# Build a flat L2 index whose dimensionality matches the embedding width,
# then load all chunk vectors into it.
dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(vectors))

# Write the populated index to embeddings/faiss_index.index.
embedding_folder = "embeddings"
index_path = os.path.join(embedding_folder, "faiss_index.index")
os.makedirs(embedding_folder, exist_ok=True)
faiss.write_index(index, index_path)



Query Search

In [None]:
# Embed the question and fetch its single nearest chunk from the index.
query = "Age to acess the TL Website?"
query_embedding = model.encode([query])
D, I = index.search(np.array(query_embedding), k=1)

# I holds one row of chunk ids per query; only one query was submitted.
nearest_ids = I[0]
for idx in nearest_ids:
    matched_text = texts[idx]
    print(f"Match: {matched_text}")

Importing GROQ

In [None]:
import os
from groq import Groq

# Read the Groq API key from the environment rather than embedding it in the
# notebook, so the secret is never saved or shared with the file.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    # Fail here with a clear message instead of at the first API call.
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it in the environment before running this cell."
    )

client = Groq(api_key=groq_api_key)


Query Searching and LLM Generation

In [None]:
# Embed the question and retrieve its three nearest chunks.
query = "who makes the complains?"
query_embedding = model.encode([query])
D, I = index.search(np.array(query_embedding), k=3)

# Use every retrieved chunk as context (previously only the first of the three
# hits was used). FAISS pads the id row with -1 when fewer than k vectors are
# available, so those placeholders are skipped rather than indexing texts[-1].
context = "\n\n".join(texts[idx] for idx in I[0] if idx >= 0)

# Prompt template: retrieved context first, then the question.
rag_prompt = f"""
Use the context below to answer the question.

Context:
{context}

Question:
{query}

Answer:
"""


In [None]:
# Conversation sent to the model: a fixed system instruction plus the RAG prompt.
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant using provided context."},
    {"role": "user", "content": rag_prompt},
]

# Request a completion with a low sampling temperature (0.2).
response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=chat_messages,
    temperature=0.2,
)

# Show the text of the first returned choice.
first_choice = response.choices[0]
print(first_choice.message.content)

Query Searching with Reranking chunks

In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import numpy as np
import faiss

# Two-stage retrieval: the bi-encoder proposes candidates from the FAISS index,
# then the cross-encoder scores each (query, chunk) pair.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

query = "summarize this"

# Stage 1: nearest-neighbour search for ten candidate chunks.
query_embedding = embedding_model.encode([query])
D, I = index.search(np.array(query_embedding), k=10)

retrieved_chunks = []
for i in I[0]:
    retrieved_chunks.append(texts[i])

print("\n🔍 Top 10 Retrieved Chunks (Pre-Rerank):")
for i, chunk in enumerate(retrieved_chunks, 1):
    preview = chunk[:200]
    print(f"\n[{i}] {preview}...")

# Stage 2: score every candidate against the query.
rerank_inputs = [(query, chunk) for chunk in retrieved_chunks]
scores = reranker.predict(rerank_inputs)

# Highest score first; the key avoids comparing chunk payloads on ties.
scored_chunks = sorted(
    zip(scores, retrieved_chunks),
    key=lambda pair: pair[0],
    reverse=True,
)

print("\n📊 Reranked Chunks (with Scores):")
for i, (score, chunk) in enumerate(scored_chunks, 1):
    print(f"\n[{i}] Score: {score:.4f}")
    preview = chunk[:200]
    print(f"{preview}...")

# Keep the three best-scoring chunks as the prompt context.
top_chunks = [pair[1] for pair in scored_chunks[:3]]
context = "\n\n".join(top_chunks)

print("\n🧩 Final Chunks Used in Context:")
for i, chunk in enumerate(top_chunks, 1):
    preview = chunk[:200]
    print(f"\n[{i}] {preview}...")

rag_prompt = f"""
Use the context below to answer the question.

Context:
{context}

Question:
{query}

Answer:
"""

print("\n🧠 Final RAG Prompt:")
print(rag_prompt)


In [None]:
# Generation settings for the reranked prompt, gathered in one place.
completion_args = {
    "model": "llama-3.1-8b-instant",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant using provided context."},
        {"role": "user", "content": rag_prompt},
    ],
    "temperature": 0.2,
}

response = client.chat.completions.create(**completion_args)

# Print the text of the first returned choice.
answer_text = response.choices[0].message.content
print(answer_text)

Text splitting with metadata

RAG with text splitting and metadata

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings are named because the overlap is reused below when locating
# each chunk inside the source text.
chunk_size = 500
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = text_splitter.split_text(text)


def _locate_chunks(full_text, pieces, overlap):
    """Return a (start, end) character span in ``full_text`` for each piece.

    Pieces are assumed to be in document order. Each search starts just before
    the end of the previously located piece (allowing for ``overlap``), so text
    that repeats in the document maps to its own occurrence rather than always
    to the first one, as a plain ``full_text.find(piece)`` would.

    A piece that cannot be found at all is reported as ``(-1, -1)``.
    """
    spans = []
    previous_start = 0
    previous_len = 0
    for piece in pieces:
        search_from = max(0, previous_start + previous_len - overlap)
        start = full_text.find(piece, search_from)
        if start == -1:
            # Fall back to a whole-document search before giving up.
            start = full_text.find(piece)
        if start == -1:
            spans.append((-1, -1))
            continue  # keep the cursor where it was for the next piece
        spans.append((start, start + len(piece)))
        previous_start, previous_len = start, len(piece)
    return spans


chunk_spans = _locate_chunks(text, chunks, chunk_overlap)

# Wrap each chunk with metadata: its position in the chunk list, its character
# span in the source text, and the PDF it was extracted from.
texts = [
    {
        "id": i,
        "text": chunk,
        "metadata": {
            "chunk_id": i,
            "start_char": start_char,
            "end_char": end_char,
            "source": "Company-Policy-and-Procedure-June-1.18-V6.0.pdf"
        }
    }
    for i, (chunk, (start_char, end_char)) in enumerate(zip(chunks, chunk_spans))
]


Conversion of chunks with metadata to embeddings

In [None]:
# Each chunk is now a dict, so pull out its "text" field and pass only those
# strings to the encoder.
model = SentenceTransformer('all-MiniLM-L6-v2')
text_contents = [entry["text"] for entry in texts]
vectors = model.encode(text_contents)

# Save the new embeddings to vectors/embeddings.npy.
vector_folder = "vectors"
output_path = os.path.join(vector_folder, "embeddings.npy")
os.makedirs(vector_folder, exist_ok=True)
np.save(output_path, vectors)

Visualize chunks with metadata

In [None]:
# Print every chunk record (text plus metadata) and the word count of its text.
print("\n🔍 Sample Chunks and Their Lengths:\n")
separator = '-' * 40
for number, chunk in enumerate(texts, start=1):
    word_count = len(chunk['text'].split())
    print(f"Chunk {number}:")
    print(chunk)
    print(f"Token Length: {word_count} words\n{separator}")


Store Embeddings

In [None]:
import faiss

# Recreate the flat L2 index from the current embeddings; the index width is
# taken from the second axis of the embedding array.
dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(vectors))

# Store the index on disk at embeddings/faiss_index.index.
embedding_folder = "embeddings"
index_path = os.path.join(embedding_folder, "faiss_index.index")
os.makedirs(embedding_folder, exist_ok=True)
faiss.write_index(index, index_path)



Query searching with reranked chunks and metadata

In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import numpy as np
import faiss

# Two-stage retrieval over the metadata-wrapped chunks: the bi-encoder proposes
# candidates, then the cross-encoder scores each (query, chunk text) pair.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

query = "who makes the complains?"

# Stage 1: nearest-neighbour search for ten candidate chunk records.
query_embedding = embedding_model.encode([query])
D, I = index.search(np.array(query_embedding), k=10)

retrieved_chunks = []
for i in I[0]:
    retrieved_chunks.append(texts[i])

print("\n🔍 Top 10 Retrieved Chunks (Pre-Rerank):")
for i, chunk in enumerate(retrieved_chunks, 1):
    preview = chunk['text'][:200]  # records are dicts; show the text field only
    print(f"\n[{i}] {preview}...")

# Stage 2: the reranker sees only the text of each record, not its metadata.
rerank_inputs = [(query, chunk["text"]) for chunk in retrieved_chunks]
scores = reranker.predict(rerank_inputs)

# Highest score first; sorting by key avoids comparing the dict payloads.
scored_chunks = sorted(
    zip(scores, retrieved_chunks),
    key=lambda pair: pair[0],
    reverse=True,
)

print("\n📊 Reranked Chunks (with Scores):")
for i, (score, chunk) in enumerate(scored_chunks, 1):
    print(f"\n[{i}] Score: {score:.4f}")
    preview = chunk['text'][:200]
    print(f"{preview}...")

# Keep the text of the three best-scoring records as the prompt context.
top_chunks = [pair[1]["text"] for pair in scored_chunks[:3]]
context = "\n\n".join(top_chunks)

print("\n🧩 Final Chunks Used in Context:")
for i, chunk in enumerate(top_chunks, 1):
    preview = chunk[:200]
    print(f"\n[{i}] {preview}...")

rag_prompt = f"""
Use the context below to answer the question.

Context:
{context}

Question:
{query}

Answer:
"""

print("\n🧠 Final RAG Prompt:")
print(rag_prompt)


In [None]:
# Build the two-message conversation: a system instruction and the RAG prompt.
system_message = {"role": "system", "content": "You are a helpful assistant using provided context."}
user_message = {"role": "user", "content": rag_prompt}

# Request the completion with a sampling temperature of 0.2.
response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[system_message, user_message],
    temperature=0.2,
)

# Print the text of the first returned choice.
reply = response.choices[0].message
print(reply.content)