### Read the files using langchain and store all the text in a single variable

In [None]:
import os

In [2]:
!pip install langchain langchain-community pdf2image pypdf python-docx



In [3]:
os.getcwd()

'd:\\Sonata_POC'

In [4]:
files_path = os.path.join(os.getcwd(), 'docs')
files_path

'd:\\Sonata_POC\\docs'

In [5]:
os.listdir(files_path)

['25939_Srikrupa_Cv-1.docx',
 '26107_Mahir Baig2.pdf',
 'design-document.md',
 'test-cases.md',
 'test-report.md']

In [6]:
for i in os.listdir(files_path):
    if i.endswith('.pdf'):
        print(i)
    elif i.endswith('.docx'):
        print(i)
    elif i.endswith('.txt'):
        print(i)


25939_Srikrupa_Cv-1.docx
26107_Mahir Baig2.pdf


In [7]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.document_loaders import TextLoader
from docx import Document
import os

# Function to load DOCX files
def load_docx_files(directory):
    """Load all DOCX files from directory"""
    documents = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.docx'):
                file_path = os.path.join(root, file)
                doc = Document(file_path)
                text = "\n".join([para.text for para in doc.paragraphs])
                documents.append({
                    'content': text,
                    'source': file_path
                })
    return documents

# Load all documents from the directory
pdf_loader = DirectoryLoader(files_path, glob="**/*.pdf", loader_cls=PyPDFLoader)
txt_loader = DirectoryLoader(files_path, glob="**/*.txt", loader_cls=TextLoader)

documents = []
documents.extend(pdf_loader.load())
documents.extend(txt_loader.load())
documents.extend(load_docx_files(files_path))

# Combine all text into a single variable
all_text = "\n\n".join([
    doc.page_content if hasattr(doc, 'page_content') else doc['content'] 
    for doc in documents
])

print(f"✓ Loaded {len(documents)} documents")
print(f"✓ Total text length: {len(all_text)} characters")

✓ Loaded 3 documents
✓ Total text length: 14124 characters


### Convert Single variables to vectors using embedding models

In [8]:
from sentence_transformers import SentenceTransformer

# Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert the text into vectors
text_embedding = model.encode(all_text)

print(f"✓ Embedding created")
print(f"✓ Embedding shape: {text_embedding.shape}")
print(f"✓ Vector dimensions: {len(text_embedding[0]) if isinstance(text_embedding, list) else text_embedding.shape[-1]}")

✓ Embedding created
✓ Embedding shape: (384,)
✓ Vector dimensions: 384


### Store in Chroma db

In [9]:
import chromadb
import hashlib

# Initialize Chroma client
client = chromadb.Client()

# Delete existing collection if it exists
try:
    client.delete_collection(name="document_embeddings")
except Exception:
    pass

# Create fresh collection
collection = client.get_or_create_collection(name="document_embeddings")

# Prepare document chunks and metadata
doc_contents = []
metadatas = []
ids = []

# Split documents into chunks (512 tokens with 20 token overlap for RAG)
chunk_size = 2000  # approximate characters
overlap = 400

for doc_idx, doc in enumerate(documents):
    # Get document content
    content = doc.page_content if hasattr(doc, 'page_content') else doc['content']
    source = doc.metadata.get('source') if hasattr(doc, 'metadata') else doc.get('source', f'doc_{doc_idx}')
    
    # Create chunks
    for chunk_idx in range(0, len(content), chunk_size - overlap):
        chunk = content[chunk_idx:chunk_idx + chunk_size]
        
        if len(chunk.strip()) > 50:  # Skip very small chunks
            chunk_id = f"{doc_idx}_chunk_{chunk_idx}"
            doc_contents.append(chunk)
            ids.append(chunk_id)
            metadatas.append({
                'source': str(source),
                'chunk_idx': chunk_idx,
                'doc_idx': doc_idx
            })

# Add documents with their embeddings to Chroma
if doc_contents:
    collection.add(
        ids=ids,
        documents=doc_contents,
        metadatas=metadatas,
        embeddings=[model.encode(doc).tolist() for doc in doc_contents]
    )
    print(f"✓ Stored {len(ids)} document chunks in Chroma DB")
    print(f"✓ Collection name: {collection.name}")
    print(f"✓ Collection count: {collection.count()}")
else:
    print("✗ No documents to store in Chroma DB")

✓ Stored 10 document chunks in Chroma DB
✓ Collection name: document_embeddings
✓ Collection count: 10


### search-cosine search or hybrid serach ---- chuncks retrive

In [10]:
# Function to search and retrieve chunks from ChromaDB
def search_documents(query, n_results=5):
    """
    Search for relevant document chunks using cosine similarity
    
    Args:
        query: Search query string
        n_results: Number of results to return
    
    Returns:
        Dictionary containing search results with documents and metadata
    """
    # Encode the query using the same model
    query_embedding = model.encode(query).tolist()
    
    # Search in the collection
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=["documents", "metadatas", "distances"]
    )
    
    return results

# Example usage
query = "What is the experience of mahir?"
search_results = search_documents(query, n_results=5)

# Display results
print(f"Query: {query}\n")
print(f"Found {len(search_results['documents'][0])} relevant chunks:\n")
for i, (doc, metadata, distance) in enumerate(zip(
    search_results['documents'][0], 
    search_results['metadatas'][0],
    search_results['distances'][0]
), 1):
    print(f"--- Result {i} ---")
    print(f"Source: {metadata['source']}")
    print(f"Similarity Score: {1 - distance:.4f}")
    print(f"Content: {doc[:300]}...\n")

Query: What is the experience of mahir?

Found 5 relevant chunks:

--- Result 1 ---
Source: d:\Sonata_POC\docs\26107_Mahir Baig2.pdf
Similarity Score: -0.6325
Content: Summary 
 
                                                       Mahir Baig 
                                                           GenAI Engineer  
 
Bangalore 9066340827 mahirbaig2@gmail.com 
I am a highly Accomplished Data Science professional with 4+ years of experience in predictive modeli...

--- Result 2 ---
Source: d:\Sonata_POC\docs\25939_Srikrupa_Cv-1.docx
Similarity Score: -0.6855
Content: Sri Krupa R
	GenAI Engineer	

Bangalore	7676200408	srikrupa738@gmail.com
Highly accomplished Data Science professional with over 4+ years of experience in solving complex challenges across diverse sectors. Strong expertise in predictive modeling, advanced analytics, and machine learning, with 2+ yea...

--- Result 3 ---
Source: d:\Sonata_POC\docs\26107_Mahir Baig2.pdf
Similarity Score: -0.7499
Content: ews for solar des

### use groq any llm to get final answer

In [None]:
from groq import Groq

# Initialize Groq client
groq_client = Groq(api_key="")

# Prepare the context from search results
context = "\n\n".join([
    doc for doc in search_results['documents'][0]
])

# Create the prompt with context and query
prompt = f"""Based on the following context, answer the question accurately and concisely.

Context:
{context}

Question: {query}

Answer:"""

# Get completion from Groq
completion = groq_client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[
        {
            "role": "user",
            "content": prompt
        }
    ],
    temperature=0.7,
    max_tokens=1024,
    top_p=1,
    stream=True
)

# Stream and print the response
print(f"\nAnswer to '{query}':\n")
for chunk in completion:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")


Answer to 'What is the experience of mahir?':

Mahir Baig has **4+ years of data‑science experience**—including **2+ years focused on Generative AI**.  He works as a **Digital Engineer at Sonata Software Ltd. (since December 2023)**, where he provides advanced technical support and builds Azure‑based LLM, RAG, and Agentic‑AI solutions.

In [1]:
1/0

ZeroDivisionError: division by zero

In [2]:
try:
    1/0
except Exception as e:
    print(f"An error occurred: {e}")

An error occurred: division by zero


In [3]:
from datetime import datetime
TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
TIMESTAMP

'2025-12-19_16-40-51'

In [4]:
import socket

In [5]:
socket.gethostname()

'mahir'

In [6]:
from src.config.constants import TIMESTAMP

In [7]:
class mahir:
    def __init__(self):
        print(socket.gethostname(), TIMESTAMP)

In [8]:
a = mahir()

mahir 2025-12-19_17-16-22


In [9]:
def addition(a, b):
    return a + b

In [10]:
addition(a=5, b=10)

15

In [11]:
addition(2, 3)

5

In [13]:
addition(2,4,5)

TypeError: addition() takes 2 positional arguments but 3 were given

In [4]:
from src.services.azure_blob_service import AzureBlobManager

In [5]:
sc = AzureBlobManager(connection_string='DefaultEndpointsProtocol=https;AccountName=poc123;AccountKey=rOkUqgjfGmAH9ViVfv5zuOV4U1rk3IlDKHxjj2unJ/uCz+dBKM7sAdsBBwk2EPdnkAM3Wi1hSMXs+AStUpE1hQ==;EndpointSuffix=core.windows.net')

In [4]:
sc.container_exists("accepted")

True

In [5]:
sc.container_exists('erwgtwh')

False

In [6]:
sc.list_blob_names_and_files("accepted")

({'WhatsApp Image 2025-12-04 at 9.58.59 AM.jpeg'},
 ['WhatsApp Image 2025-12-04 at 9.58.59 AM.jpeg'])

In [10]:
sc.list_blob_names_and_files('rejected')

(set(), [])

In [11]:
"fdbsnjksks is 'cdsdgdsbg'bjm,smk"

"fdbsnjksks is 'cdsdgdsbg'bjm,smk"

In [17]:
a :str = "dgdg"

if a:
    print("a is defined")
    print(type(a))
else:
    print("a is not defined")
    print(type(a))

a is defined
<class 'str'>


In [1]:
import os
os.getcwd()

'd:\\Sonata_POC'

In [2]:
from src.components.ingest_files import ingest_files_from_azure_blob
import os
a = ingest_files_from_azure_blob("accepted", 'blob')
a

'blob'

In [3]:
ingest_files_from_azure_blob("accepted", os.getcwd(),'a656b7a74dbaf2_Mahir_resume.docx')

In [None]:
os.path.join(os.getcwd(), '25939_Srikrupa_Cv-1.docx')

'd:\\Sonata_POC\\25939_Srikrupa_Cv-1.docx'

In [17]:
from src.services.azure_blob_service import AzureBlobManager

m = AzureBlobManager()
file_path = r'25939_Srikrupa_Cv-1.docx'
m.upload_files("accepted", os.path.join(os.getcwd(), 'blob'),"test")

In [8]:
file_extension = 'd:\\Sonata_POC\\blob' 
file_extension = file_extension.lower()
file_extension

'd:\\sonata_poc\\blob'

In [9]:
file_dir = os.path.basename(file_extension)
file_dir

'blob'

In [10]:
a=os.listdir(file_dir)[-1]
a

'WhatsApp Image 2025-12-04 at 9.58.59 AM.jpeg'

In [12]:
if a.endswith('.docx'):
    print('it ends with docx')

it ends with docx


In [19]:
_, file_extension = os.path.splitext('d:\\Sonata_POC\\blob')

In [20]:
file_extension

''

In [21]:
_

'd:\\Sonata_POC\\blob'

In [4]:
from src.components.extractor import FileExtractor
import os

In [5]:
files = FileExtractor(file_path=r'd:\Sonata_POC\blob')

In [6]:
files.extract_pdf(filename='26107_Mahir Baig.pdf',file_dir=r'd:\Sonata_POC\blob',text_dir=os.getcwd())

"--- Page 1 --- Summary Mahir Baig GenAI Engineer Bangalore 9066340827 mahirbaig2@gmail.com I am a highly Accomplished Data Science professional with 4+ years of experience in predictive modeling, advanced analytics, and machine learning, including 2+ years specializing in Generative AI. Expert in developing and deploying LLM-based solutions, prompt engineering, fine-tuning, and multimodal GenAI applications. Skilled in end-to-end data workflows, from analysis and feature engineering to model development and evaluation, and adept at leveraging GenAI to automate processes, enhance business efficiency, and deliver innovative AI-driven solutions.\nK E Y S K I L L S — E X P E R I E N C E Technical Skills.\nPython Generative AI REST API Azure Cognitive services AI Foundry Machine Learning Git & Git Hub Azure (DP100) Analytical Skills Workflow Automation using AI/GenAI Data Visualization Statistical Analysis and Modeling 12/2023 – Present Digital Engineer, Sonata Software Ltd.\n• Served as a