In [2]:
import os
import re
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader

load_dotenv()

def load_pdf_document(file_path):
    """Load PDF document using PyPDFLoader"""
    loader = PyPDFLoader(file_path)
    
    
    pages = loader.load()
    
    print(f"Loaded {len(pages)} pages from {file_path}")
    print(f"First page metadata: {pages[0].metadata}")
    print(f"First page content preview: {pages[0].page_content[:200]}...")
    
    return pages

pdf_path = "C:/Users/mehul/Documents/CP/Langchain Practice/data/AI Training Document.pdf"
documents = load_pdf_document(pdf_path)


Loaded 20 pages from C:/Users/mehul/Documents/CP/Langchain Practice/data/AI Training Document.pdf
First page metadata: {'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-06-18T14:59:12+05:30', 'author': 'Shivani Gupta', 'moddate': '2025-06-18T14:59:12+05:30', 'source': 'C:/Users/mehul/Documents/CP/Langchain Practice/data/AI Training Document.pdf', 'total_pages': 20, 'page': 0, 'page_label': '1'}
First page content preview: User Agreement 
1. Introduction 
This User Agreement, the Mobile Application Terms of Use, and all policies and additional terms 
posted on and in our sites, applications, tools, and services (collect...


In [3]:
def merge_and_clean_text(documents):
    """Merge all pages into a single text and remove all newline characters"""
    # Combine all page content
    combined_text = " ".join([doc.page_content for doc in documents])
    
    # Remove ALL newline characters and replace with spaces
    cleaned_text = combined_text.replace('\n', ' ')
    
    # Remove multiple spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    
    # Clean up spacing around punctuation
    cleaned_text = re.sub(r'\s+([.!?])', r'\1', cleaned_text)  # Remove spaces before punctuation
    cleaned_text = re.sub(r'([.!?])\s+([A-Z])', r'\1 \2', cleaned_text)  # Fix sentence spacing
    
    return cleaned_text.strip()

final_text = merge_and_clean_text(documents)

In [7]:
with open("documents.txt", "w", encoding="utf-8") as f:
    f.write(final_text)


In [4]:
import pickle
# Chunk the documents into 100–300 word segments using sentence-aware splitting
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=lambda x: len(x.split())
)
chunked_documents = text_splitter.split_text(final_text)
with open("C:\\Users\\mehul\\Documents\\CP\\Langchain Practice\\chunks\\chunked_documents", "wb") as f:
    pickle.dump(chunked_documents, f)



In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the all-MiniLM-L6-v2 model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print(f"Model loaded successfully!")
print(f"Embedding dimension: {embedding_model.get_sentence_embedding_dimension()}")  # Should be 384


Collecting sentence_transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
Collecting torch>=1.11.0 (from sentence_transformers)
  Using cached torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting scikit-learn (from sentence_transformers)
  Using cached scikit_learn-1.7.0-cp312-cp312-win_amd64.whl.metadata (14 kB)
Collecting scipy (from sentence_transformers)
  Downloading scipy-1.16.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting huggingface-hub>=0.20.0 (from sentence_transformers)
  Downloading huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting filelock (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Downloading 

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loaded successfully!
Embedding dimension: 384


In [11]:
def generate_embeddings_minilm(chunked_documents):
    """Generate embeddings using all-MiniLM-L6-v2"""
    
    # Generate embeddings
    print(f"Generating embeddings for {len(chunked_documents)} chunks...")
    embeddings = embedding_model.encode(
        chunked_documents,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True  # Normalize for cosine similarity
    )
    
    print(f"Generated embeddings shape: {embeddings.shape}")
    return embeddings

# Generate embeddings from your chunked_documents
embeddings = generate_embeddings_minilm(chunked_documents)


Generating embeddings for 55 chunks...


Batches: 100%|██████████| 2/2 [00:01<00:00,  1.28it/s]

Generated embeddings shape: (55, 384)





In [12]:
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Configuration
index_name = "amlgo-chatbot-embeddings"
dimension = 384 

# Create index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
    print(f"Created new index: {index_name}")
else:
    print(f"Using existing index: {index_name}")

# Connect to the index
index = pc.Index(index_name)


Created new index: amlgo-chatbot-embeddings


In [21]:
def store_embeddings_in_pinecone(embeddings, index):
    for i, embedding in enumerate(embeddings):
        print(f"Storing embedding for chunk {i}, {embedding}")
        index.upsert(
            vectors=[
                {
                    "id": f"doc_{i}",
                    "values": embedding,
                    "metadata": {"text": chunked_documents[i]}
                }
            ]
        )

store_embeddings_in_pinecone(embeddings, index)

Storing embedding for chunk 0, [-4.65473607e-02  8.07212852e-03  2.78511904e-02 -1.05174169e-01
 -3.49911134e-04  6.79869205e-02  3.94692272e-03  1.67666655e-02
 -9.34441853e-03 -2.66741458e-02  6.82764426e-02  2.81989817e-02
  5.19565269e-02 -1.03031863e-02  7.79542178e-02 -7.49841426e-03
  6.43978938e-02 -4.95616980e-02 -3.89309525e-02 -2.49494687e-02
  5.36335781e-02 -1.99313518e-02 -6.88968226e-02 -9.95762553e-03
 -2.17209309e-02 -5.56986295e-02 -5.34617044e-02  9.72388759e-02
 -8.46598018e-03 -3.87397851e-03 -1.03830220e-02  1.20841702e-04
  2.12700292e-02  7.78680593e-02 -2.57834196e-02 -1.13462895e-01
 -9.11997408e-02 -9.92905051e-02 -8.13356489e-02 -9.37681869e-02
 -2.10493878e-02  1.63168665e-02 -1.06240362e-01  7.18470737e-02
  5.77324331e-02  1.06583433e-02  1.43119060e-02  6.27528802e-02
 -9.07701813e-03  5.07266372e-02  1.27981335e-01  4.80152341e-03
  3.81643847e-02 -5.62316459e-03 -2.52860803e-02 -4.68052253e-02
  3.58944573e-02  4.29847948e-02  7.53863901e-02 -2.5880703

In [22]:
# Check index statistics
stats = index.describe_index_stats()
print(f"Index statistics:")
print(f"Total vectors: {stats['total_vector_count']}")
print(f"Dimension: {stats['dimension']}")
print(f"Index fullness: {stats['index_fullness']}")


Index statistics:
Total vectors: 55
Dimension: 384
Index fullness: 0.0


In [24]:
def test_similarity_search(query_text, top_k=5):
    """Test similarity search with a query"""
    
    # Generate embedding for the query
    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True, normalize_embeddings=True)
    
    # Search in Pinecone
    results = index.query(
        vector=query_embedding[0].tolist(),
        top_k=top_k,
        include_metadata=True
    )
    
    print(f"Query: '{query_text}'")
    print(f"Found {len(results['matches'])} similar chunks:\n")
    
    for i, match in enumerate(results['matches'], 1):
        print(f"--- Result {i} (Score: {match['score']:.4f}) ---")
        print(f"Text: {match['metadata']['text'][:200]}...")
        print()
    
    return results

# Test the search
test_results = test_similarity_search("What is the main topic of this document?")
print(test_results)

Query: 'What is the main topic of this document?'
Found 5 similar chunks:

--- Result 1 (Score: 0.1430) ---
Text: Demands); (2) appoint one arbitrator for each batch; and (3) provide for the resolution of each batch on a consolidated basis with one set of filing and administrative fees due per batch, one procedur...

--- Result 2 (Score: 0.1383) ---
Text: below) shall select the administrator. A party who wishes to initiate arbitration must provide the other party with a demand for arbitration (the "Demand"). A Demand to eBay should be sent by mail to ...

--- Result 3 (Score: 0.0985) ---
Text: own motion or a party’s, and after affording a reasonable opportunity to respond, an arbitrator determines that a party who commenced arbitration did not bring its claim(s) consistent with counsel’s C...

--- Result 4 (Score: 0.0920) ---
Text: This section does not prevent you or eBay from participating in a class-wide settlement of claims. 4. Arbitration Procedures Arbitration is more informal 