In [2]:
import os
import glob
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter # Optional: if you also want to see chunks

# Load environment variables from .env file (especially for consistency)
load_dotenv()
print("Attempted to load environment variables from .env")

# Define the name of your data directory
DATA_DIR_NAME = "Data"
# Assuming this notebook is in the project root, like kb.py
project_root = os.getcwd() # For a notebook, os.getcwd() usually gives the notebook's directory
absolute_data_dir = os.path.join(project_root, DATA_DIR_NAME)

print(f"Looking for PDF files in: {absolute_data_dir}")

pdf_files = glob.glob(os.path.join(absolute_data_dir, "*.pdf"))

if not pdf_files:
    print(f"No PDF files found in {absolute_data_dir}.")
else:
    print(f"Found PDF files: {pdf_files}\\n")

    for pdf_path in pdf_files:
        print(f"--- Processing PDF: {os.path.basename(pdf_path)} ---")
        try:
            loader = PyPDFLoader(pdf_path)
            # PyPDFLoader loads each page as a separate Document object
            pages = loader.load() 
            
            print(f"  Number of pages loaded: {len(pages)}\\n")
            
            for i, page_document in enumerate(pages):
                print(f"  --- Page {i + 1} (Original Page Num from PDF: {page_document.metadata.get('page', 'N/A')}) ---")
                print(f"  Source: {page_document.metadata.get('source', os.path.basename(pdf_path))}")
                
                # Print a snippet of the page content
                content_snippet = page_document.page_content[:2500].replace('\\n', ' ') # First 500 chars, newlines replaced for readability
                print(f"  Content Snippet:\\n  ```\\n  {content_snippet}...\n  ```\\n")

               

        except Exception as e:
            print(f"  Error processing PDF file {pdf_path}: {e}")
        print(f"--- Finished processing {os.path.basename(pdf_path)} ---\\n{'='*40}\\n")

print("PDF content inspection finished.")

Attempted to load environment variables from .env
Looking for PDF files in: s:\Math_Agent\Data
Found PDF files: ['s:\\Math_Agent\\Data\\MATHEMATICS.pdf']\n
--- Processing PDF: MATHEMATICS.pdf ---
  Number of pages loaded: 26\n
  --- Page 1 (Original Page Num from PDF: 0) ---
  Source: s:\Math_Agent\Data\MATHEMATICS.pdf
  Content Snippet:\n  ```\n  MATHEMATICS 
Question (1) 
If z is a complex number such that |z - 2 - 3i| + |z + 1 - i| = 4, then the locus of z is: 
Options: 
A) A circle 
B) An ellipse 
C) A hyperbola 
D) A line segment 
Mathematically Correct Solution (Based on the question as written): 
1. Rewrite the equation: |z - (2 + 3i)| + |z - (-1 + i)| = 4. 
2. This is in the form |z - z₁| + |z - z₂| = k, where: 
o z₁ = 2 + 3i (first focus) 
o z₂ = -1 + i (second focus) 
o k = 4 (the constant sum of distances) 
3. Calculate the distance between the foci, d = |z₁ - z₂|: 
d = |(2 + 3i) - (-1 + i)| = |(2 - (-1)) + (3 - 1)i| = |3 + 2i| 
d = √(3² + 2²) = √(9 + 4) = √13. 
4. Compare k

In [3]:
# Create a text splitter with the same parameters as in kb.py
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Process the first PDF file to demonstrate chunking
if pdf_files:
    pdf_path = pdf_files[0]
    print(f"\nChunking content from: {os.path.basename(pdf_path)}")
    
    try:
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        
        # Split the documents into chunks
        chunked_documents = text_splitter.split_documents(documents)
        
        # Print first 5 chunks
        print(f"\nFirst 5 chunks from {os.path.basename(pdf_path)}:")
        for i, chunk in enumerate(chunked_documents[:5]):
            print(f"\n--- Chunk {i + 1} ---")
            print(f"Source: {chunk.metadata.get('source', os.path.basename(pdf_path))}")
            print(f"Page: {chunk.metadata.get('page', 'N/A')}")
            print(f"Content:\n{chunk.page_content[:500]}...\n")
            print("-" * 40)
            
    except Exception as e:
        print(f"Error processing PDF file {pdf_path}: {e}")



Chunking content from: MATHEMATICS.pdf

First 5 chunks from MATHEMATICS.pdf:

--- Chunk 1 ---
Source: s:\Math_Agent\Data\MATHEMATICS.pdf
Page: 0
Content:
MATHEMATICS 
Question (1) 
If z is a complex number such that |z - 2 - 3i| + |z + 1 - i| = 4, then the locus of z is: 
Options: 
A) A circle 
B) An ellipse 
C) A hyperbola 
D) A line segment 
Mathematically Correct Solution (Based on the question as written): 
1. Rewrite the equation: |z - (2 + 3i)| + |z - (-1 + i)| = 4. 
2. This is in the form |z - z₁| + |z - z₂| = k, where: 
o z₁ = 2 + 3i (first focus) 
o z₂ = -1 + i (second focus) 
o k = 4 (the constant sum of distances) 
3. Calculate the dis...

----------------------------------------

--- Chunk 2 ---
Source: s:\Math_Agent\Data\MATHEMATICS.pdf
Page: 0
Content:
constant k which is greater than the distance between z₁ and z₂, the locus of z is an 
ellipse. 
Right Option (mathematically derived): B) An ellipse 
(Note: The "Right option D" initially stated in the prompt for this que

In [2]:
# test_pinecone.py
import os
from dotenv import load_dotenv
import pinecone

# Load environment variables
load_dotenv()

# Get environment variables
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

def test_pinecone_connection():
    """Test connection to Pinecone."""
    try:
        # Initialize Pinecone
        pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
        
        # List available indexes
        indexes = pc.list_indexes()
        print(f"Available indexes: {indexes}")
        
        # Connect to the index
        index = pc.Index(PINECONE_INDEX_NAME)
        
        # Get index stats
        stats = index.describe_index_stats()
        print(f"Index stats: {stats}")
        
        print("Successfully connected to Pinecone!")
        return True
    except Exception as e:
        print(f"Error connecting to Pinecone: {e}")
        return False

if __name__ == "__main__":
    print("Testing Pinecone connection...")
    test_pinecone_connection()


Testing Pinecone connection...
Error connecting to Pinecone: module 'pinecone' has no attribute 'Pinecone'
