In [1]:
# 📦 Step 1: Import libraries and load API key from .env
import os
from dotenv import load_dotenv

# Pull variables from the local .env file, then read the key from the environment.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Ensure your key is loaded. A truthiness check (rather than `is not None`)
# also rejects an empty value such as a bare `OPENAI_API_KEY=` line, which
# would otherwise slip through this check.
assert OPENAI_API_KEY, "OPENAI_API_KEY not found in .env"


In [3]:
# 📘 Step 2: Load PDF document
from langchain_community.document_loaders import PyMuPDFLoader

# Point this at your own PDF (keep the file in the same folder as the notebook).
pdf_path = "cricketRules.pdf"
loader = PyMuPDFLoader(pdf_path)

# Load the file; the summary line below reports one entry per page.
documents = loader.load()
page_count = len(documents)
print(f"Loaded {page_count} pages")


Loaded 79 pages


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, measured in characters.
CHUNK_SIZE = 500     # Max characters per chunk
CHUNK_OVERLAP = 50   # Overlap between neighbouring chunks to preserve context

# Create a text splitter that splits based on characters and overlaps slightly
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the document
chunks = splitter.split_documents(documents)

# Stop with a readable message when the split produced nothing (for example a
# PDF whose pages carry no extractable text); indexing chunks[0] below would
# otherwise raise a bare IndexError.
if not chunks:
    raise ValueError("No text chunks were produced from the loaded document")

# Print number of chunks and preview first one
print(f"✅ Document split into {len(chunks)} chunks")
print("🔍 First chunk preview:\n", chunks[0].page_content)


✅ Document split into 527 chunks
🔍 First chunk preview:
 Laws of Cricket 2017 Code (3rd Edition - 2022) 
1 
 
 
 
 
 
 
THE LAWS OF CRICKET 2017 CODE (3rd Edition - 2022) 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
© Marylebone Cricket Club


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# `splitter` and `chunks` already exist from the splitting cell above, which
# uses this exact configuration on the same `documents`. Building a second
# splitter and splitting again would only repeat that work, so this cell
# checks the existing result and shows the same summary.
if not chunks:
    raise ValueError("No text chunks were produced from the loaded document")

# Print number of chunks and preview first one
print(f"✅ Document split into {len(chunks)} chunks")
print("🔍 First chunk preview:\n", chunks[0].page_content)


✅ Document split into 527 chunks
🔍 First chunk preview:
 Laws of Cricket 2017 Code (3rd Edition - 2022) 
1 
 
 
 
 
 
 
THE LAWS OF CRICKET 2017 CODE (3rd Edition - 2022) 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
© Marylebone Cricket Club


In [5]:
# Import from langchain_community, the package this notebook already uses for
# the PDF loader; the `langchain.embeddings` / `langchain.vectorstores` paths
# are deprecated re-exports of the same classes.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import os
from dotenv import load_dotenv

# NOTE(review): nothing in this cell reads the environment directly; the call is
# kept in case .env carries settings the embedding backend picks up — confirm.
load_dotenv()

# Settings a reader may want to tune.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
FAISS_INDEX_DIR = "faiss_index"

# Guard against an empty chunk list so the problem is reported here, with a
# clear message, before any embedding work starts.
if not chunks:
    raise ValueError("No chunks to index: the document split produced no text")

# ✅ Use free local embedding model
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# ✅ Recreate the vector store from chunks
vectorstore = FAISS.from_documents(chunks, embeddings)

# ✅ Save it locally
vectorstore.save_local(FAISS_INDEX_DIR)

print("✅ FAISS vector store created and saved using HuggingFaceEmbeddings.")


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm



✅ FAISS vector store created and saved using HuggingFaceEmbeddings.


In [6]:
import os
import openai
from dotenv import load_dotenv

# Load API key
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Truthiness check so an empty value is rejected as well as a missing one.
assert api_key, "OPENAI_API_KEY not found in .env"

# Initialize client (v1+ syntax)
client = openai.OpenAI(api_key=api_key)

# Retrieval and generation settings
TOP_K = 3                      # Number of chunks retrieved as context
CHAT_MODEL = "gpt-3.5-turbo"   # Chat model used to phrase the answer

# Your FAISS search step: fetch the TOP_K chunks most similar to the question
# and join their text, separated by blank lines, into one context string.
question = "How do we say that we win the game?"
docs = vectorstore.similarity_search(question, k=TOP_K)
context = "\n\n".join(doc.page_content for doc in docs)

# Chat call (new OpenAI API format)
# temperature=0 requests the least random sampling, so re-running the notebook
# yields a stable answer instead of a freshly sampled one each time.
response = client.chat.completions.create(
    model=CHAT_MODEL,
    temperature=0,
    messages=[
        {"role": "system", "content": "You are an assistant that answers questions based only on the provided context. Do not use any outside information."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}
    ]
)

# Print response
print("📄 Answer from PDF:\n")
print(response.choices[0].message.content)


📄 Answer from PDF:

If the side batting last wins the match without losing all its wickets, the result shall be stated as a win by the number of wickets still then to fall.
