In [None]:
!pip install --quiet langchain langchain-community langchain-google-genai faiss-cpu google-generativeai python-dotenv PyPDF2 docx2txt pandas openpyxl pillow

In [None]:
!pip install easyocr

In [1]:
import os
from dotenv import load_dotenv
import google.generativeai as genai

# Load environment variables from the .env file
load_dotenv()

# Read the key from the environment so it is never hardcoded in the notebook
api_key = os.getenv("GOOGLE_API_KEY")

if not api_key:
    raise ValueError("❌ GOOGLE_API_KEY not found. Please check your .env file.")

# Confirm the key was found WITHOUT echoing any part of it: cell output is stored
# in the notebook file, so even a key prefix would be shared with the notebook.
print("✅ Google API key loaded: ********")

# Configure Google Generative AI client
genai.configure(api_key=api_key)

# Quick test to verify API connection.
# Best-effort: a failure is reported but not re-raised, so the remaining cells
# can still be run.
try:
    test_model = genai.GenerativeModel("gemini-2.0-flash")
    response = test_model.generate_content("Test connection to Gemini API.")
    print("✅ Gemini connection successful! Model responded.")
except Exception as e:
    print("❌ Gemini API connection failed:", e)

✅ Google API key loaded: ********
✅ Gemini connection successful! Model responded.


In [2]:
import os
import easyocr

# LangChain loaders
from langchain_community.document_loaders import (
    TextLoader, PyPDFLoader, JSONLoader, Docx2txtLoader, CSVLoader, UnstructuredExcelLoader
)

# Compatible import for Document
try:
    from langchain_core.documents import Document
except ImportError:
    from langchain.schema import Document

# EasyOCR reader used for image files: English only, running on CPU.
# Extend the language list to recognise other languages.
ocr_reader = easyocr.Reader(["en"], gpu=False)

# Directory that holds the files to ingest; stop right away if it is missing.
input_folder = "input"
if not os.path.exists(input_folder):
    raise FileNotFoundError(f"❌ Folder not found: {input_folder}")

# File extensions the loading loop knows how to handle (compared lower-cased).
supported_exts = [
    ".txt", ".md",             # plain text / markdown
    ".pdf",
    ".docx",
    ".json",
    ".csv",
    ".xlsx", ".xls",           # spreadsheets
    ".jpeg", ".jpg", ".png",   # images (OCR)
]

# Accumulates every Document produced by the loading loop.
all_docs = list()

# Image -> text helper backed by the shared EasyOCR reader
def load_image_text(image_path):
    """Run OCR on one image and return the recognised text.

    Args:
        image_path: Path of the image file handed to ``ocr_reader.readtext``.

    Returns:
        The recognised text fragments joined with newlines and stripped of
        surrounding whitespace, or an empty string when OCR raises.
    """
    try:
        # detail=0: the results are joined directly as strings below.
        fragments = ocr_reader.readtext(image_path, detail=0)
        return "\n".join(fragments).strip()
    except Exception as err:
        # Best-effort: report the failure and let the caller skip this image.
        print(f"⚠️ EasyOCR failed for {image_path}: {err}")
    return ""

# Needed only by the JSON fallback below; imported once here instead of inside
# the loop's except branch.
import json

# Iterate through files in input folder.
# sorted() makes the load order reproducible between runs: os.listdir() returns
# entries in arbitrary order, which would otherwise change the order of
# all_docs (and of everything derived from it) from run to run.
for filename in sorted(os.listdir(input_folder)):
    filepath = os.path.join(input_folder, filename)
    ext = os.path.splitext(filename)[-1].lower()

    if ext not in supported_exts:
        print(f"⏩ Skipping unsupported file: {filename}")
        continue

    print(f"📄 Loading: {filepath}")
    # Any failure while loading one file is reported and the loop moves on, so a
    # single bad file does not abort the whole ingest.
    try:
        if ext in [".txt", ".md"]:
            # NOTE(review): no encoding is passed, so TextLoader presumably falls
            # back to the platform default — confirm the input files match it.
            loader = TextLoader(filepath)
            docs = loader.load()
        elif ext == ".pdf":
            loader = PyPDFLoader(filepath)
            docs = loader.load()
        elif ext == ".docx":
            loader = Docx2txtLoader(filepath)
            docs = loader.load()
        elif ext == ".json":
            try:
                loader = JSONLoader(filepath, jq_schema=".", text_content=False)
                docs = loader.load()
            except Exception as json_err:
                # Say why the fallback is used instead of swallowing the error.
                print(f"ℹ️ JSONLoader failed for {filename} ({json_err}); falling back to json.load")
                with open(filepath, "r", encoding="utf-8") as f:
                    data = json.load(f)
                # The whole parsed payload becomes one document.
                docs = [Document(page_content=str(data), metadata={"source": filename})]
        elif ext == ".csv":
            loader = CSVLoader(filepath)
            docs = loader.load()
        elif ext in [".xlsx", ".xls"]:
            loader = UnstructuredExcelLoader(filepath)
            docs = loader.load()
        elif ext in [".jpeg", ".jpg", ".png"]:
            # Images yield a document only when OCR produced some text.
            text = load_image_text(filepath)
            docs = [Document(page_content=text, metadata={"source": filename})] if text else []
        else:
            # Only reachable if supported_exts gains an extension that has no
            # branch above; such files contribute nothing.
            docs = []

        all_docs.extend(docs)
    except Exception as e:
        print(f"⚠️ Failed to load {filename}: {e}")

print(f"\n✅ Total documents loaded: {len(all_docs)}")

Using CPU. Note: This module is much faster with a GPU.


📄 Loading: input\20251014-174708_Top AC Repa.csv
📄 Loading: input\AI and Plag Report_chunks.md
📄 Loading: input\Embeddings and Vector Search.jpeg


: 

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure the splitter: chunks of about 1000 characters (measured with len),
# with 200 characters of overlap so context carries across chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=200,
    chunk_size=1000,
)

# Break every loaded document into chunks
chunks = text_splitter.split_documents(all_docs)

print(f"✅ Chunks created: {len(chunks)} from {len(all_docs)} source documents.\n")

# Show the first 500 characters of the first chunk, if there is one
if not chunks:
    print("No chunks created.")
else:
    print("Example chunk preview:\n")
    print(chunks[0].page_content[:500] + "...")

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Fail early with a clear message when there is nothing to index. The splitting
# step can legitimately produce zero chunks (it prints "No chunks created."),
# and building an index from an empty list would otherwise surface as an opaque
# error from inside the vector-store wrapper.
if not chunks:
    raise ValueError("❌ No chunks to index. Load documents and create chunks before building the vector store.")

# Local embedding model (no API required)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build FAISS index by embedding every chunk with the model above
vectorstore = FAISS.from_documents(chunks, embedding=embedding_model)

# Create retriever: similarity search with k=4
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

print("✅ FAISS vector store created successfully using HuggingFace embeddings.")
