In [None]:
import re
import logging
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

from pdfminer.high_level import extract_text as pdfminer_extract_text
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")



In [None]:

# Initialize the embedding model and Qdrant client
model = SentenceTransformer('all-MiniLM-L6-v2')
vector_dim = model.get_sentence_embedding_dimension()
client = QdrantClient(host='localhost', port=6333)

collection_name = "Olabs_books"

# Start every run from an empty collection. `recreate_collection` is deprecated
# in qdrant-client, so drop the old collection (if any) and create it explicitly.
# WARNING: this deletes all points previously stored in `collection_name`.
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_dim, distance="Cosine"),
)

# Define regex patterns and sets for metadata extraction
# `class_pattern` matches folder names such as "Class 9" / "Class10" (case-sensitive).
class_pattern = re.compile(r"Class\s*\d+")
# Folder names recognised as document types (compared case-insensitively by extract_metadata).
doc_types = {"activities", "Lab Manual", "Projects",}
# Substrings that mark a path component as the subject (compared against the lowercased component).
subject_keywords = {"science", "maths", "biology", "chemistry", "physics"}



2025-02-28 15:34:26,817 [INFO] Use pytorch device_name: cuda
2025-02-28 15:34:26,818 [INFO] Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-02-28 15:34:32,728 [INFO] HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"
  client.recreate_collection(
2025-02-28 15:34:32,748 [INFO] HTTP Request: DELETE http://localhost:6333/collections/Olabs_final "HTTP/1.1 200 OK"
2025-02-28 15:34:33,192 [INFO] HTTP Request: PUT http://localhost:6333/collections/Olabs_final "HTTP/1.1 200 OK"


In [None]:

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Return the text of ``pdf_path`` with every whitespace run collapsed to one space.

    Extraction is delegated to pdfminer.six. The function never raises: when
    pdfminer yields nothing a warning is logged, when anything in the process
    raises the error is logged, and in both cases an empty string is returned.
    """
    try:
        raw_text = pdfminer_extract_text(str(pdf_path))
        if not raw_text:
            logging.warning(f"No text found in {pdf_path}")
            return ""
        # split() with no arguments drops newlines, tabs and repeated spaces.
        words = raw_text.split()
        return " ".join(words)
    except Exception as e:
        logging.error(f"Error extracting text from {pdf_path}: {e}")
        return ""

def chunk_text(text: str, chunk_size: int = 200, overlap: int = 50) -> list:
    """
    Split text into overlapping chunks of whitespace-separated words.

    Each chunk holds at most ``chunk_size`` words and consecutive chunks share
    ``overlap`` words (defaults: 200 words with an overlap of 50 words).

    Args:
        text: Text to split; words are delimited by any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words repeated at the start of the next chunk.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings (words re-joined with single spaces); empty when
        ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``. An overlap >= chunk_size would make the window
            stand still or move backwards and never terminate.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a window reaches the last word, any further window would only
        # repeat words already contained in this chunk, so stop here.
        if end >= len(words):
            break
    return chunks

def extract_metadata(file_path: Path) -> dict:
    """
    Derive 'class', 'doc_type' and 'subject' metadata from the components of a path.

    Every component of ``file_path.parts`` (folders and the file name itself) is
    inspected in order, and for each field the first matching component wins:

    - 'class': the text matched by ``class_pattern`` inside the component.
    - 'doc_type': the component, if it equals an entry of ``doc_types`` ignoring case.
    - 'subject': the component, if its lowercase form contains any of ``subject_keywords``.

    Fields without a match stay ``None``.
    """
    known_doc_types = {name.lower() for name in doc_types}
    found_class = None
    found_doc_type = None
    found_subject = None

    for component in file_path.parts:
        lowered = component.lower()

        if found_class is None:
            class_match = class_pattern.search(component)
            if class_match:
                found_class = class_match.group().strip()

        if found_doc_type is None and lowered in known_doc_types:
            found_doc_type = component

        # Substring test: "Physics Lab" counts as a subject because it contains "physics".
        if found_subject is None and any(word in lowered for word in subject_keywords):
            found_subject = component

    return {"class": found_class, "doc_type": found_doc_type, "subject": found_subject}

def process_pdf(pdf_file: Path, id_num: int, chunk_size: int = 200, overlap: int = 50) -> list:
    """
    Turn one PDF into a list of Qdrant points, one per text chunk.

    The PDF text is extracted, split into overlapping word chunks, embedded with
    the module-level ``model`` and combined with path-derived metadata.

    Args:
        pdf_file: Path of the PDF to process.
        id_num: ID given to the first point; following chunks get consecutive IDs.
        chunk_size: Maximum number of words per chunk (forwarded to ``chunk_text``).
        overlap: Number of words shared by consecutive chunks (forwarded to ``chunk_text``).

    Returns:
        List of ``PointStruct`` objects. Each payload holds the metadata from
        ``extract_metadata`` plus 'file_name', 'pdf_path', 'chunk_index' and
        'chunk_text'. The list is empty when no text could be extracted.
    """
    text = extract_text_from_pdf(pdf_file)
    if not text:
        logging.warning(f"Skipping {pdf_file} due to no extractable text.")
        return []

    chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
    if not chunks:
        return []

    metadata = extract_metadata(pdf_file)
    metadata["file_name"] = pdf_file.name
    metadata["pdf_path"] = str(pdf_file)

    # Encode every chunk of the file in a single call so the model can batch
    # them, instead of paying the per-call overhead once per chunk.
    embeddings = model.encode(chunks).tolist()

    return [
        PointStruct(
            id=id_num + idx,
            vector=embedding,
            # Fresh dict per point; 'chunk_text' stores the full chunk text.
            payload={**metadata, "chunk_index": idx, "chunk_text": chunk},
        )
        for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]

def process_dataset(root_dir: Path, max_workers: int = 4, batch_size: int = 50):
    """
    Traverse the dataset directory, process PDFs concurrently,
    upsert the resulting points into Qdrant in batches, and log progress.

    Args:
        root_dir: Directory searched recursively for ``*.pdf`` files.
        max_workers: Number of worker threads running ``process_pdf``.
        batch_size: Maximum number of points sent per ``client.upsert`` call.

    Point IDs are assigned here, sequentially from 1 across the whole dataset,
    in the order files finish processing. They must be globally unique because
    Qdrant's upsert overwrites an existing point that has the same ID.
    """
    all_points = []
    next_id = 1

    pdf_files = list(root_dir.rglob("*.pdf"))
    total_files = len(pdf_files)
    logging.info(f"Found {total_files} PDF files to process.")

    processed_count = 0
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # The number of chunks per file is unknown until the file is processed,
        # so a per-file starting ID cannot be reserved up front (starting file
        # N at ID N makes multi-chunk files collide). Pass a placeholder and
        # assign the real IDs below, as results arrive.
        futures = {
            executor.submit(process_pdf, pdf_file, 0): pdf_file
            for pdf_file in pdf_files
        }
        for future in as_completed(futures):
            processed_count += 1
            points = future.result()
            if points:
                for point in points:
                    point.id = next_id
                    next_id += 1
                all_points.extend(points)
            else:
                logging.info(f"File skipped: {futures[future]}")
            logging.info(f"Processed {processed_count} of {total_files} PDFs.")

    logging.info(f"Successfully processed {len(all_points)} chunks from PDFs.")

    # Upsert points into Qdrant in batches
    for i in range(0, len(all_points), batch_size):
        batch = all_points[i:i+batch_size]
        client.upsert(collection_name=collection_name, points=batch)
        logging.info(f"Upserted batch {(i // batch_size) + 1} with {len(batch)} points.")



In [None]:

if __name__ == "__main__":
    import os

    # Dataset root directory. Set the OLABS_DATASET_ROOT environment variable to
    # point at your own copy; otherwise the original default location is used.
    dataset_root = Path(os.environ.get("OLABS_DATASET_ROOT", r"N:\CAI\hackathon\olabs\dataset"))

    # Fail loudly on a wrong path instead of silently ingesting zero PDFs.
    if not dataset_root.is_dir():
        raise FileNotFoundError(f"Dataset root directory not found: {dataset_root}")

    process_dataset(dataset_root)