In [1]:
def split_into_segments(content, segment_size=500, overlap_chars=50):
    """
    Divides a given text into smaller segments with an option for overlapping.
    (Notebook counterpart of segment_handler.py.)

    Consecutive segments start `segment_size - overlap_chars` characters apart,
    so each segment repeats the last `overlap_chars` characters of the previous
    one. The final segment may be shorter than `segment_size`.

    Args:
        content (str): The text content to be divided.
        segment_size (int): Maximum length of each segment. Must be positive.
        overlap_chars (int): Number of characters overlapping between segments.
            Must satisfy 0 <= overlap_chars < segment_size.

    Returns:
        list: A collection of text segments (empty when `content` is empty).

    Raises:
        ValueError: If `segment_size` is not positive, or if `overlap_chars` is
            negative or not smaller than `segment_size`.
    """
    if segment_size <= 0:
        raise ValueError("segment_size must be a positive integer.")
    # A step of zero or less would never advance through the text (the loop this
    # replaces would spin forever), and a negative overlap would silently drop text.
    if not 0 <= overlap_chars < segment_size:
        raise ValueError("overlap_chars must satisfy 0 <= overlap_chars < segment_size.")

    step = segment_size - overlap_chars
    # Slicing clamps at the end of the string, so no explicit min() is required.
    return [content[start:start + segment_size] for start in range(0, len(content), step)]


In [3]:
from flask import Flask, request, jsonify
import os

# file_handler.py
# Directory (relative to the current working directory) that uploaded files are written to.
UPLOAD_DIR = 'data/files'
# Create the upload directory when this cell runs; exist_ok=True makes re-running the cell harmless.
os.makedirs(UPLOAD_DIR, exist_ok=True)

# Flask application object; the upload route below registers itself on it via @app.route.
app = Flask(__name__)

@app.route('/file-upload', methods=['POST'])
def handle_file_upload():
    """
    Handles file uploads via POST requests.

    Expects a multipart form field named 'upload'. The client-supplied filename
    is untrusted, so it is reduced to its final path component before being
    joined onto UPLOAD_DIR; this keeps names such as '../../x.pdf' from writing
    outside the upload directory. The '.pdf' extension check is
    case-insensitive. Only the extension is validated; the file contents are
    not inspected.

    Returns:
        tuple: (JSON response, HTTP status code). 200 with a success message and
        the saved path, or 400 with an error message.
    """
    if 'upload' not in request.files:
        return jsonify({"error": "Missing file in the request."}), 400

    uploaded_file = request.files['upload']

    # The filename may be missing entirely; treat that the same as an empty one.
    raw_name = uploaded_file.filename or ''
    # Normalise Windows separators, then drop every directory component.
    safe_name = os.path.basename(raw_name.replace('\\', '/'))
    if safe_name == '':
        return jsonify({"error": "No file selected for upload."}), 400

    if not safe_name.lower().endswith('.pdf'):
        return jsonify({"error": "Unsupported file type. Only PDF files are accepted."}), 400

    saved_path = os.path.join(UPLOAD_DIR, safe_name)
    uploaded_file.save(saved_path)
    return jsonify({"message": "File successfully uploaded.", "path": saved_path}), 200


In [4]:
!pip install PyPDF2
# text_extractor.py
from PyPDF2 import PdfReader

def read_pdf_content(pdf_path):
    """
    Extract the text of every page in a PDF and return it as a single string.

    Args:
        pdf_path (str): File path to the PDF document.

    Returns:
        str: The text of all pages, concatenated in page order with no
        separator inserted between pages.
    """
    document_pages = PdfReader(pdf_path).pages
    # Each page contributes whatever its extract_text() call returns.
    return "".join(page.extract_text() for page in document_pages)


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/232.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [5]:
from sentence_transformers import SentenceTransformer

# vectorizer_controller.py
# Shared Sentence Transformers model, instantiated once when this cell runs and
# used by create_text_vectors below.
vectorizer = SentenceTransformer('all-MiniLM-L6-v2')  # Replaceable with another Sentence Transformers model

def create_text_vectors(segments):
    """
    Encode text segments into vector embeddings with the shared `vectorizer`.

    Args:
        segments (list): Collection of text segments.
    Returns:
        The value produced by `vectorizer.encode(..., convert_to_tensor=True)`,
        i.e. the embeddings in tensor form rather than a plain Python list.
    """
    return vectorizer.encode(segments, convert_to_tensor=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
!pip install faiss-cpu
import faiss
import os

# index_manager.py
def build_faiss_index(vectors):
    """
    Constructs a flat L2 FAISS index using the provided vectors.

    Args:
        vectors: A 2-D batch of embeddings, one row per vector. May be a
            PyTorch tensor (on any device), a NumPy array, or a nested
            sequence of numbers.
    Returns:
        faiss.Index: An IndexFlatL2 holding every row of `vectors`.
    Raises:
        ValueError: If `vectors` is empty or is not a 2-D batch.
    """
    import numpy as np  # local import so this cell does not depend on another cell's imports

    # Tensors are detached from autograd and copied to host memory before conversion.
    if hasattr(vectors, "detach"):
        vectors = vectors.detach().cpu().numpy()

    # FAISS consumes C-contiguous float32 matrices.
    matrix = np.ascontiguousarray(vectors, dtype=np.float32)
    if matrix.ndim != 2 or matrix.shape[0] == 0 or matrix.shape[1] == 0:
        raise ValueError(
            f"Expected a non-empty 2-D batch of vectors, got shape {matrix.shape}."
        )

    vector_dimension = matrix.shape[1]  # The dimensionality of the vectors
    faiss_index = faiss.IndexFlatL2(vector_dimension)
    faiss_index.add(matrix)
    return faiss_index

def store_faiss_index(index, file_path):
    """
    Saves the FAISS index to a specified file.

    The parent directory of `file_path` is created if it does not exist yet, so
    the call does not depend on some other cell having prepared it.

    Args:
        index (faiss.Index): The FAISS index to be saved.
        file_path (str): The destination file path to save the index.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare filename has no directory component; nothing to create in that case.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    faiss.write_index(index, file_path)

def retrieve_faiss_index(file_path):
    """
    Read a previously saved FAISS index back from disk.

    Args:
        file_path (str): The path of the index file to load.
    Returns:
        faiss.Index: The index returned by `faiss.read_index`.
    Raises:
        FileNotFoundError: If nothing exists at `file_path`.
    """
    # Guard clause: fail early with a descriptive message when the file is absent.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Index file not found at {file_path}")
    return faiss.read_index(file_path)


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [7]:
!pip install pinecone-client


Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0.

In [8]:
import os
from pinecone import Pinecone, ServerlessSpec

# api_config.py
# Ensure the directory for vector storage exists. Done first so that a missing
# API key does not prevent the local storage directory from being created.
os.makedirs("data/vector_storage", exist_ok=True)

# Load the Pinecone API key from the environment. Credentials must never be
# embedded in the notebook; any key that was previously committed here should
# be treated as compromised and rotated.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Define it in the environment before running this cell."
    )

# Initialize Pinecone client
pinecone_client = Pinecone(api_key=PINECONE_API_KEY)

def _to_float_list(vector):
    """Convert one embedding (tensor row, NumPy row, or sequence) to a list of floats."""
    raw_values = vector.tolist() if hasattr(vector, "tolist") else list(vector)
    return [float(value) for value in raw_values]


def add_vectors_to_pinecone(index_name, vector_data, ids, metadata, batch_size=100):
    """
    Inserts or updates vectors in a Pinecone index, creating the index if needed.

    Args:
        index_name (str): The name of the Pinecone index.
        vector_data: Embeddings to insert, one row per vector. Must support
            `len()`; rows may be tensor rows, NumPy rows, or sequences of numbers.
        ids (iterable): IDs associated with each vector (stringified before upload).
        metadata (iterable): Metadata text for each vector, stored under the
            "description" metadata key.
        batch_size (int): Maximum number of records sent per upsert request.
            Defaults to 100.

    Returns:
        dict: {"message": ...} on success, or {"error": ...} on failure. Errors
        are reported through the return value (and printed) rather than raised.
    """
    try:
        id_list = list(ids)
        text_list = list(metadata)
        vector_count = len(vector_data)

        if vector_count == 0:
            raise ValueError("No vectors were provided.")
        # zip() would silently drop trailing items, so reject mismatched inputs.
        if not (vector_count == len(id_list) == len(text_list)):
            raise ValueError(
                "vector_data, ids and metadata must have the same length "
                f"(got {vector_count}, {len(id_list)}, {len(text_list)})."
            )
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1.")

        # Prepare the records; each embedding goes under the "values" key as
        # plain Python floats.
        records = [
            {"id": str(id_), "values": _to_float_list(vector), "metadata": {"description": text}}
            for id_, vector, text in zip(id_list, vector_data, text_list)
        ]

        # Verify if the index already exists
        if index_name not in pinecone_client.list_indexes().names():
            # Create the index if it does not exist
            pinecone_client.create_index(
                name=index_name,
                dimension=len(records[0]["values"]),
                metric="euclidean",  # You can change the similarity metric (e.g., cosine)
                spec=ServerlessSpec(
                    cloud="aws",
                    region="us-west-2"  # You can modify the region as needed
                )
            )
            # NOTE(review): a freshly created index may need a moment before it
            # accepts writes - confirm whether a readiness wait is required here.

        # Access the index instance (the client method is `Index`, capitalised)
        index_instance = pinecone_client.Index(index_name)

        # Insert the vectors into Pinecone in bounded batches
        for start in range(0, len(records), batch_size):
            index_instance.upsert(vectors=records[start:start + batch_size])

        return {"message": f"Successfully upserted {len(vector_data)} vectors to '{index_name}'"}
    except Exception as error:
        # Deliberately broad: the function's contract is to report failure via
        # the returned dict instead of propagating exceptions.
        print(f"An error occurred while upserting vectors: {str(error)}")
        return {"error": str(error)}


In [12]:
from google.colab import files
# Prompt for a file upload through the Colab `files` helper and keep whatever it
# returns in `uploaded` (the name is not read by any other visible cell).
# NOTE(review): the recorded output shows this upload stored as "example (1).pdf",
# while the pipeline call reads "example.pdf" - confirm the intended file is processed.
uploaded = files.upload()


Saving example.pdf to example (1).pdf


In [9]:
"""from src.models.controllers.text_extractor import extract_text_from_pdf
from src.models.controllers.text_chunker import chunk_text
from src.models.controllers.vector_generator import generate_embeddings
from src.models.controllers.index_manager import build_faiss_index, store_faiss_index
from src.models.controllers.pinecone_manager import add_vectors_to_pinecone"""

def process_pdf_pipeline(file_path, store_in_pinecone=False):
    """
    Orchestrates the full pipeline for processing a PDF file, including text extraction, chunking, embedding generation, and storing.

    The steps use the helpers defined earlier in this notebook:
    read_pdf_content -> split_into_segments -> create_text_vectors, followed by
    either add_vectors_to_pinecone or build_faiss_index + store_faiss_index.

    Args:
        file_path (str): The path to the PDF document.
        store_in_pinecone (bool): Whether to upload the embeddings to Pinecone. Defaults to False.

    Raises:
        ValueError: If the PDF yields no text, so there is nothing to embed.
        RuntimeError: If the Pinecone upload reports an error.
    """
    import os  # local import so the function does not rely on another cell's import

    print("\n--- Initiating PDF Processing Pipeline ---\n")

    # Step 1: Extract text
    print("[1/4] Extracting text from the PDF file...")
    extracted_text = read_pdf_content(file_path)
    print("Text extraction completed successfully.")

    # Step 2: Split text into chunks
    print("[2/4] Splitting the extracted text into smaller chunks...")
    text_chunks = split_into_segments(extracted_text)
    print(f"Text successfully split into {len(text_chunks)} chunks.")
    if not text_chunks:
        # Stop here with a clear message instead of failing later inside the
        # embedding or indexing step.
        raise ValueError(f"No text could be extracted from '{file_path}'; nothing to embed.")

    # Step 3: Generate embeddings for chunks
    print("[3/4] Generating vector embeddings for the chunks...")
    embeddings = create_text_vectors(text_chunks)
    print("Embedding generation completed successfully.")

    # Step 4: Store embeddings
    if store_in_pinecone:
        print("[4/4] Uploading embeddings to Pinecone...")
        upload_result = add_vectors_to_pinecone(
            "pdf-embedding-index", embeddings, ids=range(len(text_chunks)), metadata=text_chunks
        )
        # The upload helper reports failure through its return value, so check
        # it before claiming success.
        if isinstance(upload_result, dict) and "error" in upload_result:
            raise RuntimeError(f"Pinecone upload failed: {upload_result['error']}")
        print("Embeddings uploaded to Pinecone successfully.")
    else:
        print("[4/4] Saving embeddings in a local FAISS index...")
        index_path = 'data/vector_storage/index.faiss'
        # Make sure the target directory exists regardless of which cells ran before.
        os.makedirs(os.path.dirname(index_path), exist_ok=True)
        faiss_index = build_faiss_index(embeddings)
        store_faiss_index(faiss_index, index_path)
        print("FAISS index saved locally.")

    print("\n--- PDF Processing Pipeline Completed ---\n")


In [None]:
# Run the pipeline on "example.pdf"; store_in_pinecone is left at its default
# (False), so the embeddings are written to the local FAISS index.
process_pdf_pipeline("example.pdf")


--- Starting PDF Processing Pipeline ---

[1/4] Extracting text from PDF...
Text extraction completed.
[2/4] Splitting text into chunks...
Text split into 37 chunks.
[3/4] Generating embeddings for chunks...
Embeddings generation completed.
[4/4] Storing embeddings in a FAISS index...
FAISS index saved locally.

--- PDF Processing Pipeline Completed ---

