### Step 1: Initialize Pinecone

In [2]:
import os
from dotenv import load_dotenv
# Load key/value pairs from a .env file into the process environment so that
# later cells can read secrets with os.getenv instead of hard-coding them.
load_dotenv()

True

In [3]:
# Pinecone API key taken from the environment; os.getenv yields None when the
# variable is not set, so a missing key only surfaces when the client is used.
pinecone_api = os.getenv("PINECONE_API_KEY")

In [None]:
from pinecone import Pinecone

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key = pinecone_api)

# Connecting to a serverless index
# NOTE(review): the index is only opened here, not created; presumably
# "new-test" already exists with a dimension matching the embedding model — confirm.
index_name = "new-test"
index = pc.Index(index_name)


  from tqdm.autonotebook import tqdm


### Step 2: Load Hugging Face Model and Tokenizer

In [5]:
import torch
from transformers import AutoTokenizer, AutoModel

# Hugging Face Hub identifier of the embedding model; the tokenizer and model
# loaded from it are the module-level objects that generate_embeddings uses.
model_name = "abhinand/MedEmbed-large-v0.1"

# Load the tokenizer and the model weights that belong to model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to generate embeddings
def generate_embeddings(text):
    """Embed ``text`` with the module-level tokenizer/model via masked mean pooling.

    The token embeddings of the last hidden layer are averaged over the
    sequence axis. Positions that the tokenizer marks as padding
    (``attention_mask == 0``) are excluded from the average, so padded batches
    are pooled correctly. For a single unpadded string the mask is all ones
    and the result matches a plain mean (up to floating-point rounding).

    Args:
        text: Input passed straight to the tokenizer. Callers in this notebook
            pass a single string; a list of strings is presumably accepted by
            the tokenizer as a batch.

    Returns:
        The pooled embedding converted with ``.squeeze().tolist()``: a flat
        list of floats for one input, a list of such lists for a batch of
        several inputs.
    """
    # truncation=True without max_length: the tokenizer cuts the input at its
    # own configured maximum, so overly long text is embedded only partially.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    mask = inputs.get("attention_mask")
    if mask is None:
        # No mask available: fall back to the plain mean over all positions.
        embeddings = hidden.mean(dim=1)
    else:
        # Zero out padded positions and divide by the number of real tokens.
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        embeddings = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

    return embeddings.squeeze().tolist()  # Ensure the output is a flat list

### Step 3: Define Functions for Text and Table Extraction

In [56]:
import PyPDF2
import pdfplumber
import uuid

# Function to extract text from a PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The non-empty page texts joined with a newline. Pages for which
        ``extract_text()`` yields ``None`` or an empty string are skipped, so
        a PDF without any extractable text produces ``''``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may give None for a page without a text layer;
        # normalise that to '' instead of failing on string concatenation.
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    # A newline between pages keeps the last word of one page from being fused
    # with the first word of the next when the text is later split on whitespace.
    return '\n'.join(page_text for page_text in page_texts if page_text)

# Function to extract tables from a PDF using pdfplumber
def extract_tables_from_pdf(pdf_path):
    """Collect every table found on every page of a PDF into one flat list.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list with the items of each page's ``extract_tables()`` result, in
        page order.
    """
    with pdfplumber.open(pdf_path) as document:
        return [
            found_table
            for current_page in document.pages
            for found_table in current_page.extract_tables()
        ]

### Step 4: Process PDF files in the folder

In [57]:
# Helper function to chunk text for large documents
def chunk_text(text, max_length=512):
    """Lazily split ``text`` into chunks of at most ``max_length`` words.

    The text is split on whitespace and each chunk is re-joined with single
    spaces, so the original spacing and line breaks are not preserved.
    ``max_length`` counts whitespace-separated words, not model tokens.

    Args:
        text: The string to split.
        max_length: Maximum number of words per chunk.

    Yields:
        One string per chunk, in document order; nothing for empty text.
    """
    tokens = text.split()
    chunk_starts = range(0, len(tokens), max_length)
    yield from (
        ' '.join(tokens[begin:begin + max_length]) for begin in chunk_starts
    )

# Step 4: Process PDF files in the folder and upsert with UUIDs and metadata
def _upsert_in_batches(vectors, batch_size=100):
    """Upsert an iterable of (id, values, metadata) tuples in fixed-size batches."""
    batch = []
    for vector in vectors:
        batch.append(vector)
        if len(batch) >= batch_size:
            index.upsert(vectors=batch)
            batch = []
    if batch:  # send the final, partially filled batch
        index.upsert(vectors=batch)


def _iter_text_vectors(pdf_path, document_uuid):
    """Yield one (id, embedding, metadata) tuple per text chunk of the PDF."""
    text = extract_text_from_pdf(pdf_path)
    for chunk_idx, chunk in enumerate(chunk_text(text)):
        # Vector id combines the document UUID with the chunk index.
        chunk_uuid = f"{document_uuid}_text_chunk_{chunk_idx}"
        # The original chunk text is stored so query results are readable.
        metadata = {
            "document_id": str(document_uuid),
            "chunk_id": chunk_idx,
            "text": chunk}
        yield (chunk_uuid, generate_embeddings(chunk), metadata)


def _iter_table_row_vectors(pdf_path, document_uuid):
    """Yield one (id, embedding, metadata) tuple per non-blank table row of the PDF."""
    tables = extract_tables_from_pdf(pdf_path)
    for table_idx, table in enumerate(tables):
        for row_idx, row in enumerate(table):
            # Empty cells may arrive as None; render them as '' so the literal
            # word "None" does not end up in the embedded text or the metadata.
            row_string = ' '.join('' if cell is None else str(cell) for cell in row)
            if not row_string.strip():
                # A row with no visible content has nothing to retrieve.
                # row_idx still comes from enumerate, so ids of other rows are unchanged.
                continue

            # Vector id combines the document UUID with the table and row index.
            row_uuid = f"{document_uuid}_table_{table_idx}_row_{row_idx}"
            metadata = {
                "document_id": str(document_uuid),
                "table_id": table_idx,
                "row_id": row_idx,
                "text": row_string
            }
            yield (row_uuid, generate_embeddings(row_string), metadata)


def process_pdfs_in_folder(folder_path):
    """Embed the text chunks and table rows of every PDF in a folder and upsert them.

    For each file whose name ends in ``.pdf`` (case-insensitive) a fresh
    ``uuid4`` document id is generated. Text chunks are upserted first, then
    table rows. Vectors are sent to the module-level ``index`` in batches
    rather than one request per vector.

    NOTE(review): because the document id is a random ``uuid4``, running this
    function again on the same folder creates new vectors instead of
    overwriting the earlier ones.

    Args:
        folder_path: Directory that is listed (non-recursively) for PDF files.
    """
    # sorted(): os.listdir returns names in arbitrary order; sort for a stable run order.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(folder_path, file_name)
        print(f"Processing {file_name}...")

        # Generate a base UUID for the document
        document_uuid = uuid.uuid4()

        # Step 4.1: Extract and Embed Text
        _upsert_in_batches(_iter_text_vectors(pdf_path, document_uuid))

        # Step 4.2: Extract and Embed Tables
        _upsert_in_batches(_iter_table_row_vectors(pdf_path, document_uuid))

### Step 5: Define folder path containing PDFs and run the embedding process

In [None]:
# Folder that is scanned for PDF files to embed and upsert.
# NOTE(review): this is an absolute path on a local drive, so the cell only
# runs on a machine with this exact layout — consider making it configurable.
folder_path = "K:/USMLE-RAG/artifacts/raw"
process_pdfs_in_folder(folder_path)

Processing USMLE_Step1_2023-part-1.pdf...
Processing USMLE_Step1_2023-part-10.pdf...
Processing USMLE_Step1_2023-part-11.pdf...
Processing USMLE_Step1_2023-part-12.pdf...
Processing USMLE_Step1_2023-part-13.pdf...
Processing USMLE_Step1_2023-part-14.pdf...
Processing USMLE_Step1_2023-part-15.pdf...
Processing USMLE_Step1_2023-part-16.pdf...
Processing USMLE_Step1_2023-part-17.pdf...
Processing USMLE_Step1_2023-part-18.pdf...
Processing USMLE_Step1_2023-part-19.pdf...
Processing USMLE_Step1_2023-part-2.pdf...
Processing USMLE_Step1_2023-part-3.pdf...
Processing USMLE_Step1_2023-part-4.pdf...
Processing USMLE_Step1_2023-part-5.pdf...
Processing USMLE_Step1_2023-part-6.pdf...
Processing USMLE_Step1_2023-part-7.pdf...
Processing USMLE_Step1_2023-part-8.pdf...
Processing USMLE_Step1_2023-part-9.pdf...


### Step 6: Query Pinecone for similar text or table rows

In [7]:
# Step 6: Example: Query Pinecone for similar text or table rows
def query_pinecone(query_text, top_k=5):
    """Embed ``query_text`` and look up its nearest neighbours in the index.

    Args:
        query_text: Text that is embedded with ``generate_embeddings``.
        top_k: Number of matches requested from the index.

    Returns:
        Whatever ``index.query`` returns; metadata is requested for each match.
    """
    return index.query(
        vector=generate_embeddings(query_text),
        top_k=top_k,
        include_metadata=True,
    )

# Example query
# Runs one free-text query against the index and prints the raw response,
# including the stored metadata of each match.
query_result = query_pinecone("explain modes of inheritance")
print(query_result)

{'matches': [{'id': '1f11975c-04a7-48ac-bce7-e6133b643121_table_112_row_0',
              'metadata': {'document_id': '1f11975c-04a7-48ac-bce7-e6133b643121',
                           'row_id': 0.0,
                           'table_id': 112.0,
                           'text': 'TYPE INHERITANCE PATHOGENESIS \x8f BlOOD '
                                   'lEVEl ClINICAl'},
              'score': 0.74628615,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '147c8efc-5c70-49b0-aae5-b820c8b0cfe6_table_5_row_15',
              'metadata': {'document_id': '147c8efc-5c70-49b0-aae5-b820c8b0cfe6',
                           'row_id': 15.0,
                           'table_id': 5.0,
                           'text': 'Genetics 5–9   '},
              'score': 0.72350407,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '1f11975c-04a7-48ac-bce7-e6133b643121_table_62_row