In [5]:
!pip install pymongo transformers torch pinecone python-dotenv tqdm -q
print("Packages installed")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/587.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/259.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.3/259.3 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hPackages installed


In [2]:
import torch
# Pick the compute device: use CUDA when PyTorch can see a GPU, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

if device != "cuda":
    # No GPU visible: tell the user how to switch the runtime.
    print("WARNING: GPU not enabled!")
    print("Go to Runtime → Change runtime type → GPU")
else:
    # Report which GPU was found and its total memory in GB (bytes / 1e9).
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"VRAM: {vram_gb:.1f} GB")

Device: cuda
GPU: Tesla T4
VRAM: 15.8 GB


In [7]:
import os

# Secrets are read from the process environment so that they never get saved
# into the notebook file. Export MONGO_URI and PINECONE_API_KEY in the
# environment that starts the kernel before running this cell.
# NOTE(review): any credential that was previously stored in this notebook
# should be treated as leaked and rotated.

# MongoDB Atlas connection string (includes the database user and password).
MONGO_URI = os.environ.get("MONGO_URI", "")

# Pinecone API key.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

# Surface missing secrets here rather than leaving the reader to decode a
# connection error in a later cell.
_missing_secrets = [
    name for name in ("MONGO_URI", "PINECONE_API_KEY") if not os.environ.get(name)
]
if _missing_secrets:
    print(f"WARNING: environment variable(s) not set: {', '.join(_missing_secrets)}")

# Model: Hugging Face checkpoint name passed to from_pretrained().
MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
BATCH_SIZE = 64  # texts per forward pass; GPU can handle more

In [8]:
import pymongo
from transformers import AutoTokenizer, AutoModel
from pinecone import Pinecone
from tqdm import tqdm
from datetime import datetime, timezone

# --- MongoDB: open the collection that stores the paper chunks ---
client = pymongo.MongoClient(MONGO_URI)
db = client["medicrew"]
chunks_collection = db["paper_chunks"]

# Documents still flagged embedded=False are the work left for this run.
total = chunks_collection.count_documents({"embedded": False})
print(f"Chunks to process: {total:,}")

# --- Encoder: tokenizer + model for the checkpoint named by MODEL_NAME ---
# NOTE(review): the log messages say "BioBERT", but the checkpoint actually
# loaded is whatever MODEL_NAME is set to; confirm the label is intended.
print("\nLoading BioBERT...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Move the weights to the selected device and switch to inference mode.
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()
print("BioBERT loaded")

# --- Pinecone: connect to the vector index and print its current stats ---
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("medical-papers-biobert")
print(f"Pinecone connected: {index.describe_index_stats()}")

Chunks to process: 28,548

Loading BioBERT...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

BioBERT loaded
Pinecone connected: {'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 100}},
 'total_vector_count': 100,
 'vector_type': 'dense'}


In [9]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings over the positions kept by the mask.

    Args:
        model_output: Indexable model output; element 0 is used as the
            per-token embeddings.
        attention_mask: Mask tensor that gets a trailing axis added and is
            expanded to the token-embedding shape (presumably 1 for real
            tokens and 0 for padding).

    Returns:
        The mask-weighted sum of the token embeddings along dim 1, divided by
        the mask sum along dim 1.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(1)
    # Clamp the divisor so a fully masked row divides by 1e-9 instead of 0.
    mask_total = mask.sum(1).clamp(min=1e-9)
    return masked_sum / mask_total

def create_embeddings(texts):
    """Embed a batch of texts with the notebook's tokenizer and model.

    Uses the module-level `tokenizer`, `model`, and `device`. Each text is
    tokenized (padded to the longest item, truncated to 512 tokens), run
    through the model without gradient tracking, mean-pooled over the
    attention mask, and L2-normalized.

    Args:
        texts: The texts to embed, passed to the tokenizer as one batch.

    Returns:
        A list with one embedding (a list of floats) per input text. An empty
        list or tuple yields an empty list.
    """
    # Nothing to embed: return early instead of handing the tokenizer an
    # empty batch (which it may reject) and running a pointless forward pass.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    encoded = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
    # The tokenizer output must live on the same device as the model.
    encoded = {k: v.to(device) for k, v in encoded.items()}

    with torch.no_grad():
        output = model(**encoded)

    embeddings = mean_pooling(output, encoded['attention_mask'])
    # Unit-length rows along dim 1.
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return embeddings.cpu().numpy().tolist()

In [11]:
# Count the chunks that still need an embedding.
total = chunks_collection.count_documents({'embedded': False})
print(f"Total to process: {total:,}")

BATCH_SIZE = 64    # texts embedded per model forward pass
FETCH_BATCH = 500  # documents pulled from MongoDB per query

processed = 0

with tqdm(total=total, desc="Processing") as pbar:
    while processed < total:
        # Fetch the next page of pending chunks and materialize it right away,
        # so no cursor stays open while the embeddings are computed. The same
        # filter is re-run each time: the writes below flip `embedded` to True,
        # which is what makes the next query return different documents.
        chunks = list(chunks_collection.find(
            {'embedded': False},
            {'_id': 1, 'text': 1}
        ).limit(FETCH_BATCH))

        if not chunks:
            break

        # Embed the fetched page in model-sized batches.
        for i in range(0, len(chunks), BATCH_SIZE):
            batch = chunks[i:i+BATCH_SIZE]
            texts = [c['text'] for c in batch]
            ids = [c['_id'] for c in batch]

            # Create embeddings
            embeddings = create_embeddings(texts)

            # Write the whole batch back in one bulk request instead of one
            # update_one round trip per chunk. All documents of a batch share
            # a single timestamp.
            embedded_at = datetime.now(timezone.utc)
            updates = [
                pymongo.UpdateOne(
                    {'_id': chunk_id},
                    {'$set': {
                        'embedding': embedding,
                        'embedded': True,
                        'embedded_at': embedded_at
                    }}
                )
                for chunk_id, embedding in zip(ids, embeddings)
            ]
            # The updates target distinct _ids and are independent, so they
            # do not need to be applied in order.
            chunks_collection.bulk_write(updates, ordered=False)

            processed += len(batch)
            pbar.update(len(batch))

print(f"Processing complete! Embedded {processed:,} chunks")

Total to process: 21,508


Processing: 100%|██████████| 21508/21508 [47:09<00:00,  7.60it/s]

Processing complete! Embedded 21,508 chunks





In [None]:
# Placeholder cell: the Pinecone upload step is still to be added once the
# embeddings are done. For now this only prints a status message.
print("Ready to upload to Pinecone")