In [1]:
# ─── [0] Kernel‐wide settings: put this in the very first cell, then restart! ─────────
import os
import warnings

# 0a) Turn off Hugging Face Hub progress bars and the symlink-cache warning.
#     Both variables are set before `transformers` is imported below so they
#     are already in the environment when the hub library reads them.
#     The symlink switch is spelled HF_HUB_DISABLE_SYMLINKS_WARNING (with the
#     "HUB_" infix); a key without it is not recognised and has no effect.
os.environ.update(
    {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "HF_HUB_DISABLE_SYMLINKS_WARNING": "1",
    }
)

# 0b) Drop all FutureWarning / UserWarning noise for the rest of the session.
for _noisy_category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_noisy_category)

# 0c) Silence the transformers logger: only errors are reported from here on.
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()


In [2]:
# 1) mount Drive and imports
from google.colab import drive
from pathlib import Path

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm

# Make Google Drive visible under /content/drive so the CSV paths below resolve.
drive.mount("/content/drive")

# 2) paths
DATA_DIR = Path("/content/drive/MyDrive")
csv_in   = DATA_DIR / "79k_topic_entity_labeled.csv"
csv_out  = DATA_DIR / "79k_topic_entity_labeled_embedded.csv"

# 3) load texts
df    = pd.read_csv(csv_in, low_memory=False)
# Missing chunks become empty strings. astype(str) then guarantees every entry
# is a str, because type inference in read_csv can leave non-string cells in
# the column and the tokenizer only accepts strings.
texts = df["chunk"].fillna("").astype(str).tolist()

# 4) load model
model_name = "sentence-transformers/all-mpnet-base-v2"
# Prefer the GPU, but fall back to CPU instead of crashing when the runtime
# was started without an accelerator.
device     = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
# eval() switches off dropout so repeated runs yield the same embeddings.
model      = AutoModel.from_pretrained(model_name).eval().to(device)
# 5) pooling fn
def mean_pooling(out, mask):
    """Average token embeddings over dimension 1, ignoring masked-out tokens.

    Args:
        out: Model output object exposing ``last_hidden_state``
            (assumed to be shaped (batch, seq_len, hidden) — TODO confirm).
        mask: Attention mask with 1 for real tokens and 0 for padding; it is
            broadcast along a new trailing axis to match ``last_hidden_state``.

    Returns:
        Tensor holding, per row, the mask-weighted mean of the token vectors.
        Rows whose mask is all zeros come out as zeros because the divisor is
        clamped to a tiny positive value rather than being allowed to hit 0.
    """
    hidden  = out.last_hidden_state
    # Stretch the mask across the hidden axis so it can weight every feature.
    weights = mask[..., None].expand(hidden.size()).float()
    total   = torch.sum(hidden * weights, dim=1)
    denom   = torch.clamp(weights.sum(dim=1), min=1e-9)
    return total / denom

# 6) embed with a live bar, larger batch
batch_size  = 128   # ← increased batch size
total_docs  = len(texts)
all_embs    = []

# Send every batch to wherever the model's weights actually live instead of
# assuming "cuda"; this keeps the loop working on a CPU-only runtime too.
model_device = next(model.parameters()).device

# The context manager closes the bar even if a batch raises part-way through.
with tqdm(total=total_docs, desc="Embedding docs", unit="doc") as pbar:
    for start in range(0, total_docs, batch_size):
        batch = texts[start : start + batch_size]
        enc   = tokenizer(batch, padding=True, truncation=True,
                          max_length=512, return_tensors="pt").to(model_device)
        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            out = model(**enc)
        # Pool on the device, then move the small per-document vectors to host
        # memory so results do not accumulate on the accelerator.
        embs = mean_pooling(out, enc["attention_mask"]).cpu().numpy()
        all_embs.append(embs)
        pbar.update(len(batch))

# 7) stack & vectorized df join
if all_embs:
    embs = np.vstack(all_embs)
else:
    # np.vstack rejects an empty list; with no documents, fall back to a
    # zero-row matrix whose width is the model's hidden size.
    embs = np.empty((0, model.config.hidden_size), dtype=np.float32)
cols   = [f"emb_{i}" for i in range(embs.shape[1])]
# Reuse df's index so the column-wise concat aligns row for row.
emb_df = pd.DataFrame(embs, columns=cols, index=df.index)
df     = pd.concat([df, emb_df], axis=1)

# 8) save
df.to_csv(csv_out, index=False)
print("✅ Embeddings saved to", csv_out)


Mounted at /content/drive


Embedding docs:   0%|          | 0/390529 [00:00<?, ?doc/s]

✅ Embeddings saved to /content/drive/MyDrive/79k_topic_entity_labeled_embedded.csv
