In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and runtime warnings
# raised by later cells are hidden as well.
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install transformers torch

In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read and write
# files under /content/drive/MyDrive/llms/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import nltk
import torch
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize

# Download the NLTK tokenizer data packages 'punkt' and 'punkt_tab'.
# NOTE(review): presumably these back `sent_tokenize` (which resource is looked
# up depends on the installed NLTK version) - confirm before removing either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Load abstracts
# Each useful line of the input file has the form
#   <integer paper id>|--|<abstract text>
# Lines that are blank or contain no "|--|" separator are skipped.
abstracts = {}
with open("/content/drive/MyDrive/llms/abstracts.txt", "r", encoding="utf-8") as f:
    # Stream the file line by line instead of reading it whole and splitting it,
    # so the full text and the list of lines are never held in memory together.
    for line in f:
        entry = line.rstrip("\n")

        # Split on the FIRST separator only: an abstract whose text itself
        # contains "|--|" is kept intact instead of being silently dropped.
        id_field, separator, abstract_text = entry.partition("|--|")
        if not separator:
            continue

        # int() raises ValueError on a non-integer id, surfacing a malformed
        # file rather than hiding it.
        paper_id = int(id_field)
        abstracts[paper_id] = abstract_text
    print(f"Loaded {len(abstracts)} abstracts.")

Loaded 138499 abstracts.


In [None]:
def generate_sentence_bert_embeddings(abstracts, model, batch_size=128):
    """
    Generate sentence-level BERT embeddings for paper abstracts.

    Each abstract is split into sentences with ``sent_tokenize`` and the
    resulting sentences are encoded with ``model.encode``.

    Args:
        abstracts: Mapping of paper id -> abstract text. A value of ``None`` is
            treated like an empty abstract.
        model: Object exposing ``encode(sentences, batch_size=...)`` that returns
            one embedding per input sentence.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        dict mapping each paper id to the per-sentence embeddings of its
        abstract (one row per sentence). Papers whose abstract is missing,
        blank, or yields no sentences get a one-row array holding the embedding
        of the empty string, so every value has the same "rows of embeddings"
        layout.
    """
    paper_to_sentence_embeddings = {}

    # Encode the empty string once and keep the batch dimension (shape
    # (1, dim)) so the fallback has the same layout as a real abstract's
    # embeddings instead of being a Python list wrapping a 1-D vector.
    default_embedding = np.asarray(model.encode([""]))

    # NOTE(review): encode() is called once per abstract, so `batch_size` only
    # has an effect for abstracts with more sentences than `batch_size`.
    for paper_id, abstract in tqdm(abstracts.items(), desc="Generating sentence embeddings"):
        text = abstract or ""

        # Tokenize abstract into sentences; blank text is not tokenized at all.
        sentences = sent_tokenize(text) if text.strip() else []

        if not sentences:
            # Give each paper its own copy so mutating one fallback entry
            # cannot alter the others.
            paper_to_sentence_embeddings[paper_id] = default_embedding.copy()
            continue

        # Generate sentence embeddings and store them under the paper id.
        paper_to_sentence_embeddings[paper_id] = model.encode(sentences, batch_size=batch_size)

    return paper_to_sentence_embeddings

In [None]:
# Encode every abstract with all-MiniLM-L6-v2 and persist the result to Drive.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').to(device)

sentence_bert_embeddings = generate_sentence_bert_embeddings(abstracts, sentence_model)

# The value handed to np.save is a dict, which NumPy stores as a pickled object
# array; reload it with np.load(path, allow_pickle=True).item().
np.save('/content/drive/MyDrive/llms/all-MiniLM-L6-v2.npy', sentence_bert_embeddings)

Generating sentence embeddings: 100%|██████████| 138499/138499 [19:05<00:00, 120.91it/s]


In [None]:
# Encode every abstract with all-mpnet-base-v2 and persist the result to Drive.
# This rebinds `sentence_model` and `sentence_bert_embeddings`; whatever those
# names held before is replaced.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

sentence_bert_embeddings = generate_sentence_bert_embeddings(abstracts, sentence_model)

# The value handed to np.save is a dict, which NumPy stores as a pickled object
# array; reload it with np.load(path, allow_pickle=True).item().
np.save('/content/drive/MyDrive/llms/all-mpnet-base-v2.npy', sentence_bert_embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating sentence embeddings: 100%|██████████| 138499/138499 [36:40<00:00, 62.95it/s]
