In [None]:
import json
import pickle
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import pandas as pd

# --- Run configuration ---------------------------------------------------
# Texts are embedded BATCH_SIZE at a time; progress is checkpointed once
# every SAVE_EVERY batches.
BATCH_SIZE = 32
SAVE_EVERY = 100

# Input: arXiv metadata snapshot, parsed one JSON record per line.
json_path = '../resources/arxiv-metadata-oai-snapshot.json'

# Output directory for the embeddings, the checkpoint and the saved model.
# Created up front (parents included) so it exists before anything is written.
SAVE_PATH = Path('../model')
SAVE_PATH.mkdir(exist_ok=True, parents=True)
checkpoint_path = SAVE_PATH.joinpath('checkpoint.pkl')

# Build the corpus: one "<title>. <abstract>" string per computer-science
# paper, with the matching arXiv id stored at the same index in `ids`.
corpus = []
ids = []

with open(json_path, 'r', encoding='utf-8') as f:
    for line in tqdm(f, desc="Reading JSON"):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of crashing in json.loads.
            continue
        record = json.loads(line)

        # `categories` is treated as a whitespace-separated list of category
        # codes (arXiv snapshot format -- confirm against the dataset docs).
        # Match on the code prefix: a plain substring test for 'cs.' also
        # hits every 'physics.*' code, because "physics." itself contains
        # "cs.", which wrongly pulled physics papers into the CS corpus.
        category_codes = (record.get('categories') or '').split()
        if not any(code.startswith('cs.') for code in category_codes):
            continue

        # Collapse embedded newlines so each paper is a single line of text.
        title = record['title'].strip().replace('\n', ' ')
        abstract = record['abstract'].strip().replace('\n', ' ')
        corpus.append(f"{title}. {abstract}")
        ids.append(record['id'])

# Kept as a DataFrame because the final save step reads the ids from `df`.
df = pd.DataFrame({'id': ids})
print(f"📚 Loaded {len(corpus)} CS papers.")

# Load the 'allenai-specter' sentence-embedding model.
model = SentenceTransformer('allenai-specter')

# Restore progress from a previous run when a checkpoint file is present;
# otherwise start from the first paper with an empty embedding list.
if not checkpoint_path.exists():
    start_idx = 0
    embeddings = []
else:
    # NOTE: pickle is only appropriate here because the checkpoint is a file
    # this script wrote itself; never point this at untrusted data.
    with open(checkpoint_path, 'rb') as ckpt_file:
        saved_state = pickle.load(ckpt_file)
        embeddings = saved_state['embeddings']
        start_idx = saved_state['start_idx']
    print(f"🔄 Resuming from batch {start_idx // BATCH_SIZE}")

# Encode the corpus batch by batch, starting at `start_idx`, and checkpoint
# every SAVE_EVERY batches so an interrupted run can resume.
#
# The checkpoint is written to a sibling temp file and then renamed over the
# real checkpoint. Writing in place meant an interruption in the middle of
# pickle.dump left a truncated checkpoint.pkl that could not be loaded on
# resume, throwing away all progress; the rename only happens after the dump
# has completed, so the previous checkpoint stays intact until then.
checkpoint_tmp_path = checkpoint_path.with_name(checkpoint_path.name + '.tmp')

for i in tqdm(range(start_idx, len(corpus), BATCH_SIZE), desc="🔁 Encoding"):
    batch = corpus[i:i + BATCH_SIZE]
    batch_embeddings = model.encode(batch, convert_to_numpy=True)
    # Extending with the returned array appends one embedding row per text.
    embeddings.extend(batch_embeddings)

    if (i // BATCH_SIZE) % SAVE_EVERY == 0:
        with open(checkpoint_tmp_path, 'wb') as f:
            # start_idx points at the first text that has NOT been encoded yet.
            pickle.dump({'embeddings': embeddings, 'start_idx': i + BATCH_SIZE}, f)
        checkpoint_tmp_path.replace(checkpoint_path)

# Remove a stale temp file left behind by an earlier interrupted write.
checkpoint_tmp_path.unlink(missing_ok=True)

# Persist the final artefacts: ids, source texts and the embeddings
# (converted to a single NumPy array) bundled into one pickle file.
with open(SAVE_PATH.joinpath('arxiv_embeddings.pkl'), 'wb') as out_file:
    final_payload = dict(
        ids=df['id'].tolist(),
        texts=corpus,
        embeddings=np.array(embeddings),
    )
    pickle.dump(final_payload, out_file)

# The run completed, so the resume checkpoint is no longer needed.
checkpoint_path.unlink(missing_ok=True)

# Save the model next to the embeddings.
model_dir = SAVE_PATH / 'sentence_model'
model.save(str(model_dir))
print("✅ Finished. All data and model saved.")


  from .autonotebook import tqdm as notebook_tqdm





Loading records: 2744489it [00:58, 46880.02it/s]


Loaded 1017166 computer science papers.


Batches:   3%|▎         | 964/31787 [8:22:34<393:30:56, 45.96s/it]