In [1]:
import json
import requests
import pandas as pd

from tqdm import tqdm

import chromadb
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# path to a json with papers formatted as [{"title": "...", "abstract": "..."}, ... ]
pap2vec_path = 'data/recommender_systems.json'
device = "cuda" # use "cuda" if you've got a GPU otherwise put "cpu" here

In [3]:
model = SentenceTransformer("sentence-transformers/allenai-specter", device=device)

In [4]:
with open(pap2vec_path, 'r') as file:
    papers_db = json.load(file)

papers_db = [paper for paper in papers_db if paper["title"] is not None]

In [5]:
sep = model.tokenizer.sep_token
# concat title and abstract as done by the authors https://github.com/allenai/specter
db_tabs_concat = [d["title"] + sep + d["abstract"] for d in papers_db]

In [6]:
embeddings = model.encode(db_tabs_concat)

In [7]:
chroma_client = chromadb.PersistentClient(path="cool_data/recommender_systems")
collection = chroma_client.create_collection(name="recommender_systems")

for i, (paper, embedding) in tqdm(enumerate(zip(papers_db, embeddings))):
    collection.add(
        documents=[paper["title"] + sep + paper["abstract"]],
        embeddings=[embedding.tolist()],
        metadatas=[{"title": paper["title"], "abstract": paper["abstract"]}],
        ids=[str(i)]
    )


2000it [00:25, 77.85it/s]
