In [1]:
import json
import yaml
import requests
import pandas as pd
from itertools import batched
import os

from tqdm import tqdm

from sentence_transformers import SentenceTransformer

from pinecone import Pinecone

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Pinecone connection settings from the local YAML config file.
with open('pinecone.yml', 'r') as config_file:
    pinecone_config = yaml.safe_load(config_file)

In [3]:
# Compute device: "cuda" if you've got a GPU, otherwise put "cpu" here.
device = "cuda"

# JSON file with the papers, formatted as [{"title": "...", "abstract": "..."}, ... ]
pap2vec_path = 'data/recommender_systems.json'

In [4]:
# Load the SPECTER sentence-embedding model on the configured device.
model = SentenceTransformer(
    "sentence-transformers/allenai-specter",
    device=device,
)

In [5]:
# Read the papers as UTF-8. Without an explicit encoding, open() falls back to
# the platform locale, which can fail on or garble non-ASCII characters in
# titles and abstracts.
with open(pap2vec_path, 'r', encoding='utf-8') as file:
    papers_db = json.load(file)

# Drop entries without a title; the title is required to build the model input.
papers_db = [paper for paper in papers_db if paper["title"] is not None]

In [6]:
# Model input is title + separator token + abstract, as done by the authors:
# https://github.com/allenai/specter
sep = model.tokenizer.sep_token

# Only the title is guaranteed to be non-null at this point. Fall back to an
# empty string for a missing/null abstract so the concatenation below does not
# raise TypeError and the stored metadata always holds a string.
abstracts = [paper.get("abstract") or "" for paper in papers_db]
texts = [paper["title"] + sep + abstract for paper, abstract in zip(papers_db, abstracts)]

# Encode every text in a single call so the model can batch internally instead
# of running one forward pass per paper. Skip the call when there is nothing
# to encode.
vectors = model.encode(texts) if texts else []

# One record per paper: a positional id, the embedding as a plain list of
# floats, and the title/abstract kept as metadata.
embeddings = [
    {
        "id": f"id-{i}",
        "values": vector.tolist(),
        "metadata": {
            "title": paper["title"],
            "abstract": abstract,
        },
    }
    for i, (paper, abstract, vector) in enumerate(zip(papers_db, abstracts, vectors))
]

In [7]:
# Connect to the Pinecone index: the API key is read from the PINECONEKEY
# environment variable and the index host from the loaded pinecone config.
pc = Pinecone(api_key=os.getenv("PINECONEKEY"))
index = pc.Index(host=pinecone_config["HOST"])

upload_count = len(embeddings)
batch_upload = 100  # number of vectors sent per upsert request

# Ceiling division gives the number of requests, so tqdm can render a real
# progress bar with an ETA instead of a bare iteration counter.
batch_count = (upload_count + batch_upload - 1) // batch_upload

for batch in tqdm(batched(embeddings, batch_upload), total=batch_count):
    # Call the public upsert() API rather than the private _upsert_batch()
    # helper, which is an implementation detail of the client.
    # batched() yields tuples, so convert each batch to a list first.
    index.upsert(vectors=list(batch), namespace="general_trial")

20it [04:07, 12.39s/it]
