In [1]:
import os
import sys
sys.path.append("..")

import glob
from tqdm import tqdm
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from app import app 
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#### manually configurable parameters
eb_batch_size = 500 # how many papers to embed per batch
up_batch_size = eb_batch_size # how many papers to upload per batch

data_path = "../data/publishers/nature_filtered"

device = app.config["DEVICE"]
scientific_embedding_model = app.config["SCI_EMB_MODEL"]
pinecone_index_name = "nature_filtered_200K"


def read_data_from_path(
        path: str    
    ):
    """ Reads pickled dataframes from a path """
    pickle_files = glob.glob(os.path.join(path,"*.pkl"))
    df = pd.DataFrame()
    for pickle in tqdm(pickle_files):
        new = pd.read_pickle(pickle)
        # print(f"\nloaded file {pickle} with {len(new)} new elements")
        df = pd.concat([df, new])
    return df


print("Reading data from specified path...")
df = read_data_from_path(data_path)
print(f"Loaded {len(df)} papers")

print("Loading scientific embedding model...")
model = SentenceTransformer(scientific_embedding_model, device=device)
separation_token = model.tokenizer.sep_token
print(f"Using separation token: {separation_token}")

print("Filtering out papers with missing abstracts...")
df = df[df["abstract"] != "MISSING_ABSTRACT"].reset_index(drop=True)
print(f"Remaining papers after filtering: {len(df)}")

print("Selecting required fields from papers...")
df = df[["abstract",*app.config["OPENALEX_PAPER_FIELDS"].split(",")]]

print("Concatenating titles and abstracts...")
df["concat"] = df["title"] + separation_token + df["abstract"]

print("Initializing Pinecone client...")
pc = Pinecone(api_key=app.config["PINECONE_KEY"])
index = pc.Index(host=app.config["PINECONE_HOST"])
print("Pinecone client initialized successfully")


Reading data from specified path...


100%|██████████| 19/19 [00:50<00:00,  2.66s/it]


Loaded 209300 papers
Loading scientific embedding model...
Using separation token: [SEP]
Filtering out papers with missing abstracts...
Remaining papers after filtering: 209300
Selecting required fields from papers...
Concatenating titles and abstracts...
Initializing Pinecone client...
Pinecone client initialized successfully


In [4]:

for i, chunk in tqdm(enumerate(np.array_split(df, len(df)//eb_batch_size))):
    print(f"embedding {len(chunk)} new items")
    chunk["embedding"] = chunk["concat"].progress_apply(model.encode)
    new_embeddings = [
        {
            "id": row["doi"],
            "values": row["embedding"].tolist(),
        }
        for _, row in chunk.iterrows()
    ]
    index._upsert_batch(
        new_embeddings, 
        namespace=pinecone_index_name, 
        _check_type=True
        )


  return bound(*args, **kwds)
0it [00:00, ?it/s]

embedding 501 new items


100%|██████████| 501/501 [00:09<00:00, 53.45it/s]
1it [00:34, 34.68s/it]

embedding 501 new items


100%|██████████| 501/501 [00:09<00:00, 54.24it/s]
2it [00:57, 27.49s/it]

embedding 501 new items


100%|██████████| 501/501 [00:09<00:00, 54.67it/s]
2it [01:38, 49.49s/it]


KeyboardInterrupt: 