In [1]:
import os
import sys
sys.path.append("..")
from itertools import batched
import glob
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from app import app 

tqdm.pandas()

In [7]:

#### manually configurable parameters
eb_batch_size = 500 # how many papers to embed per batch
up_batch_size = 800 # how many papers to upload per batch

data_path = "../data/publishers/nature_filtered"

device = app.config["DEVICE"]
scientific_embedding_model = app.config["SCI_EMB_MODEL"]
pinecone_index_name = "nature_filtered_200K"


In [3]:
print("Reading available data files...")
pickle_files = glob.glob(os.path.join(data_path,"*.pkl"))
print("Loading scientific embedding model...")
model = SentenceTransformer(scientific_embedding_model, device=device)
separation_token = model.tokenizer.sep_token
print(f"Using separation token: {separation_token}")
print("Initializing Pinecone client...")
pc = Pinecone(api_key=app.config["PINECONE_KEY"])
index = pc.Index(host=app.config["PINECONE_HOST"])
print("Pinecone client initialized successfully")


Reading available data files...
Loading scientific embedding model...
Using separation token: [SEP]
Initializing Pinecone client...
Pinecone client initialized successfully


In [4]:
totdf = pd.DataFrame()

for pickle in pickle_files:
    
    print(f"Reading data from {pickle}...")
    df = pd.read_pickle(pickle).drop_duplicates(subset=["doi"])
    print(f"Loaded {len(df)} papers")
    print("Concatenating titles and abstracts...")
    df["concat"] = df["title"] + separation_token + df["abstract"]
    print("Embedding papers")
    df["embedding"] = df["concat"].progress_apply(model.encode)    
    # Create batch for upload
    totdf = pd.concat([totdf, df])

Reading data from ../data/publishers/nature_filtered/data_slice_7.pkl...
Loaded 670 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/670 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_14.pkl...
Loaded 2565 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/2565 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_12.pkl...
Loaded 2409 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/2409 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_2.pkl...
Loaded 311 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/311 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_1.pkl...
Loaded 271 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/271 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_0.pkl...
Loaded 247 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/247 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_4.pkl...
Loaded 422 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/422 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_5.pkl...
Loaded 472 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/472 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_13.pkl...
Loaded 2408 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/2408 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_10.pkl...
Loaded 1568 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/1568 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_9.pkl...
Loaded 1208 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/1208 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_17.pkl...
Loaded 12 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/12 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_8.pkl...
Loaded 847 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/847 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_18.pkl...
Loaded 15 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/15 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_6.pkl...
Loaded 495 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/495 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_15.pkl...
Loaded 3124 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/3124 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_16.pkl...
Loaded 1550 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/1550 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_3.pkl...
Loaded 366 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/366 [00:00<?, ?it/s]

Reading data from ../data/publishers/nature_filtered/data_slice_11.pkl...
Loaded 1967 papers
Concatenating titles and abstracts...
Embedding papers


  0%|          | 0/1967 [00:00<?, ?it/s]

In [17]:
new_embeddings = [
        {
            "id": row["doi"],
            "values": row["embedding"].tolist(),
        }
        for _, row in totdf.dropna(subset=["doi"]).iterrows()
    ]    

In [18]:
for batch in tqdm(batched(new_embeddings, up_batch_size), total=len(new_embeddings)//up_batch_size+1):
    response = index.upsert(
            vectors=batch, 
            namespace=pinecone_index_name, 
            _check_type=True
        )
    print(response)

  0%|          | 0/27 [00:00<?, ?it/s]

{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 800}
{'upserted_count': 125}


In [None]:
totdf.to_pickle("totdf.pkl")