In [1]:
import os
import sys

sys.path.append("..")

In [2]:
from tqdm import tqdm
tqdm.pandas()
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from app import app

import glob


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#### manually configurable parameters
eb_batch_size = 500 # how many papers to embed per batch
up_batch_size = eb_batch_size # how many papers to upload per batch

In [4]:
def specter_embed(
        df: pd.DataFrame,
    ):
    """ Get a specter embedding on a dataframe of OpenAlex results """
    model = SentenceTransformer(app.config["SCI_EMB_MODEL"], device=app.config["DEVICE"])
    separation_token = model.tokenizer.sep_token
    # data is presumed to be in the openalex format
    df = df[df["abstract"] != "MISSING_ABSTRACT"].reset_index(drop=True)
    df = df[app.config["OPENALEX_PAPER_FIELDS"]]
    df["concat"] = df["title"] + separation_token + df["abstract"]
    df["embedding"] = df["concat"].progress_apply(model.embed)
    return df 

def read_data(path:str) -> pd.DataFrame: 
    """ Reads pickled dataframes from a path """
    pickle_files = glob.glob(os.path.join(path,"*.pkl"))
    print(pickle_files)
    df = pd.DataFrame()
    for pickle in tqdm(pickle_files):
        new = pd.read_pickle(pickle)
        print(f"file {pickle} has {len(new)} new elements")
        df = pd.concat([df, new])
    return df

In [5]:
df = read_data("/home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered")

['/home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_8.pkl', '/home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_18.pkl', '/home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_11.pkl', '/home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_5.pkl', '/home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_9.pkl', '/home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_14.pkl', '/home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_15.pkl', '/home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_3.pkl', '/home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_10.pkl', '/home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_13.pkl', '/home/marco/

  5%|▌         | 1/19 [00:01<00:22,  1.24s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_8.pkl has 8490 new elements
file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_18.pkl has 150 new elements


 16%|█▌        | 3/19 [00:04<00:23,  1.50s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_11.pkl has 19670 new elements


 21%|██        | 4/19 [00:05<00:19,  1.28s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_5.pkl has 4720 new elements


 26%|██▋       | 5/19 [00:06<00:19,  1.42s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_9.pkl has 12080 new elements


 32%|███▏      | 6/19 [00:13<00:37,  2.91s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_14.pkl has 25650 new elements


 37%|███▋      | 7/19 [00:17<00:41,  3.43s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_15.pkl has 31240 new elements


 42%|████▏     | 8/19 [00:20<00:36,  3.35s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_3.pkl has 3660 new elements


 47%|████▋     | 9/19 [00:22<00:29,  2.91s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_10.pkl has 15680 new elements


 53%|█████▎    | 10/19 [00:25<00:26,  2.89s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_13.pkl has 24080 new elements
file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_16.pkl has 15500 new elements


 63%|██████▎   | 12/19 [00:35<00:25,  3.61s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_7.pkl has 6700 new elements
file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_17.pkl has 120 new elements


 74%|███████▎  | 14/19 [00:37<00:12,  2.47s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_1.pkl has 2710 new elements


 79%|███████▉  | 15/19 [00:40<00:10,  2.73s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_0.pkl has 2470 new elements


 84%|████████▍ | 16/19 [00:42<00:06,  2.31s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_4.pkl has 4220 new elements


 89%|████████▉ | 17/19 [00:44<00:04,  2.17s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_6.pkl has 4960 new elements


 95%|█████████▍| 18/19 [00:45<00:01,  1.93s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_2.pkl has 3110 new elements


100%|██████████| 19/19 [00:52<00:00,  2.76s/it]

file /home/marco/Desktop/Coding/Oshima/refBro-main/data/publishers/nature_filtered/data_slice_12.pkl has 24090 new elements





In [6]:
model = SentenceTransformer(app.config["SCI_EMB_MODEL"], device=app.config["DEVICE"])

In [7]:
separation_token = model.tokenizer.sep_token
# data is presumed to be in the openalex format
df = df[df["abstract"] != "MISSING_ABSTRACT"].reset_index(drop=True)
df = df[["abstract",*app.config["OPENALEX_PAPER_FIELDS"].split(",")]]
df["concat"] = df["title"] + separation_token + df["abstract"]

In [14]:
pc = Pinecone(api_key=app.config["PINECONE_KEY"])
index = pc.Index(host=app.config["PINECONE_HOST"])

for i, chunk in tqdm(enumerate(np.array_split(df, len(df) // eb_batch_size))):
    print(len(chunk))
    chunk["embedding"] = chunk["concat"].progress_apply(model.encode)
    embeddings = [
            {
                "id": row["doi"],
                "values": row["embedding"].tolist(),
            }
            for _, row in chunk.iterrows()
        ]
    index._upsert_batch(embeddings, namespace="nature_filtered", _check_type=True)


  return bound(*args, **kwds)


501


100%|██████████| 501/501 [00:08<00:00, 58.60it/s]
