In [2]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
from tqdm import tqdm
from os import listdir
from os.path import isfile, join
import ast

In [3]:
# Load the SPECTER tokenizer and encoder from the same checkpoint name
SPECTER_CHECKPOINT = 'allenai/specter'
tokenizer = AutoTokenizer.from_pretrained(SPECTER_CHECKPOINT)
model = AutoModel.from_pretrained(SPECTER_CHECKPOINT)

In [4]:
# Two toy records (title + abstract) used to try the pipeline on a tiny batch.
papers = [
    dict(
        title='BERT',
        abstract='We introduce a new language representation model called BERT',
    ),
    dict(
        title='Attention is all you need',
        abstract=' The dominant sequence transduction models are based on complex recurrent or convolutional neural networks',
    ),
]

In [5]:
# Build one "title<sep>abstract" string per paper; a missing or empty
# abstract contributes an empty string after the separator.
title_abs = []
for d in papers:
    title_and_sep = d['title'] + tokenizer.sep_token
    title_abs.append(title_and_sep + (d.get('abstract') or ''))
title_abs

['BERT[SEP]We introduce a new language representation model called BERT',
 'Attention is all you need[SEP] The dominant sequence transduction models are based on complex recurrent or convolutional neural networks']

In [6]:
# Tokenize the whole batch with padding and truncation enabled, capped at
# 512 tokens, asking for "pt" tensors.
inputs = tokenizer(
    title_abs,
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=512,
)

In [7]:
# Run the model on the tokenized batch (the tokenizer output is unpacked as keyword arguments).
result = model(**inputs)

In [8]:
# Use the last-layer hidden state at sequence position 0 as each item's embedding
# (presumably the [CLS] token — confirm against the tokenizer).
embeddings = result.last_hidden_state[:, 0, :]

In [9]:
# Inspect the size of a single embedding vector (the second item in the batch).
embeddings[1].shape

torch.Size([768])

In [10]:
# Load the pickled paper-id mapping and unwrap it with .tolist(); later cells use it
# as a dict (.keys() / .items()) whose values serve as row indices.
# NOTE: allow_pickle=True unpickles the file, so only load it from a trusted source.
paper_mappings = np.load('/mnt/nvme14/IGB260M/experimental/processed/paper/paper_id_index_mapping.npy', allow_pickle=True).tolist()

In [11]:
# Ids of the papers we need, kept in a set for the membership tests in the scans below.
paper_ids = set(paper_mappings.keys())

In [12]:
# Sanity check: number of distinct paper ids to look up.
len(paper_ids)

100000

In [13]:
# Scan the tab-separated Papers.txt dump and keep column 4 (used as the title)
# for every row whose first column is one of the wanted paper ids.
paper_file = '/mnt/nvme6/gnndataset/raw_dataset/Papers.txt'
paper_data_collection = {}
with open(paper_file) as paper:
    # total is a hard-coded line count, used only to size the progress bar
    for row in tqdm(paper, total=266988075):
        fields = row.split("\t")
        paper_title, paper_id = fields[4], int(fields[0])
        if paper_id not in paper_ids:
            continue
        paper_data_collection[paper_id] = paper_title


100%|██████████| 266988075/266988075 [11:47<00:00, 377418.94it/s]


In [14]:
# Sanity check: number of papers for which a title was collected.
len(paper_data_collection)

100000

In [15]:
def inv_idx_2_txt(data):
    """Rebuild a plain-text string from an inverted-index record.

    Args:
        data: Mapping with two keys: "InvertedIndex" (token -> iterable of
            positions where that token occurs) and "IndexLength" (number of
            positions; any value accepted by int()).

    Returns:
        The tokens placed at their positions and joined with single spaces.
        Positions that no token claims stay empty strings, so gaps appear as
        consecutive spaces.

    Raises:
        IndexError: if a position does not fit in a list of IndexLength slots.
    """
    inverted = data["InvertedIndex"]
    slots = [""] * int(data["IndexLength"])
    # Flatten token -> positions into (position, token) pairs, preserving dict order.
    placements = (
        (pos, token)
        for token, positions in inverted.items()
        for pos in positions
    )
    for pos, token in placements:
        slots[pos] = token
    return " ".join(slots)

In [16]:
# Collect the abstract shards: regular files in `path` whose name starts with
# 'PaperAbstractsInvertedIndex.txt'.
path = '/mnt/nvme6/gnndataset/raw_dataset/nlp'
files = []
for entry in listdir(path):
    if not isfile(join(path, entry)):
        continue
    if entry.startswith('PaperAbstractsInvertedIndex.txt'):
        files.append(entry)

In [17]:
# Append each wanted paper's abstract to its title as "title<sep>abstract".
# Each shard is tab-separated: column 0 is the paper id and column 1 is a
# literal that ast.literal_eval turns into the inverted-index record.
#
# NOTE: this cell extends paper_data_collection in place, so running it twice
# appends the abstract twice; rebuild the titles before re-running it.
exception_count = 0  # rows skipped because they could not be parsed
for filename in files:
    with open(join(path, filename)) as infile:
        # total is a hard-coded per-shard line count, used only to size the progress bar
        for line in tqdm(infile, total=31497025, desc=filename):
            cols = line.split("\t")
            try:
                paper_id = int(cols[0])
                if paper_id not in paper_ids:
                    continue
                data = ast.literal_eval(cols[1])
                # An empty-string payload means there is no abstract: keep only the separator.
                abstract = inv_idx_2_txt(data) if data != '' else ''
            except (ValueError, SyntaxError, TypeError, KeyError, IndexError, AttributeError):
                # Count and skip a malformed row instead of aborting the whole multi-minute scan.
                exception_count += 1
                continue
            paper_data_collection[paper_id] = paper_data_collection[paper_id] + tokenizer.sep_token + abstract
print("Len: ", len(paper_data_collection))

100%|█████████▉| 31497024/31497025 [03:15<00:00, 161191.12it/s]
100%|██████████| 31497025/31497025 [03:13<00:00, 162993.95it/s]
100%|█████████▉| 31497024/31497025 [03:19<00:00, 157999.79it/s]
100%|█████████▉| 31497024/31497025 [03:14<00:00, 161603.71it/s]
100%|█████████▉| 31497024/31497025 [03:19<00:00, 158162.78it/s]

Len:  100000





In [18]:
# Sanity check: the abstract pass should not have changed the number of papers.
len(paper_data_collection)

100000

In [19]:
# Display the exception counter that was initialised before the abstract-loading loop.
exception_count

0

In [20]:
# Parallel lists of paper ids and their text, in the dict's iteration order.
papaer_emb_ids = [pid for pid in paper_data_collection.keys()]
papaer_emb_data = [text for text in paper_data_collection.values()]

In [27]:
import torch
# Use GPU index 4 when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:4'
else:
    device = 'cpu'

In [32]:
# Reload the SPECTER encoder, then move it onto the selected device.
model = AutoModel.from_pretrained('allenai/specter')
model = model.to(device)

In [40]:
# Embed every paper with the model, one text per forward pass, keyed by paper id.
paper_specter_emb = dict()
# Inference only: with autograd disabled no graph is recorded for the forward
# passes, which saves memory and time over the 100k iterations.
with torch.no_grad():
    for paper_id, paper_data in tqdm(paper_data_collection.items()):
        inputs = tokenizer(paper_data, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)
        result = model(**inputs)
        # The hidden state at sequence position 0 is used as the paper embedding.
        embedding = result.last_hidden_state[:, 0, :]
        # Store on the CPU so embeddings do not pile up in device memory;
        # squeeze() drops the singleton batch dimension.
        paper_specter_emb[paper_id] = embedding.detach().cpu().squeeze()


100%|██████████| 100000/100000 [16:51<00:00, 98.91it/s]


In [42]:
# Invert paper_id -> index into index -> paper_id.
idx_to_paper_mapping = {idx: paper_id for paper_id, idx in paper_mappings.items()}

In [43]:
# Order the embeddings by index so that entry i belongs to the paper mapped to index i.
# The loop bound comes from the mapping itself rather than a hard-coded 100000;
# like before, it relies on the indices being exactly 0..N-1.
final_embedding_needed_to_be_saved = []
for i in range(len(idx_to_paper_mapping)):
    paper_id = idx_to_paper_mapping[i]
    final_embedding_needed_to_be_saved.append(paper_specter_emb[paper_id])

In [46]:
# Sanity check: one embedding per index.
len(final_embedding_needed_to_be_saved)

100000

In [48]:
import numpy

In [49]:
# Convert each stored tensor to a NumPy array, keeping the index order.
final_embedding_needed_to_be_saved_numpy = [
    tensor.numpy() for tensor in final_embedding_needed_to_be_saved
]

In [50]:
# Write the index-ordered embeddings to node_feat_specter.npy (relative path, so it
# lands in the current working directory).
np.save('node_feat_specter.npy', final_embedding_needed_to_be_saved_numpy)

In [66]:
from sentence_transformers import SentenceTransformer
# Switch to the all-mpnet-base-v2 SentenceTransformer on GPU index 0 (CPU when
# CUDA is unavailable). This rebinds `device` and `model`, replacing the
# SPECTER ones used earlier.
MODEL = 'sentence-transformers/all-mpnet-base-v2'
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer(MODEL)
model = model.to(device)

In [67]:
# Show which device was selected.
device

'cuda:0'

In [68]:
# Persist the id -> text dict next to the notebook (relative path).
# NOTE(review): a dict is not a numeric array, so this presumably has to be read back
# the same way as the mapping file earlier: np.load(..., allow_pickle=True).tolist() — confirm.
np.save('paper_data_collection.npy', paper_data_collection)

In [70]:
# Smoke test of model.encode on a single paper; results are keyed by paper id.
# NOTE(review): the recorded run of this cell ended in a CUDA "no kernel image is
# available" RuntimeError, so no embedding was produced.
paper_768_emb = dict()
for paper_id, paper_data in tqdm(paper_data_collection.items()):
    # Debug aid: show the exact text being encoded.
    print([paper_data])
    # encode() is given a one-element list, so `emb` holds the result for that single text.
    emb = model.encode([paper_data])
    paper_768_emb[paper_id] = emb
    # Stop after the first paper; this break (and the print above) must go for a full run.
    break

  0%|          | 0/100000 [00:00<?, ?it/s]

['matching chelators to radiometals for radiopharmaceuticals[SEP]Radiometals comprise many useful radioactive isotopes of various metallic elements. When properly harnessed, these have valuable emission properties that can be used for diagnostic imaging techniques, such as single photon emission computed tomography (SPECT, e.g.67Ga, 99mTc, 111In, 177Lu) and positron emission tomography (PET, e.g.68Ga, 64Cu, 44Sc, 86Y, 89Zr), as well as therapeutic applications (e.g.47Sc, 114mIn, 177Lu, 90Y, 212/213Bi, 212Pb, 225Ac, 186/188Re). A fundamental critical component of a radiometal-based radiopharmaceutical is the chelator, the ligand system that binds the radiometal ion in a tight stable coordination complex so that it can be properly directed to a desirable molecular target in vivo. This article is a guide for selecting the optimal match between chelator and radiometal for use in these systems. The article briefly introduces a selection of relevant and high impact radiometals, and their pot




RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
# NOTE(review): the recorded output of this cell is a NameError — `embeddings` did not
# exist in the kernel session that ran it. Presumably this was meant to inspect a
# collection such as paper_768_emb; confirm the intent or delete the cell.
len(embeddings)

NameError: name 'embeddings' is not defined