In [1]:
%load_ext autoreload
%autoreload 2

import os
os.chdir("/workspaces/OmicsCopilot/src/server")

In [11]:
import torch
from transformers import AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from optimum.onnxruntime import ORTModelForFeatureExtraction

from pydantic import BaseModel, Field
from typing import Optional

import data.load

In [3]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('models/BAAI/bge-small-en-v1.5-onnx-O4')
ort_model = ORTModelForFeatureExtraction.from_pretrained('models/BAAI/bge-small-en-v1.5-onnx-O4', provider="CUDAExecutionProvider")

# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to("cuda")

# Compute token embeddings
model_output = ort_model(**encoded_input)

# Perform pooling. In this case, cls pooling.
sentence_embeddings = model_output[0][:, 0]

# normalize embeddings
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[-3.3973e-02, -5.6586e-04,  3.1019e-02, -2.4794e-02, -1.4177e-02,
         -2.3554e-02,  3.8484e-02,  3.4079e-02,  5.4125e-02, -2.4609e-02,
         -1.7066e-02, -9.3637e-04, -1.2509e-02, -3.1339e-03,  4.5843e-02,
          2.2921e-02, -2.8289e-03, -6.9041e-03, -6.2671e-02,  1.0874e-02,
          8.1873e-02,  4.2244e-04, -1.5589e-02,  2.4227e-02, -2.8118e-02,
          5.4896e-03,  8.2625e-03, -1.1408e-02,  3.1810e-02, -5.2701e-02,
         -2.9779e-02, -1.1553e-02, -6.4227e-03,  1.5681e-02,  2.8830e-02,
          3.2654e-02, -6.8263e-02,  1.7395e-02, -8.2453e-02,  1.7053e-02,
          4.5368e-02,  2.6865e-02, -3.6795e-03,  2.8645e-02, -1.2727e-02,
         -2.6733e-02, -3.5081e-02, -3.4105e-02,  1.3175e-02, -3.1362e-02,
         -1.9585e-02, -1.2272e-02, -3.9532e-03, -6.9001e-02,  1.6568e-03,
          5.0986e-02,  4.8454e-02,  2.8381e-02,  1.8609e-02,  1.1230e-02,
          5.2595e-02, -1.1500e-02, -1.6438e-01,  5.2424e-03,  2.2789e-02,
          4.9615e

2023-11-26 13:18:32.575404481 [W:onnxruntime:, session_state.cc:1162 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-11-26 13:18:32.575419138 [W:onnxruntime:, session_state.cc:1164 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


In [4]:
sentence_embeddings.shape

torch.Size([2, 384])

In [5]:
extractor = pipeline(
    'feature-extraction', 
    model=ort_model, 
    tokenizer=tokenizer, 
    device=0,
    padding=True,
    truncation=True,
    return_tensors=True,
    framework="pt",
    batch_size=8
)

# extractor(sentences) -> need CLS pooling
extractor(sentences[0])[:, 0].numpy() == model_output[0][0, 0].cpu().numpy()

2023-11-26 13:18:41.629139453 [W:onnxruntime:, session_state.cc:1162 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-11-26 13:18:41.629151961 [W:onnxruntime:, session_state.cc:1164 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
      

In [20]:
examples = data.load.load_test_sentences()

class Node(BaseModel):
    name: str
    description: str
    embedding: Optional[list] = None
    author: str
    year: int

    def __getitem__(self, key):
        return getattr(self, key)

def generator():
    for x in data.load.load_test_sentences():
        yield Node(**x)

ds = KeyDataset(generator(), "description")

# nodes 

# for d in generator():
#     print(d)
#     break

In [21]:
for x in ds:
    print(x)
    break

TypeError: 'generator' object is not subscriptable

In [None]:
# TODO: stuff to look into
#     - implementing cls and mean pooling -> what is the difference?
#       - which works better?
#     - how to use the pipeline to get the embeddings
#     - figure out best way to get embeddings for a set of Node objects, 
#       - need to be able to update the Node object with the embeddings
#       - Node.upsert_embedding(db) -> store in db??