In [2]:
from huggingface_hub import login
import os

login(token=os.getenv("HF_TOKEN"))


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Endpoint inference tests

In [12]:
import requests
import os

API_URL = "https://oaomswa3nnapa9ye.us-east-1.aws.endpoints.huggingface.cloud" + "/predict"
headers = {
	"Accept" : "application/json",
	"Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
	"Content-Type": "application/json" 
}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

output = query({
	"texts": ["Hello world!", "Hello world!"],
	#"image_urls": ["http://images.cocodataset.org/val2017/000000039769.jpg", "http://images.cocodataset.org/val2017/000000039769.jpg"]
    #"inputs": "Hello world!",
})

print(output)

{'text_embeds': [[0.6042993664741516, 0.17999663949012756, 0.022518541663885117, 1.1325193643569946, 0.02428610622882843, -0.24942457675933838, 0.5206234455108643, -0.25177648663520813, 0.13851392269134521, -0.36088094115257263, -0.21400709450244904, -0.2235926389694214, 0.31096553802490234, -0.1159488707780838, 0.5462026596069336, -0.3230665922164917, 0.6950499415397644, 0.3511008620262146, 0.08227723836898804, -1.156729817390442, 0.24186302721500397, 0.7083259224891663, -0.972000241279602, -0.6521224975585938, 0.43570488691329956, 0.5039578676223755, 0.3102993667125702, 0.0852641835808754, -0.11503564566373825, -0.43431100249290466, -0.44976696372032166, 0.2651408612728119, -0.07249829173088074, 0.0006896257400512695, -0.3844039738178253, -0.4310241937637329, 0.13241694867610931, -0.8978884816169739, -0.5947586297988892, 0.03405933082103729, -0.336961567401886, 0.07351523637771606, -0.4252229928970337, 0.21688489615917206, 0.5774582028388977, -0.7543176412582397, 0.720859706401825, 0

## Local siglip tests

In [3]:
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel, SiglipVisionModel
import torch

model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**image_inputs)

outputs.pooler_output.detach().cpu().numpy().tolist()


[[0.29939398169517517,
  -0.07202792167663574,
  -0.1986648440361023,
  0.41255509853363037,
  0.09735368192195892,
  0.1984436810016632,
  0.46492475271224976,
  -0.31624835729599,
  0.8158038854598999,
  -0.3610365092754364,
  -0.44818782806396484,
  -0.06810979545116425,
  -0.17063955962657928,
  -0.1257917284965515,
  0.39199545979499817,
  0.4342986047267914,
  -0.49896305799484253,
  -0.3607403635978699,
  0.3325474262237549,
  0.2655375599861145,
  0.4256994128227234,
  -0.24774256348609924,
  -0.4451206922531128,
  0.0223667174577713,
  0.5602694153785706,
  -0.5299941897392273,
  0.017497211694717407,
  0.1229231208562851,
  -0.26728692650794983,
  0.0016856938600540161,
  0.23768261075019836,
  -0.3837054967880249,
  0.16983851790428162,
  0.08351365476846695,
  0.1402525156736374,
  0.04013562202453613,
  -0.07839827239513397,
  -0.6842674016952515,
  -0.7755125761032104,
  -0.20998501777648926,
  -0.16587147116661072,
  0.42424100637435913,
  0.09763532876968384,
  0.006254

In [28]:
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel, SiglipVisionModel
import torch

model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

text = ["a photo of two cats", "a photo of two dogs"]  # Example text description
#text_inputs = processor(text=text, return_tensors="pt")
#image_inputs = processor(images=image, return_tensors="pt")
#inputs = {
#    "input_ids": text_inputs.input_ids,
#    "pixel_values": image_inputs.pixel_values
#}
inputs = processor(text=text, images=image, padding="max_length", return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image) # these are the probabilities
probs


tensor([[2.7010e-01, 3.5706e-05]])

In [37]:
text_inputs = processor(text=text, padding="max_length", return_tensors="pt")
text_inputs = {k: v.to(model.device) for k, v in text_inputs.items()}
text_inputs["input_ids"]

tensor([[ 262,  266, 1304,  267,  395, 6473,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1],
        [ 262,  266, 1304,  267,  395, 3014,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1]])

In [24]:
print(outputs.text_embeds.shape)
print(outputs.image_embeds.shape)


torch.Size([2, 768])
torch.Size([1, 768])


In [18]:
model.vision_model(image_inputs.pixel_values)
model.text_model(text_inputs.input_ids)


BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 1.2095,  2.4445,  1.7079,  ..., -0.4254,  0.8521, -0.1561],
         [ 1.0374,  2.2973,  1.8163,  ..., -0.4372,  0.9650, -0.2871],
         [ 0.8731,  2.2582,  1.8952,  ..., -0.1756,  0.8887, -0.0719],
         ...,
         [ 1.0471,  1.9389,  1.5319,  ..., -0.2986,  0.5190,  0.1235],
         [ 1.4815,  2.4121,  1.2461,  ..., -0.3555,  0.7368,  0.3614],
         [ 1.3595,  2.3184,  1.5740,  ..., -0.2929,  0.8301,  0.3915]],

        [[ 1.7591,  1.6410,  1.3507,  ..., -0.3207,  0.4804,  0.3052],
         [ 1.6651,  1.6291,  1.3960,  ..., -0.2886,  0.5474,  0.1474],
         [ 1.5929,  1.4785,  1.5394,  ...,  0.0273,  0.5305,  0.2490],
         ...,
         [ 1.6127,  1.4535,  1.2289,  ..., -0.1894,  0.3050,  0.4433],
         [ 1.7558,  1.6480,  1.0689,  ..., -0.3349,  0.3204,  0.3695],
         [ 1.8152,  1.7158,  1.1122,  ..., -0.3980,  0.6480,  0.6822]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.8

In [12]:
inputs

{'input_ids': tensor([[ 262,  266, 1304,  267,  395, 6473,    1],
         [ 262,  266, 1304,  267,  395, 3014,    1]]),
 'pixel_values': tensor([[[[ 0.1137,  0.1686,  0.1922,  ..., -0.1922, -0.1843, -0.1922],
           [ 0.1373,  0.1686,  0.1843,  ..., -0.1922, -0.1922, -0.2078],
           [ 0.1137,  0.1529,  0.1608,  ..., -0.2392, -0.2235, -0.2078],
           ...,
           [ 0.8431,  0.7882,  0.7255,  ...,  0.7098,  0.6549,  0.6314],
           [ 0.8275,  0.7961,  0.7725,  ...,  0.6157,  0.4902,  0.4196],
           [ 0.8275,  0.7569,  0.7647,  ...,  0.0275, -0.1059, -0.2471]],
 
          [[-0.8118, -0.8118, -0.8118,  ..., -0.8902, -0.8902, -0.8980],
           [-0.7882, -0.7882, -0.7882,  ..., -0.8824, -0.8745, -0.8824],
           [-0.8196, -0.8039, -0.7882,  ..., -0.8980, -0.8902, -0.8902],
           ...,
           [-0.2627, -0.3255, -0.3725,  ..., -0.4196, -0.4510, -0.4745],
           [-0.2627, -0.2863, -0.3412,  ..., -0.4667, -0.5373, -0.5686],
           [-0.2784, -0.3

In [8]:
probs

tensor([[4.0509e-01, 5.9897e-05]])