In [1]:
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification
import torch
import torch.nn as nn
from PIL import Image
import pathlib
from typing import *
import pandas as pd
import shutil
import os

# processor = AutoProcessor.from_pretrained("fummicc1/hiyoshi-street-clip")
processor = AutoProcessor.from_pretrained("geolocal/StreetCLIP")

# model = AutoModelForZeroShotImageClassification.from_pretrained("fummicc1/hiyoshi-street-clip")
model = AutoModelForZeroShotImageClassification.from_pretrained("geolocal/StreetCLIP")

  from .autonotebook import tqdm as notebook_tqdm
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


In [2]:
model

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

In [3]:
processor

CLIPProcessor:
- image_processor: CLIPImageProcessor {
  "crop_size": {
    "height": 336,
    "width": 336
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "feature_extractor_type": "CLIPFeatureExtractor",
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "CLIPProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 336
  }
}

- tokenizer: CLIPTokenizerFast(name_or_path='geolocal/StreetCLIP', vocab_size=49408, model_max_length=77, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=

In [4]:
town_name = "Minowacho"
predict_town = True
town = pathlib.Path(f"data/addrs/{town_name}")

postal_code_maps = {
    'Minowacho': '223-0051',
    'HiyoshiHoncho': '223-0062',
    'Hiyoshi': '223-0061',
}

hiyoshi_maps = {
    "1-chome": "〒223-0061 神奈川県横浜市港北区日吉1丁目",
    "2-chome": "〒223-0061 神奈川県横浜市港北区日吉2丁目",
    "3-chome": "〒223-0061 神奈川県横浜市港北区日吉3丁目",
    "4-chome": "〒223-0061 神奈川県横浜市港北区日吉4丁目",
    "5-chome": "〒223-0061 神奈川県横浜市港北区日吉5丁目",
    "6-chome": "〒223-0061 神奈川県横浜市港北区日吉6丁目",
    "7-chome": "〒223-0061 神奈川県横浜市港北区日吉7丁目",
}
label_maps = {
    'Hiyoshi': 'Hiyoshi',
    'Hiyoshihoncho': 'Hiyoshihonchol',
    'Minowacho': 'Minowacho',
}
labels = []

if predict_town:
    labels = list(label_maps.values())
else:
    pass
    # for folder in sorted(list(town.iterdir())):
    #     name = folder.name
    #     # if maps.get(name) is not None:
    #         # labels.append(maps[name])
    #     if name not in hiyoshi_maps:
    #         continue
    #     labels.append(name)
print(labels)

['Hiyoshi', 'Hiyoshihonchol', 'Minowacho']


In [5]:
def predict(labels: List[str], img: torch.Tensor):
    inputs = processor(
        text=labels,
        images=img,
        return_tensors="pt",
        padding=True
    )
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image # this is the image-text similarity score
    probs = logits_per_image.softmax(dim=1)
    print("probs:", probs)
    index = torch.argmax(probs, dim=1).item()
    return labels[index]

In [6]:

path = pathlib.Path(f"data/town/train/{town_name}")

acc = 0

paths = list(path.iterdir())[100:]

for img_path in paths:
    img = Image.open(img_path.as_posix()).convert("RGB").resize((336, 336))
    print("img:", img_path.name)
    ret = predict(labels, img=img)
    print("result", ret)
    if ret == label_maps[town_name]:
        acc += 1
acc /= len(paths)
acc

img: 3554774315-13964728586.png
probs: tensor([[0.6682, 0.3175, 0.0143]], grad_fn=<SoftmaxBackward0>)
result Hiyoshi
img: 3555130085-13964728586.png
probs: tensor([[0.5650, 0.4200, 0.0150]], grad_fn=<SoftmaxBackward0>)
result Hiyoshi
img: 3554571017-13964117957.png
probs: tensor([[0.4413, 0.5258, 0.0329]], grad_fn=<SoftmaxBackward0>)
result Hiyoshihonchol
img: 3554977612-13964474157.png
probs: tensor([[0.3981, 0.5695, 0.0324]], grad_fn=<SoftmaxBackward0>)
result Hiyoshihonchol
img: 3554418544-13964117957.png
probs: tensor([[0.2904, 0.6912, 0.0184]], grad_fn=<SoftmaxBackward0>)
result Hiyoshihonchol
img: 3554266071-13964575929.png
probs: tensor([[0.0606, 0.8991, 0.0403]], grad_fn=<SoftmaxBackward0>)
result Hiyoshihonchol
img: 3554316895-13964423271.png
probs: tensor([[0.1747, 0.7895, 0.0358]], grad_fn=<SoftmaxBackward0>)
result Hiyoshihonchol
img: 3554621841-13964525043.png
probs: tensor([[0.4582, 0.4833, 0.0585]], grad_fn=<SoftmaxBackward0>)
result Hiyoshihonchol
img: 3554520193-139646

0.0

# Minowacho

## ACCURACY

|pretrained|finetuning|
|---|---|
|0.0|0.06164383561643835|

## F1

|pretrained|finetuning|
|---|---|
|||

# Hiyoshi

## ACCURACY

|pretrained|finetuning|
|---|---|
|||

## F1

|pretrained|finetuning|
|---|---|
|||

# Hiyochihoncho

## ACCURACY

|pretrained|finetuning|
|---|---|
|||

## F1

|pretrained|finetuning|
|---|---|
|||