In [46]:
import os

import pandas as pd
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Environment versions
# 1. python : 3.11.5
# 2. pytorch : 2.2.0(cpu)
# 3. transformers : 4.32.1
# 4. Pillow : 10.0.1

# Load the CLIP processor and model; both come from the same checkpoint id.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

# Given an image path and a list of candidate text labels, predict the label
# that best matches the image.
async def predict_text_from_image(path, text_labels):
    """Return the candidate label that CLIP scores highest for one image.

    The function is declared ``async`` (callers ``await`` it) but performs no
    awaits itself; the whole body runs synchronously.

    Args:
        path: Path of the image file to classify.
        text_labels: Sequence of candidate label strings.

    Returns:
        A tuple ``(predicted_label, probability)`` where ``predicted_label`` is
        the entry of ``text_labels`` with the highest softmax score and
        ``probability`` is that score (a NumPy scalar).

    Raises:
        ValueError: If ``text_labels`` is empty.
    """
    if len(text_labels) == 0:
        raise ValueError("text_labels must contain at least one label")

    # Open the image in a context manager so the file handle is released;
    # the processor reads the pixel data while the file is still open.
    with Image.open(path) as image:
        inputs = processor(text=text_labels, images=image, return_tensors="pt", padding=True)

    # Score the image against every label; no gradients are needed for inference.
    with torch.inference_mode():
        outputs = clip_model(**inputs)

    # logits_per_image has one row per image; softmax over the label axis.
    probs = outputs.logits_per_image.softmax(dim=1).detach().cpu().numpy()

    # Exactly one image was passed, so work on its row explicitly instead of
    # relying on argmax over the flattened 2-D array.
    image_probs = probs[0]
    max_index = int(image_probs.argmax())

    return text_labels[max_index], image_probs[max_index]

In [3]:
## 한국어
label, prob = await predict_text_from_image("구글_고양이.jpg", ['고양이', 'cat'])
print(f"라벨 : {label}")
print(f"확률 : {prob}")

라벨 : cat
확률 : 0.9729471206665039


In [4]:
## 영어
label, prob = await predict_text_from_image("google_cat.jpg", ['고양이', 'cat'])
print(f"라벨 : {label}")
print(f"확률 : {prob}")

라벨 : cat
확률 : 0.9969338178634644


In [42]:
# Alternative English wording for the candidate labels (e.g. "brook", "dry beach",
# "notebook computer", "handbag"). Fixed the misspelled label "sweat potato".
not_best_englisth = [
    "dog", "cat", "chicken", "pig", "horse", "fish", "snake", "magpie",
    "cow", "elephant", "rabbit", "tiger", "frog", "turtle", "whale", "bear",
    "lion", "sheep", "duck", "monkey", "rat", "giraffe", "camel", "wolf",
    "toad", "chick", "pigeon", "fox", "goat", "swallow", "sparrow", "penguin",
    "noodle", "candy", "tree", "flower", "soap", "gas range", "humidifier", "scissors",
    "knife", "potato", "sweet potato", "corn", "ricecake", "cucumber", "brook", "dry beach",
    "apple", "ship", "car", "plane", "computer", "notebook computer", "television", "banana",
    "iron", "step", "stone", "mountain", "sea", "lunch box", "keyboard", "mouse",
    "plastic bag", "cup", "glasses", "sprayer", "wire", "multitap", "bottle", "handbag",
    "backpack", "paper bag", "cell phone", "bowl", "plate", "sand", "towel", "bodywash",
    "cloud", "tablet", "microwave", "city", "countryside", "needle", "box", "person",
    "hand", "foot", "clothing", "footwear", "hat", "makeup", "pen", "earphone",
    "soup", "kimchi", "meat", "rice", "bread", "fried food", "fan", "sausage",
    "air conditioner", "dresser", "sink", "shower head", "toilet", "washing machine", "chair", "table",
    "can", "clock", "watch", "refrigerator", "egg", "remote control", "nail clipper", "hair drier",
    "razor", "doll", "book", "fork", "spoon", "chopsticks", "cake", "shampoo",
    "paper towel",
]

In [43]:
# English candidate labels. Fixed two misspelled labels that were being fed to
# the model: "sweat potato" -> "sweet potato" and "labtop" -> "laptop".
english = [
    "dog", "cat", "chicken", "pig", "horse", "fish", "snake", "magpie",
    "cow", "elephant", "rabbit", "tiger", "frog", "turtle", "whale", "bear",
    "lion", "sheep", "duck", "monkey", "rat", "giraffe", "camel", "wolf",
    "toad", "chick", "pigeon", "fox", "goat", "swallow", "sparrow", "penguin",
    "noodle", "candy", "tree", "flower", "soap", "gas range", "humidifier", "scissors",
    "knife", "potato", "sweet potato", "corn", "ricecake", "cucumber", "creek", "mud flats",
    "apple", "ship", "car", "plane", "computer", "laptop", "tv", "banana",
    "steam iron", "stair", "stone", "mountain", "sea", "lunch box", "keyboard", "mouse",
    "plastic bag", "cup", "glasses", "sprayer", "wire", "multitap", "bottle", "purse",
    "backpack", "paper bag", "cell phone", "bowl", "plate", "sand", "towel", "bodywash",
    "cloud", "tablet", "microwave", "city", "countryside", "needle", "box", "person",
    "hand", "foot", "clothing", "shoes", "hat", "cosmetics", "pen", "earphone",
    "soup", "kimchi", "meat", "rice", "bread", "fried food", "fan", "sausage",
    "air conditioner", "drawer", "sink", "shower head", "toilet", "washing machine", "chair", "table",
    "can", "clock", "watch", "refrigerator", "egg", "remote control", "nail clipper", "hair drier",
    "razor", "doll", "book", "fork", "spoon", "chopsticks", "cake", "shampoo",
    "paper towel",
]

In [44]:
# Korean candidate labels, in the same order as the `english` list.
# The original list stopped at "샴푸" (shampoo) and had no counterpart for the
# final English label "paper towel", so that class could never be predicted
# with Korean labels. "키친타월" is appended to restore the 1:1 correspondence.
# NOTE(review): confirm "키친타월" is the intended Korean wording for "paper towel".
# NOTE(review): "케잌" is a non-standard spelling of "케이크"; left unchanged here.
korean = [
    "개", "고양이", "닭", "돼지", "말", "물고기", "뱀", "까치",
    "소", "코끼리", "토끼", "호랑이", "개구리", "거북이", "고래", "곰",
    "사자", "양", "오리", "원숭이", "쥐", "기린", "낙타", "늑대",
    "두꺼비", "병아리", "비둘기", "여우", "염소", "제비", "참새", "펭귄",
    "국수", "사탕", "나무", "꽃", "비누", "가스레인지", "가습기", "가위",
    "칼", "감자", "고구마", "옥수수", "떡", "오이", "개천", "갯벌",
    "사과", "배", "차", "비행기", "컴퓨터", "노트북", "텔레비전", "바나나",
    "다리미", "계단", "돌", "산", "바다", "도시락", "키보드", "마우스",
    "비닐봉투", "컵", "안경", "분무기", "전선", "멀티탭", "병", "손가방",
    "가방", "종이가방", "핸드폰", "그릇", "접시", "모래", "수건", "바디워시",
    "구름", "태블릿", "전자레인지", "도시", "시골", "바늘", "박스", "사람",
    "손", "발", "옷", "신발", "모자", "화장품", "필기구", "이어폰",
    "국", "김치", "고기", "밥", "빵", "튀김", "선풍기", "소시지",
    "에어컨", "서랍장", "싱크대", "샤워기", "변기", "세탁기", "의자", "탁자",
    "캔", "시계", "손목시계", "냉장고", "달걀", "리모컨", "손톱깎이", "헤어드라이어",
    "면도기", "인형", "책", "포크", "숟가락", "젓가락", "케잌", "샴푸",
    "키친타월",
]

In [45]:
# Prompt-style labels: every entry of `english` prefixed with "a photo of ".
# Derived from `english` instead of a hand-copied list, so the two cannot drift
# apart; this also removes the string literal that was broken across two lines
# ("a photo of " / "bread") in the copied version.
photoofenglish = [f"a photo of {english_label}" for english_label in english]

In [47]:
path = os.path.join(os.getcwd(), '데이터')
eng_result = pd.DataFrame(columns=['jpg','label', 'prob'])
file_list = os.listdir(path)
file_number = len(file_list)

for idx, i in enumerate(file_list):
    label, prob = await predict_text_from_image(os.path.join(path, i), english)
    eng_result.loc[idx] = [i, label, prob]
    eng_result.to_csv('eng_result.csv', encoding='cp949')
    print(round((idx+1) / file_number,3))

0.008
0.016
0.023
0.031
0.039
0.047
0.054
0.062
0.07
0.078
0.085
0.093
0.101
0.109
0.116
0.124
0.132
0.14
0.147
0.155
0.163
0.171
0.178
0.186
0.194
0.202
0.209
0.217
0.225
0.233
0.24
0.248
0.256
0.264
0.271
0.279
0.287
0.295
0.302
0.31
0.318
0.326
0.333
0.341
0.349
0.357
0.364
0.372
0.38
0.388
0.395
0.403
0.411
0.419
0.426
0.434
0.442
0.45
0.457
0.465
0.473
0.481
0.488
0.496
0.504
0.512
0.519
0.527
0.535
0.543
0.55
0.558
0.566
0.574
0.581
0.589
0.597
0.605
0.612
0.62
0.628
0.636
0.643
0.651
0.659
0.667
0.674
0.682
0.69
0.698
0.705
0.713
0.721
0.729
0.736
0.744
0.752
0.76
0.767
0.775
0.783
0.791
0.798
0.806
0.814
0.822
0.829
0.837
0.845
0.853
0.86
0.868
0.876
0.884
0.891
0.899
0.907
0.915
0.922
0.93
0.938
0.946
0.953
0.961
0.969
0.977
0.984
0.992
1.0


In [48]:
path = os.path.join(os.getcwd(), '데이터')
kor_result = pd.DataFrame(columns=['jpg','label', 'prob'])
file_list = os.listdir(path)
file_number = len(file_list)

for idx, i in enumerate(file_list):
    label, prob = await predict_text_from_image(os.path.join(path, i), korean)
    eng_result.loc[idx] = [i, label, prob]
    eng_result.to_csv('kor_result.csv', encoding='cp949')
    print(round((idx+1) / file_number,3))

0.008
0.016
0.023
0.031
0.039
0.047
0.054
0.062
0.07
0.078
0.085
0.093
0.101
0.109
0.116
0.124
0.132
0.14
0.147
0.155
0.163
0.171
0.178
0.186
0.194
0.202
0.209
0.217
0.225
0.233
0.24
0.248
0.256
0.264
0.271
0.279
0.287
0.295
0.302
0.31
0.318
0.326
0.333
0.341
0.349
0.357
0.364
0.372
0.38
0.388
0.395
0.403
0.411
0.419
0.426
0.434
0.442
0.45
0.457
0.465
0.473
0.481
0.488
0.496
0.504
0.512
0.519
0.527
0.535
0.543
0.55
0.558
0.566
0.574
0.581
0.589
0.597
0.605
0.612
0.62
0.628
0.636
0.643
0.651
0.659
0.667
0.674
0.682
0.69
0.698
0.705
0.713
0.721
0.729
0.736
0.744
0.752
0.76
0.767
0.775
0.783
0.791
0.798
0.806
0.814
0.822
0.829
0.837
0.845
0.853
0.86
0.868
0.876
0.884
0.891
0.899
0.907
0.915
0.922
0.93
0.938
0.946
0.953
0.961
0.969
0.977
0.984
0.992
1.0


In [49]:
path = os.path.join(os.getcwd(), '데이터')
eng_result = pd.DataFrame(columns=['jpg','label', 'prob'])
file_list = os.listdir(path)
file_number = len(file_list)

for idx, i in enumerate(file_list):
    label, prob = await predict_text_from_image(os.path.join(path, i), not_best_englisth)
    eng_result.loc[idx] = [i, label, prob]
    eng_result.to_csv('not_best_eng_result.csv', encoding='cp949')
    print(round((idx+1) / file_number,3))

0.008
0.016
0.023
0.031
0.039
0.047
0.054
0.062
0.07
0.078
0.085
0.093
0.101
0.109
0.116
0.124
0.132
0.14
0.147
0.155
0.163
0.171
0.178
0.186
0.194
0.202
0.209
0.217
0.225
0.233
0.24
0.248
0.256
0.264
0.271
0.279
0.287
0.295
0.302
0.31
0.318
0.326
0.333
0.341
0.349
0.357
0.364
0.372
0.38
0.388
0.395
0.403
0.411
0.419
0.426
0.434
0.442
0.45
0.457
0.465
0.473
0.481
0.488
0.496
0.504
0.512
0.519
0.527
0.535
0.543
0.55
0.558
0.566
0.574
0.581
0.589
0.597
0.605
0.612
0.62
0.628
0.636
0.643
0.651
0.659
0.667
0.674
0.682
0.69
0.698
0.705
0.713
0.721
0.729
0.736
0.744
0.752
0.76
0.767
0.775
0.783
0.791
0.798
0.806
0.814
0.822
0.829
0.837
0.845
0.853
0.86
0.868
0.876
0.884
0.891
0.899
0.907
0.915
0.922
0.93
0.938
0.946
0.953
0.961
0.969
0.977
0.984
0.992
1.0


In [50]:
path = os.path.join(os.getcwd(), '데이터')
eng_result = pd.DataFrame(columns=['jpg','label', 'prob'])
file_list = os.listdir(path)
file_number = len(file_list)

for idx, i in enumerate(file_list):
    label, prob = await predict_text_from_image(os.path.join(path, i), photoofenglish)
    eng_result.loc[idx] = [i, label, prob]
    eng_result.to_csv('photoofenglish_eng_result.csv', encoding='cp949')
    print(round((idx+1) / file_number,3))

0.008
0.016
0.023
0.031
0.039
0.047
0.054
0.062
0.07
0.078
0.085
0.093
0.101
0.109
0.116
0.124
0.132
0.14
0.147
0.155
0.163
0.171
0.178
0.186
0.194
0.202
0.209
0.217
0.225
0.233
0.24
0.248
0.256
0.264
0.271
0.279
0.287
0.295
0.302
0.31
0.318
0.326
0.333
0.341
0.349
0.357
0.364
0.372
0.38
0.388
0.395
0.403
0.411
0.419
0.426
0.434
0.442
0.45
0.457
0.465
0.473
0.481
0.488
0.496
0.504
0.512
0.519
0.527
0.535
0.543
0.55
0.558
0.566
0.574
0.581
0.589
0.597
0.605
0.612
0.62
0.628
0.636
0.643
0.651
0.659
0.667
0.674
0.682
0.69
0.698
0.705
0.713
0.721
0.729
0.736
0.744
0.752
0.76
0.767
0.775
0.783
0.791
0.798
0.806
0.814
0.822
0.829
0.837
0.845
0.853
0.86
0.868
0.876
0.884
0.891
0.899
0.907
0.915
0.922
0.93
0.938
0.946
0.953
0.961
0.969
0.977
0.984
0.992
1.0


In [51]:
import torch
from transformers import AutoModel, AutoProcessor

# Environment versions
# 1. python : 3.11.5
# 2. pytorch : 2.2.0(cpu)
# 3. transformers : 4.32.1
# 4. Pillow : 10.0.1

# Load the Korean CLIP processor and model; both come from the same checkpoint id.
KOCLIP_CHECKPOINT = "Bingsu/clip-vit-large-patch14-ko"
koprocessor = AutoProcessor.from_pretrained(KOCLIP_CHECKPOINT)
koclip_model = AutoModel.from_pretrained(KOCLIP_CHECKPOINT)

# Given an image path and a list of candidate text labels, predict the label
# that best matches the image (Korean CLIP checkpoint).
async def ko_predict_text_from_image(path, text_labels):
    """Return the candidate label that the Korean CLIP model scores highest.

    The function is declared ``async`` (callers ``await`` it) but performs no
    awaits itself; the whole body runs synchronously.

    Args:
        path: Path of the image file to classify.
        text_labels: Sequence of candidate label strings.

    Returns:
        A tuple ``(predicted_label, probability)`` where ``predicted_label`` is
        the entry of ``text_labels`` with the highest softmax score and
        ``probability`` is that score (a NumPy scalar).

    Raises:
        ValueError: If ``text_labels`` is empty.
    """
    if len(text_labels) == 0:
        raise ValueError("text_labels must contain at least one label")

    # Open the image in a context manager so the file handle is released;
    # the processor reads the pixel data while the file is still open.
    with Image.open(path) as image:
        inputs = koprocessor(text=text_labels, images=image, return_tensors="pt", padding=True)

    # Score the image against every label; no gradients are needed for inference.
    with torch.inference_mode():
        outputs = koclip_model(**inputs)

    # logits_per_image has one row per image; softmax over the label axis.
    probs = outputs.logits_per_image.softmax(dim=1).detach().cpu().numpy()

    # Exactly one image was passed, so work on its row explicitly instead of
    # relying on argmax over the flattened 2-D array.
    image_probs = probs[0]
    max_index = int(image_probs.argmax())

    return text_labels[max_index], image_probs[max_index]

Downloading (…)rocessor_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading tokenizer_config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/870k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/4.88k [00:00<?, ?B/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


Downloading model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

In [52]:
path = os.path.join(os.getcwd(), '데이터')
kor_result = pd.DataFrame(columns=['jpg','label', 'prob'])
file_list = os.listdir(path)
file_number = len(file_list)

for idx, i in enumerate(file_list):
    label, prob = await ko_predict_text_from_image(os.path.join(path, i), korean)
    eng_result.loc[idx] = [i, label, prob]
    eng_result.to_csv('koclip_result.csv', encoding='cp949')
    print(round((idx+1) / file_number,3))

0.008
0.016
0.023
0.031
0.039
0.047
0.054
0.062
0.07
0.078
0.085
0.093
0.101
0.109
0.116
0.124
0.132
0.14
0.147
0.155
0.163
0.171
0.178
0.186
0.194
0.202
0.209
0.217
0.225
0.233
0.24
0.248
0.256
0.264
0.271
0.279
0.287
0.295
0.302
0.31
0.318
0.326
0.333
0.341
0.349
0.357
0.364
0.372
0.38
0.388
0.395
0.403
0.411
0.419
0.426
0.434
0.442
0.45
0.457
0.465
0.473
0.481
0.488
0.496
0.504
0.512
0.519
0.527
0.535
0.543
0.55
0.558
0.566
0.574
0.581
0.589
0.597
0.605
0.612
0.62
0.628
0.636
0.643
0.651
0.659
0.667
0.674
0.682
0.69
0.698
0.705
0.713
0.721
0.729
0.736
0.744
0.752
0.76
0.767
0.775
0.783
0.791
0.798
0.806
0.814
0.822
0.829
0.837
0.845
0.853
0.86
0.868
0.876
0.884
0.891
0.899
0.907
0.915
0.922
0.93
0.938
0.946
0.953
0.961
0.969
0.977
0.984
0.992
1.0


In [53]:
label, prob = await predict_text_from_image('데이터/razor.jpg', ['shaver', 'razor'])
label

'razor'

In [54]:
# Display the probability assigned by the call in the previous cell.
prob

0.8695202