In [1]:
from PIL import Image
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
import pandas as pd
import time

# The model and its processor are loaded from the same pretrained checkpoint
# identifier, so name it once instead of repeating the string.
checkpoint = "microsoft/kosmos-2-patch14-224"
processor = AutoProcessor.from_pretrained(checkpoint)
model = Kosmos2ForConditionalGeneration.from_pretrained(checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Load the table that lists the image files to caption.
df_aktbilder = pd.read_csv("../../data/filtered_genres_nude_painting.csv")
default_path = "../../data/zielordner_nude_painting/"

# The prompt is the same for every image, so build it once outside the loop.
# `prompt_text` is the prompt without the "<grounding> " marker: it is what gets
# stored in the result row and what is removed from the generated caption.
prompt = "<grounding> Identify the number of people in the image. Describe their individual body posture and what they are doing. Also, provide a gender assessment for each person."
prompt_text = prompt.replace("<grounding> ", "")

# One [filename, prompt, answer] row per successfully processed image.
prompt_result = []
start = time.time()

# Only the first three files are processed here; widen the slice
# (e.g. [1000:2000]) to run a larger batch.
for filename in list(df_aktbilder.file_name)[:3]:
    try:
        path = default_path + filename
        # Open via a context manager so the file handle is released after the
        # processor has consumed the image, instead of leaking one per file.
        with Image.open(path) as image:
            inputs = processor(text=prompt, images=image, return_tensors="pt")

        generated_ids = model.generate(
            pixel_values=inputs["pixel_values"],
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            image_embeds=None,
            image_embeds_position_mask=inputs["image_embeds_position_mask"],
            use_cache=True,
            max_new_tokens=150,
        )
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        caption, entities = processor.post_process_generation(generated_text)
        # Progress/diagnostic output: number of space-separated words in the caption.
        print(len(caption.split(" ")))

        # The caption echoes the prompt; strip it so only the model's answer is kept.
        result = [filename, prompt_text, caption.replace(prompt_text, "")]
        prompt_result.append(result)
    except Exception as e:
        # Best-effort batch: skip the failing file but report *why* it failed,
        # otherwise missing rows in the result cannot be diagnosed.
        print(f"Fehler bei Datei: {filename} ({type(e).__name__}: {e})")

end = time.time()
elapsed = end - start
print(f"Elapsed: {elapsed} seconds")

99
75
83
Elapsed: 134.8113250732422 seconds


In [4]:
# Show the raw rows, then persist them as a CSV with one row per image.
print(prompt_result)
df = pd.DataFrame(prompt_result, columns=["filename", "prompt", "answer"])
# Write relative to the working directory instead of an absolute path inside one
# user's home directory, so the notebook also runs on other machines.
# NOTE(review): assumes the kernel's working directory is the notebook folder
# (src/kosmos), as the relative "../../data" paths used earlier suggest - confirm.
df.to_csv("prompt_result.csv")

[['3338-lo-scheggia-desco-da-parto-con-due-fanciulli-che-giocano.JPG!Large.JPG', 'Identify the number of people in the image. Describe their individual body posture and what they are doing. Also, provide a gender assessment for each person.', ' In the image, there are two naked boys standing next to each other, engaged in a playful activity. One boy is standing on the left side of the image and the other is on the right side. The boys are positioned in a circle, with the left boy facing the viewer and the right boy facing away. The scene is set in a dark room with a painting on the wall behind them.'], ['3341-lo-scheggia-reclining-youth-wga20987.jpg', 'Identify the number of people in the image. Describe their individual body posture and what they are doing. Also, provide a gender assessment for each person.', ' In the image, there are two people in a nude state. One person is lying down, and the other person is standing. The person lying down is holding a pillow, while the person stan

In [None]:
# Disabled debugging aid: when uncommented, draws each entry of `entities`
# (label, sentence position, list of boxes) onto `image` and then prints
# `caption`. Box coordinates are multiplied by the image width/height before
# drawing. Relies on `image`, `entities` and `caption` left over from the
# captioning loop, i.e. it only visualises the most recently processed file.
#from PIL import ImageDraw
#for label, sentence_pos, coordinates in entities:
#  for a,b,c,d in coordinates:
#      draw = ImageDraw.Draw(image)
#      w,h = image.size
#      draw.text((a*w+5,b*h+5),label + f" {sentence_pos}", fill="black")
#      draw.text((a*w+4,b*h+4),label + f" {sentence_pos}", fill="white")
#      draw.rectangle((a*w,b*h,c*w,d*h), outline ="red", width=3)
#      draw.rectangle((a*w,b*h,c*w,d*h), outline ="white", width=2)
#      draw.rectangle((a*w,b*h,c*w,d*h), outline ="black", width=1)
#image.show()
#print(caption)