In [1]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Load BLIP model for image captioning
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)


def generate_caption(image_path):
    """Generate a caption for the given image."""
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        out = model.generate(**inputs)

    caption = processor.batch_decode(out, skip_special_tokens=True)[0]
    return caption


def is_explicit_scene(image_path):
    caption = generate_caption(image_path)
    explicit_keywords = [
        "bedroom",
        "kissing",
        "naked",
        "intimate",
        "erotic",
        "romantic",
        "undressed",
    ]

    for word in explicit_keywords:
        if word in caption.lower():
            return True  # Scene flagged as explicit

    return False

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
caption = generate_caption("images/8.jpg")
print("Caption:", caption)
print(is_explicit_scene("images/8.jpg"))

Caption: a naked girl sitting on the rocks by the ocean
True


In [None]:
caption = generate_caption("images/7.jpg")
print("Caption:", caption)
print(is_explicit_scene("images/7.jpg"))

Caption: a woman is laying down on a bed
False


In [None]:
# Example Usage
print(is_explicit_scene("scene.jpg"))  # True if explicit

FileNotFoundError: [Errno 2] No such file or directory: 'scene.jpg'

In [None]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

model = VisionEncoderDecoderModel.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning"
)
feature_extractor = ViTImageProcessor.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning"
)
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}


def predict_step(image_paths):
    images = []
    for image_path in image_paths:
        i_image = Image.open(image_path)
        if i_image.mode != "RGB":
            i_image = i_image.convert(mode="RGB")

        images.append(i_image)

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id"

In [None]:
predict_step(
    ["images/5.jpg"]
)  # ['a woman in a hospital bed with a woman in a hospital bed']

['a man standing next to a statue of a cow']

In [None]:
from transformers import pipeline

image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

image_to_text("images/7.jpg")

# [{'generated_text': 'a soccer game with a player jumping to catch the ball '}]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id"

[{'generated_text': 'a woman is laying down on a bed '}]

In [4]:
# Load model directly
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import torch
model_id = "microsoft/Florence-2-large"
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to("cuda")  # Move model to CUDA

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the image
image_path = "images/7.jpg"  # Update this with your local image path
image = Image.open(image_path).convert("RGB")

# Process the image
inputs = processor(images=image, return_tensors="pt")


In [None]:
# Generate caption
with torch.no_grad():
    output = model.generate(**inputs)

# Decode and print caption
caption = processor.batch_decode(output, skip_special_tokens=True)[0]
print("Generated Caption:", caption)


Generated Caption: woman


In [None]:
task_prompt = "<MORE_DETAILED_CAPTION>"
inputs = processor(images=image, text=task_prompt, return_tensors="pt")

with torch.no_grad():
    output = model.generate(**inputs)

caption = processor.batch_decode(output, skip_special_tokens=True)[0]
print("Detailed Caption:", caption)


Detailed Caption: The image shows a close-up of a woman lying on her back on a bed.


In [1]:
def generate_detailed_caption(image_path, task_prompt="<MORE_DETAILED_CAPTION>"):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")

    # Tokenize & Encode
    inputs = processor(text=task_prompt, images=image, return_tensors="pt")
    
    # 🔥 Move **ALL** tensors to CUDA
    inputs = {key: value.to("cuda") for key, value in inputs.items()}

    # Generate Caption with Beam Search
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,  # Allow longer captions
            early_stopping=False,
            do_sample=False,  # No randomness
            num_beams=3,  # Beam search for better results
        )

    # Decode Output
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Post-process (Ensure Processing Happens on CUDA)
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=image.size
    )

    return parsed_answer

def generate_detailed_caption_exp(image_path, task_prompt="<MORE_DETAILED_CAPTION>"):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")

    # Tokenize & Encode
    inputs = processor(text=task_prompt, images=image, return_tensors="pt")
    
    # 🔥 Move **ALL** tensors to CUDA
    inputs = {key: value.to("cuda") for key, value in inputs.items()}

    # Generate Caption with Beam Search
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=2048,  # Allow longer captions
            early_stopping=False,
            do_sample=True,  # No randomness
            num_beams=3,  # Beam search for better results
        )

    # Decode Output
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Post-process (Ensure Processing Happens on CUDA)
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=image.size
    )

    return parsed_answer

In [2]:
image_path = "images/7.jpg"  # Replace with your image file path
detailed_caption = generate_detailed_caption(image_path)
exp_detailed_caption = generate_detailed_caption_exp(image_path)
print("standard base caption")
print(detailed_caption)
print("experimental detailed caption")
print(exp_detailed_caption)

NameError: name 'Image' is not defined

In [5]:
# iterate through them all. and describe. 
for i in range(1, 9):
    print(f"images/{i}.jpg")
    print(generate_detailed_caption(f"images/{i}.jpg"))  # False if SFW

images/1.jpg
{'<MORE_DETAILED_CAPTION>': 'The image shows two women in a room with a bed in the background. The woman on the left is naked and has long red hair that is styled in two braids. She is looking at the other woman with a smile on her face. The other woman is standing in front of her and appears to be looking at her with a concerned expression. She has blonde hair that falls over her shoulders and is wearing a green blouse. The room is cluttered with clothes and other items, and there is a window on the right side of the image.'}
images/2.jpg
{'<MORE_DETAILED_CAPTION>': 'The image shows a scene from the TV show Game of Thrones. It shows a young man and a young woman sitting on a bed, facing each other. The man is holding a baby in his arms and the woman is looking at him with a concerned expression. The bed is covered with a black blanket and there is a fireplace in the background with lit candles on it. The room appears to be dimly lit and there are curtains on the windows. 

In [None]:
image_path = "images/10.jpg"  # Replace with your image file path
detailed_caption = generate_detailed_caption(image_path)
print(detailed_caption)

{'<MORE_DETAILED_CAPTION>': "The image is a close-up of a person's hand holding a watch. The background is dark and blurred, but it appears to be an outdoor setting with trees and a cloudy sky. The focus of the image is on the hand and the watch, which is partially visible in the foreground. The watch has a gold-colored face and a black strap. The image is taken from a low angle, making the watch stand out against the dark background."}


In [5]:
image_path = "images/13.jpg"  # Replace with your image file path
detailed_caption = generate_detailed_caption(image_path)
print(detailed_caption)

{'<MORE_DETAILED_CAPTION>': 'The image shows a man and a woman in a kitchen. The man is standing in front of a shelf with stacks of toilet paper and other household items on it. He is wearing a brown jacket and has a beard. The woman is sitting on the countertop next to him, wearing a blue tank top and black shorts. They are both looking at each other and appear to be engaged in a conversation. On the right side of the image, there is a sign that reads "I\'m a sweet guy."'}
