In [1]:
!pip install git+https://github.com/huggingface/transformers
!pip install qwen-vl-utils

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-anva7ntv
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-anva7ntv
  Resolved https://github.com/huggingface/transformers to commit 343c8cb86f2ab6a51e7363ee11f69afb1c9e839e
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.46.0.dev0-py3-none-any.whl size=10025236 sha256=c3c5cda45977216d6ac836631ddeed973a5e656b22f75b4ffb6c2cb2db17bb24
  Stored in directory: /tmp/pip-ephem-wheel-cache-2ti23ept/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformers
Inst

In [None]:
# Cell 1: Imports
import os
import torch
import gc
import time
from PIL import Image
import matplotlib.pyplot as plt
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import csv
import random

# Cell 2: Load Model and Processor
def load_model_and_processor(min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28,
                             model_name="Qwen/Qwen2-VL-2B-Instruct"):
    """Load the Qwen2-VL model and its processor and place the model on one device.

    Args:
        min_pixels: Lower image-resizing bound forwarded to the processor.
        max_pixels: Upper image-resizing bound forwarded to the processor.
        model_name: Checkpoint identifier passed to ``from_pretrained`` for both
            the model and the processor.

    Returns:
        A ``(model, processor, device)`` tuple. If either the model or the
        processor fails to load, the error is printed and
        ``(None, None, None)`` is returned instead.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    try:
        # Deliberately no device_map="auto": the model is placed explicitly with
        # .to(device) below. When several GPUs are visible, "auto" shards the model
        # and attaches dispatch hooks, which conflicts with a later .to() call and
        # with the DataParallel wrapper.
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16 if device.type == "cuda" else torch.float32,
        )
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None, None
    try:
        processor = AutoProcessor.from_pretrained(
            model_name, min_pixels=min_pixels, max_pixels=max_pixels
        )
        print(f"Processor loaded successfully with image resizing ({min_pixels} - {max_pixels} pixels).")
    except Exception as e:
        print(f"Error loading processor: {e}")
        return None, None, None
    model.to(device)
    if torch.cuda.device_count() > 1:
        print(f"Let's use {torch.cuda.device_count()} GPUs!")
        # Callers must unwrap the module to reach generate(); DataParallel itself
        # only parallelizes forward().
        model = torch.nn.DataParallel(model)
    # Inference only: switch off dropout and similar training-time behaviour.
    model.eval()
    return model, processor, device

# Load the model, processor, and device once at module level.
# main() reads these globals and exits early when loading failed
# (load_model_and_processor returns None values in that case).
model, processor, device = load_model_and_processor()

# Cell 3: Define Helper Functions
def get_generate_method(model):
    """Return the bound ``generate`` method of ``model``.

    A ``torch.nn.DataParallel`` wrapper does not expose ``generate`` itself,
    so for wrapped models the method is taken from the underlying ``module``.
    """
    target = model.module if isinstance(model, torch.nn.DataParallel) else model
    return target.generate

def process_image(image_path, prompt, processor, model, device,
                  image_size=(1024, 1024), max_new_tokens=128):
    """Generate a text response for one image and a prompt.

    Args:
        image_path: Path of the image file to open.
        prompt: Text placed after the image in the user message.
        processor: Processor used for the chat template, tensor encoding and decoding.
        model: Model (optionally wrapped in ``torch.nn.DataParallel``) used for generation.
        device: Device the encoded inputs are moved to.
        image_size: ``(width, height)`` the image is resized to before encoding.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        ``(text, image)`` where ``text`` is the decoded response and ``image`` is
        the resized RGB image, or ``(None, None)`` if any step raised; the error
        is printed rather than propagated.
    """
    # Pre-bind so the finally block can drop references on every path.
    inputs = generated_ids = generated_ids_trimmed = None
    try:
        # The context manager closes the file handle even when decoding fails.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB").resize(image_size)
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ]
        }]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(device)
        generate = get_generate_method(model)
        with torch.no_grad():
            generated_ids = generate(**inputs, max_new_tokens=max_new_tokens)
        # Strip the prompt tokens so only the newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )
        return output_text[0], image
    except Exception as e:
        print(f"Error processing image: {e}")
        return None, None
    finally:
        # Release tensor references on success AND failure (e.g. an out-of-memory
        # error during generation) before asking the allocator to free its cache.
        inputs = generated_ids = generated_ids_trimmed = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Cell 4: Main Execution
def load_valid_ids(csv_file_path):
    """Collect the distinct values of the ``id`` column of a CSV file.

    The file is read as UTF-8 with a header row. Values are kept as the
    strings found in the file, and a missing ``id`` column raises ``KeyError``.

    Returns:
        A ``set`` of id strings (empty when the file has no data rows).
    """
    with open(csv_file_path, mode='r', newline='', encoding='utf-8') as handle:
        return {record['id'] for record in csv.DictReader(handle)}

def _parse_caption(caption, num_fields=9):
    """Turn a "key: value, key: value" caption into exactly ``num_fields`` values."""
    values = []
    for part in caption.split(","):
        # Split on the first colon only, so a value that itself contains ':'
        # is kept whole instead of being truncated.
        _, separator, value = part.partition(":")
        values.append(value.strip() if separator else "")
    # Pad short captions and cut long ones so every CSV row has the same width.
    values += [""] * (num_fields - len(values))
    return values[:num_fields]


def main(prompt=None, max_images=300, seed=None):
    """Caption a sample of dataset images and write the parsed fields to a CSV file.

    Uses the module-level ``model``, ``processor`` and ``device``. Images are
    taken from the dataset directory, restricted to file names whose stem is
    listed in the ID CSV, and randomly sampled when there are more than
    ``max_images``. Each caption is shown with its image, printed, and written
    as one row of the output CSV.

    Args:
        prompt: Prompt sent with every image. When ``None`` it is requested
            interactively via ``input``.
        max_images: Maximum number of images to process.
        seed: Seed for the random sample. ``None`` uses the global ``random``
            state, as before.
    """
    if model is None or processor is None:
        print("Model or processor not loaded. Exiting.")
        return

    if prompt is None:
        prompt = input("Enter prompt: ")
    dataset_dir = "/kaggle/input/fashion-product-images-dataset/fashion-dataset/images/"

    if not os.path.isdir(dataset_dir):
        print(f"Directory '{dataset_dir}' does not exist. Please check the path.")
        return

    # Load valid IDs from the CSV file (guarded like the dataset directory above).
    csv_file_path = "/kaggle/input/fashion-ds-filtered-datatypes/filtered_article_types.csv"
    if not os.path.isfile(csv_file_path):
        print(f"File '{csv_file_path}' does not exist. Please check the path.")
        return
    valid_ids = load_valid_ids(csv_file_path)

    # Get all image files in the directory. Extensions include the dot so names
    # that merely end in "png"/"jpg" are not picked up; sorting makes the order
    # (and therefore a seeded sample) independent of os.listdir ordering.
    image_files = sorted(
        f for f in os.listdir(dataset_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    # Filter images based on valid IDs (assumes image filenames are formatted as <id>.extension)
    selected_images = [f for f in image_files if os.path.splitext(f)[0] in valid_ids]

    # Select up to max_images images
    if len(selected_images) > max_images:
        sampler = random if seed is None else random.Random(seed)
        selected_images = sampler.sample(selected_images, max_images)

    # Open a CSV file to store the results
    output_csv_file_path = "/kaggle/working/image_captions.csv"
    with open(output_csv_file_path, mode='w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Image File Name', 'Master Category', 'Outfit Piece', 'Pattern', 'Color', 'Material', 'Season', 'Weather', 'Dress code', 'Gender'])

        for image_count, image_file in enumerate(selected_images, start=1):
            image_path = os.path.join(dataset_dir, image_file)

            try:
                start_time = time.time()
                caption, resized_image = process_image(image_path, prompt, processor, model, device)

                if caption and resized_image:
                    elapsed_time = time.time() - start_time

                    fig = plt.figure(figsize=(8, 8))
                    plt.imshow(resized_image)
                    plt.axis('off')
                    plt.title(f"Image {image_count}: {caption}\nProcessed in {elapsed_time:.2f} seconds")
                    plt.show()
                    # Close explicitly so figures do not accumulate across the loop.
                    plt.close(fig)

                    print(f"Caption for image {image_count} ({image_file}): {caption}")
                    print(f"Processed in {elapsed_time:.2f} seconds\n")

                    # One file-name column plus nine attribute columns, matching the header.
                    csvwriter.writerow([image_file] + _parse_caption(caption))

                else:
                    print(f"Skipping image {image_file}: Could not process.")

            except Exception as e:
                print(f"Error processing image {image_file}: {e}")

            finally:
                gc.collect()
                torch.cuda.empty_cache()

    print(f"Results saved in {output_csv_file_path}")


Using device: cuda


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

In [None]:
# Run main function
# Start the captioning pipeline only when this code executes as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()