<a href="https://colab.research.google.com/github/MoawwazTahir/Wardrobe_proj/blob/main/cap_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install flask transformers torch pillow pyngrok autoawq qwen_vl_utils
!pip install --upgrade pyngrok



In [None]:
import os
import torch
import gc
from PIL import Image
from flask import Flask, request, jsonify
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info  # Assuming process_vision_info is available
import requests
from io import BytesIO
from pyngrok import ngrok

# Set up the Ngrok authentication token.
# The token is read from the NGROK_AUTHTOKEN environment variable rather than
# being hard-coded: a token committed in a notebook is readable by anyone with
# access to the repository and should be revoked and replaced.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN", "")
if ngrok_auth_token:
    ngrok.set_auth_token(ngrok_auth_token)  # Set Ngrok auth token
else:
    # Keep going: the tunnel setup further down reports its own error.
    print("NGROK_AUTHTOKEN is not set; the Ngrok tunnel may fail to start.")

# Initialize Flask app
app = Flask(__name__)

# Load the model and processor
def load_model_and_processor(
    min_pixels=256 * 28 * 28,
    max_pixels=1280 * 28 * 28,
    model_id="Qwen/Qwen2-VL-7B-Instruct-AWQ",
):
    """Load the Qwen2-VL model and its processor.

    Args:
        min_pixels: Passed to ``AutoProcessor.from_pretrained`` as the lower
            bound used for image resizing.
        max_pixels: Passed to ``AutoProcessor.from_pretrained`` as the upper
            bound used for image resizing.
        model_id: Checkpoint name or path used for both the model and the
            processor, so the two can never point at different checkpoints.

    Returns:
        A ``(model, processor, device)`` tuple, or ``(None, None, None)`` if
        either the model or the processor fails to load (the error is
        printed, not raised).
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None, None

    try:
        processor = AutoProcessor.from_pretrained(
            model_id, min_pixels=min_pixels, max_pixels=max_pixels
        )
        print(f"Processor loaded successfully with image resizing ({min_pixels} - {max_pixels} pixels).")
    except Exception as e:
        print(f"Error loading processor: {e}")
        return None, None, None

    if getattr(model, "hf_device_map", None):
        # device_map="auto" has already placed the weights; moving a
        # dispatched model again with .to() can conflict with that placement,
        # so report where the model actually lives instead.
        device = model.device
    else:
        model.to(device)  # Move the model to the specified device
    model.eval()
    return model, processor, device

# Load model and processor once, at module level, so the request handlers
# below share them. All three names are None if loading failed, because
# load_model_and_processor() returns (None, None, None) on error.
model, processor, device = load_model_and_processor()

# Static prompt sent with every image. It asks the model for a single-line,
# all-lowercase answer of the form
#   mastercategory: ..., outfitpiece: ..., pattern: ..., color: ...,
#   material: ..., season: ..., weather: ..., dress code: ...
# or the literal 'NOT ALLOWED' when the item is not a valid outfit piece or
# accessory. The text is runtime data: editing it changes model output.
static_prompt = (
    "Identify the most prominent outfit piece, pattern, color, material, seasonality, weather suitability, dress code, "
    "and master category. There are only four master categories: tops, bottoms, footwear, and accessories. You cannot "
    "change or add any more categories. STRICTLY CHOOSE THE MOST PROMINENT ITEM IN THE GIVEN PICTURE WITHOUT EXCEPTIONS. "
    "Outfit classification must be rigorous, with no errors in categorization. Ensure that outfit pieces fit into standard "
    "clothing categories (e.g., shirts, pants) or culturally significant attire (e.g., shalwar kameez, kimono, kurta). UNDER "
    "NO CIRCUMSTANCES SHOULD ITEMS BE MISCLASSIFIED. Dress code classification must be STRICT and aligned with the clothing's "
    "style: Casual: Reserved for everyday wear like jeans, t-shirts, or sneakers. Formal: Only classify dress shirts, "
    "blazers, dress shoes, or other distinctly formal attire. Semi-formal and business casual should not be misclassified. "
    "Accessories (e.g., watches, hats, belts) must be classified under 'accessories' only. Accessories must NEVER be "
    "classified as tops, bottoms, or footwear. Bags, belts, and other accessories should NOT be classified as outfit pieces. "
    "If an item does not match a valid outfit item or accessory, print 'NOT ALLOWED.' Each item must be classified under a "
    "master category. Master category MUST be one of the following, with NO misclassification: tops: (e.g., mastercategory: "
    "tops, outfitpiece: tshirt, pattern: striped, color: black, material: cotton, season: spring, weather: warm, dress "
    "code: casual), bottoms: (e.g., mastercategory: bottoms, outfitpiece: jeans, pattern: solid, color: blue, material: "
    "denim, season: winter, weather: cold, dress code: casual), footwear: (e.g., mastercategory: footwear, outfitpiece: "
    "sneakers, pattern: patterned, color: white, material: mesh, season: all, weather: warm, dress code: casual), "
    "accessories: (e.g., mastercategory: accessories, outfitpiece: watch, pattern: none, color: silver, material: metal, "
    "season: all, weather: all, dress code: formal). Accessories like watches, belts, or hats MUST ONLY be classified as "
    "'accessories.' Never classify these items as tops, bottoms, or footwear. Failure to do so will result in a deduction "
    "of points. Output must be in all lowercase. ENSURE THAT ALL ATTRIBUTES ARE INCLUDED IN THIS EXACT FORMAT: mastercategory: "
    "value, outfitpiece: value, pattern: value, color: value, material: value, season: value, weather: value, dress code: "
    "value. STRICTLY follow this order for all attributes, ensuring no omissions or deviations. Hyphens MUST be removed from "
    "the outfit piece (e.g., write 'tshirt' instead of 't-shirt'). NO line breaks, additional labels, or extra details "
    "should be included. Every attribute must be present, including dress code and mastercategory. If any of the criteria "
    "is not met, PRINT 'NOT ALLOWED.' Example output: mastercategory: tops, outfitpiece: tshirt, pattern: striped, color: "
    "black, material: cotton, season: spring, weather: warm, dress code: casual STRICTLY ADHERE TO THESE INSTRUCTIONS. DO "
    "NOT MISCLASSIFY ACCESSORIES OR YOU WILL LOSE POINTS. MUST BE EXTREMELY CAREFUL WITH DRESS CODE CLASSIFICATION—CASUAL OR "
    "FORMAL MUST BE STRICTLY APPLIED. MAKE SURE THE OUTPUT IS ALL LOWERCASE. NEVER misclassify items. IF THE ITEM DOES NOT "
    "MATCH A VALID OUTFIT ITEM OR AN ACCESSORY, PRINT 'NOT ALLOWED'"
)

def get_generate_method(model):
    """Look up and return the ``generate`` attribute of *model*.

    Raises:
        AttributeError: If *model* has no ``generate`` attribute.
    """
    generate_fn = getattr(model, "generate")
    return generate_fn

def process_image_and_generate_caption(image_file, *, target_size=(1024, 1024), max_new_tokens=128):
    """Generate a caption for an uploaded image using the module-level model.

    Args:
        image_file: Uploaded file object; its ``.stream`` is opened with PIL
            and its ``.filename`` is only used for logging.
        target_size: ``(width, height)`` the image is resized to before it is
            handed to the processor. The resize does not preserve the aspect
            ratio.
        max_new_tokens: Upper bound on the number of tokens generated.

    Returns:
        The decoded caption string, or ``None`` if any step fails (the error
        is printed, not raised).
    """
    try:
        # Open image file
        img = Image.open(image_file.stream).convert("RGB")
        print(f"Image loaded from file: {image_file.filename}")

        # Resize image
        img = img.resize(target_size)
        print(f"Image size after resizing: {img.size}")

        # Single-turn chat message: the image followed by the static prompt.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": static_prompt}
            ]
        }]

        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Extract the image/video entries from the messages for the processor.
        image_inputs, video_inputs = process_vision_info(messages)

        # Prepare the input for the model using the processor
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(device)

        # Ensure inputs are valid
        required_keys = ('input_ids', 'attention_mask', 'pixel_values')
        if any(key not in inputs for key in required_keys):
            raise ValueError("Invalid inputs returned by the processor.")

        # Log tensor shapes only: printing the tensors themselves dumps
        # their contents into the log on every request.
        input_shapes = {name: tuple(value.shape) for name, value in inputs.items()}
        print(f"Input shapes from processor: {input_shapes}")

        # Generate caption
        generate = get_generate_method(model)
        with torch.no_grad():
            generated_ids = generate(**inputs, max_new_tokens=max_new_tokens)

        # Drop the prompt tokens so only the newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )

        # Return the caption
        return output_text[0]

    except Exception as e:
        print(f"Error processing image: {e}")
        return None

@app.route('/generate_caption', methods=['POST'])
def generate_caption_endpoint():
    """Handle ``POST /generate_caption``.

    Expects a multipart upload with the file under the ``image`` field.

    Returns:
        ``{'caption': ...}`` on success, a 400 JSON error when no image was
        supplied, or a 500 JSON error when caption generation fails.
    """
    image_file = request.files.get('image')

    # A form submitted without choosing a file can still carry an 'image'
    # part with a blank filename; treat that as a missing upload (400)
    # instead of letting it fail in the model path as a 500.
    if image_file is None or not image_file.filename:
        return jsonify({'error': 'No image file provided'}), 400

    caption = process_image_and_generate_caption(image_file)
    if not caption:
        return jsonify({'error': 'Failed to generate caption'}), 500

    return jsonify({'caption': caption})

# Port shared by the Ngrok tunnel and the Flask server. Passing it explicitly
# to both keeps them in sync instead of relying on Flask's default port
# matching the literal given to ngrok.connect().
PORT = 5000

# Set up Ngrok. This is best-effort: if the tunnel cannot be opened the error
# is printed and the Flask app still starts locally.
try:
    public_url = ngrok.connect(PORT)
    print(f" * Ngrok tunnel: {public_url}")
except Exception as e:
    print(f"Error setting up Ngrok: {e}")

if __name__ == '__main__':
    app.run(port=PORT)


Using device: cuda:0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/92.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

Model loaded successfully.


preprocessor_config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Processor loaded successfully with image resizing (200704 - 1003520 pixels).
 * Ngrok tunnel: NgrokTunnel: "https://0203-34-125-22-239.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:Press CTRL+C to quit


Image loaded from file: download (18).jpeg
Image size after resizing: (1024, 1024)
Inputs from processor: {'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[-6.3899e-01, -6.5359e-01, -6.8278e-01,  ..., -8.8298e-01,
         -8.9720e-01, -9.1142e-01],
        [-1.5149e+00, -1.5003e+00, -1.4419e+00,  ..., -1.1958e+00,
         -1.2243e+00, -1.2100e+00],
        [-1.1791e+00, -1.1645e+00, -1.1645e+00,  ..., -1.1532e+00,
         -1.1389e+00, -1.1105e+00],
        ...,
        [-4.0541e-01, -4.0541e-01, -4.0541e-01,  ..., -2.7151e-01,
         -2.7151e-01, -2.7151e-01],
        [-3.1782e-01, -3.7622e-01, -4.4921e-01,  ..., -1.5553e-02,
         -1.3329e-03,  1.2887e-02],
        [-4.6381e-01, -4.6381e-01, -4.7840e-01,  ..., -1.2931e-01,
         -1.2931e-01, -1.2931e-01]], device='cuda:0'), 'image_grid_thw': tensor([[ 1, 70, 70]], device='cuda

INFO:werkzeug:127.0.0.1 - - [21/Nov/2024 11:09:28] "POST /generate_caption HTTP/1.1" 200 -


Image loaded from file: download (4).jpeg
Image size after resizing: (1024, 1024)
Inputs from processor: {'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[-0.2156, -0.2302, -0.2448,  ..., -0.6839, -0.6412, -0.5844],
        [-0.4784, -0.4492, -0.4200,  ..., -0.5844, -0.5986, -0.5986],
        [-0.4930, -0.4930, -0.4930,  ..., -0.5701, -0.5275, -0.4848],
        ...,
        [ 1.2004,  1.2442,  1.2734,  ..., -0.2431, -0.2573, -0.2715],
        [ 0.5581,  0.5289,  0.4997,  ..., -0.7834, -0.7977, -0.8119],
        [ 0.1055,  0.1055,  0.1055,  ..., -0.6555, -0.6412, -0.6412]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 70, 70]], device='cuda:0')}


INFO:werkzeug:127.0.0.1 - - [21/Nov/2024 11:10:53] "POST /generate_caption HTTP/1.1" 200 -


Image loaded from file: download (14).jpeg
Image size after resizing: (1024, 1024)
Inputs from processor: {'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[1.8281, 1.8281, 1.8281,  ..., 2.0464, 2.0464, 2.0464],
        [1.8281, 1.8281, 1.8281,  ..., 2.0464, 2.0464, 2.0464],
        [1.8281, 1.8281, 1.8281,  ..., 2.0464, 2.0464, 2.0464],
        ...,
        [1.8281, 1.8281, 1.8281,  ..., 2.0464, 2.0464, 2.0464],
        [1.8281, 1.8281, 1.8281,  ..., 2.0464, 2.0464, 2.0464],
        [1.8281, 1.8281, 1.8281,  ..., 2.0464, 2.0464, 2.0464]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 70, 70]], device='cuda:0')}


INFO:werkzeug:127.0.0.1 - - [21/Nov/2024 11:12:18] "POST /generate_caption HTTP/1.1" 200 -


Image loaded from file: download.jpeg
Image size after resizing: (1024, 1024)
Inputs from processor: {'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
        [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
        [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
        ...,
        [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
        [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459],
        [1.9303, 1.9303, 1.9303,  ..., 2.1459, 2.1459, 2.1459]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 70, 70]], device='cuda:0')}


INFO:werkzeug:127.0.0.1 - - [21/Nov/2024 11:12:51] "POST /generate_caption HTTP/1.1" 200 -


Image loaded from file: download (17).jpeg
Image size after resizing: (1024, 1024)
Inputs from processor: {'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[-1.4273, -1.4273, -1.4273,  ..., -1.2669, -1.2669, -1.2669],
        [-1.4273, -1.4273, -1.4273,  ..., -1.2669, -1.2669, -1.2669],
        [-1.4273, -1.4273, -1.4273,  ..., -1.2527, -1.2527, -1.2527],
        ...,
        [-1.0915, -1.0915, -1.0915,  ..., -1.0110, -1.0110, -1.0110],
        [-1.0623, -1.0623, -1.0623,  ..., -1.0110, -1.0110, -1.0252],
        [-1.0915, -1.0915, -1.0769,  ..., -1.0110, -1.0110, -1.0110]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 70, 70]], device='cuda:0')}


INFO:werkzeug:127.0.0.1 - - [21/Nov/2024 11:14:45] "POST /generate_caption HTTP/1.1" 200 -


Image loaded from file: download (17).jpeg
Image size after resizing: (1024, 1024)
Inputs from processor: {'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[-1.4273, -1.4273, -1.4273,  ..., -1.2669, -1.2669, -1.2669],
        [-1.4273, -1.4273, -1.4273,  ..., -1.2669, -1.2669, -1.2669],
        [-1.4273, -1.4273, -1.4273,  ..., -1.2527, -1.2527, -1.2527],
        ...,
        [-1.0915, -1.0915, -1.0915,  ..., -1.0110, -1.0110, -1.0110],
        [-1.0623, -1.0623, -1.0623,  ..., -1.0110, -1.0110, -1.0252],
        [-1.0915, -1.0915, -1.0769,  ..., -1.0110, -1.0110, -1.0110]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 70, 70]], device='cuda:0')}


INFO:werkzeug:127.0.0.1 - - [21/Nov/2024 11:19:09] "POST /generate_caption HTTP/1.1" 200 -


Image loaded from file: download (8).jpeg
Image size after resizing: (1024, 1024)
Inputs from processor: {'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[1.7114, 1.7114, 1.7114,  ..., 2.0179, 2.0179, 2.0179],
        [1.7114, 1.7114, 1.7114,  ..., 2.0179, 2.0179, 2.0179],
        [1.7114, 1.7114, 1.7114,  ..., 2.0179, 2.0179, 2.0179],
        ...,
        [1.3318, 1.3318, 1.3318,  ..., 1.6909, 1.6909, 1.6909],
        [1.3464, 1.3464, 1.3464,  ..., 1.7051, 1.7051, 1.7051],
        [1.3318, 1.3318, 1.3318,  ..., 1.7051, 1.7051, 1.7051]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 70, 70]], device='cuda:0')}


INFO:werkzeug:127.0.0.1 - - [21/Nov/2024 11:19:37] "POST /generate_caption HTTP/1.1" 200 -
