<a href="https://colab.research.google.com/github/MoawwazTahir/Wardrobe_proj/blob/main/qwen2_vl_2b_api_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
#!pip install flask pyngrok transformers torch Pillow requests qwen_vl_utils


In [2]:
import os
import torch
import gc
from PIL import Image
from flask import Flask, request, jsonify
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info  # Assuming process_vision_info is available
import requests
from io import BytesIO
from pyngrok import ngrok

# Set up the Ngrok authentication token
ngrok_auth_token = "2p0tNdsPkUnSJELTDbhUEuYaC0I_6GRKZSEK5Y4xJWj3RApdK"  # Replace with your actual Ngrok token
ngrok.set_auth_token(ngrok_auth_token)  # Set Ngrok auth token

# Initialize Flask app
app = Flask(__name__)

# Load the model and processor
def load_model_and_processor(min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    try:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct",
            torch_dtype=torch.bfloat16 if device.type == "cuda" else torch.float32,
            device_map="auto",  # You can remove this line if it conflicts with single GPU usage
        )
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None, None

    try:
        processor = AutoProcessor.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
        )
        print(f"Processor loaded successfully with image resizing ({min_pixels} - {max_pixels} pixels).")
    except Exception as e:
        print(f"Error loading processor: {e}")
        return None, None, None

    model.to(device)  # Move the model to the specified device
    model.eval()
    return model, processor, device

# Load model and processor
model, processor, device = load_model_and_processor()

Using device: cuda:0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully.
Processor loaded successfully with image resizing (200704 - 1003520 pixels).


In [3]:


def get_generate_method(model):
    return model.generate

def process_image_and_generate_caption(image_url: str):
    try:
        # Fetch image from URL
        if image_url.startswith('http'):
            response = requests.get(image_url)

            # Check if the response is an image
            if 'image' not in response.headers.get('Content-Type', ''):
                raise ValueError(f"URL does not return a valid image: {image_url}")

            img = Image.open(BytesIO(response.content)).convert("RGB")
            print(f"Image fetched from URL: {image_url}")
        else:
            img = Image.open(image_url).convert("RGB")
            print(f"Image loaded from file: {image_url}")

        # Resize image
        img = img.resize((1024, 1024))
        print(f"Image size after resizing: {img.size}")

        # Prepare image data using process_vision_info (to match the original code's intent)
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": "Describe this image."}
            ]
        }]

        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Process vision info (this should match how your original code processed images)
        image_inputs, video_inputs = process_vision_info(messages)

        # Prepare the input for the model using the processor
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(device)

        # Check the structure of the inputs
        print(f"Inputs from processor: {inputs}")

        # Ensure inputs are valid
        if 'input_ids' not in inputs or 'attention_mask' not in inputs or 'pixel_values' not in inputs:
            raise ValueError("Invalid inputs returned by the processor.")

        # Generate caption
        generate = get_generate_method(model)
        with torch.no_grad():
            generated_ids = generate(**inputs, max_new_tokens=128)
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )

        # Return the caption
        return output_text[0], img

    except Exception as e:
        print(f"Error processing image: {e}")
        return None, None

@app.route('/generate_caption', methods=['POST'])
def generate_caption():
    data = request.get_json()

    # Ensure image URL or file is provided
    if "image_url" not in data:
        return jsonify({"error": "No image URL provided"}), 400

    image_url = data["image_url"]
    caption, image = process_image_and_generate_caption(image_url)

    if caption:
        return jsonify({"caption": caption, "status": "success"})
    else:
        return jsonify({"error": "Failed to generate caption", "status": "failed"}), 500



In [None]:
# Run the Flask app with Ngrok
if __name__ == "__main__":
    # Start Ngrok tunnel for Flask app on port 5000
    public_url = ngrok.connect(5000)
    print(f"Flask app is publicly available at {public_url}")

    # Run the Flask app
    app.run(debug=False)


Flask app is publicly available at NgrokTunnel: "https://5be6-35-240-161-166.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


Image fetched from URL: https://upload.wikimedia.org/wikipedia/commons/thumb/9/92/Male_cheetah_facing_left_in_South_Africa.jpg/220px-Male_cheetah_facing_left_in_South_Africa.jpg
Image size after resizing: (1024, 1024)
Inputs from processor: {'input_ids': tensor([[151644,   8948,    198,  ..., 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[-0.6244, -0.6536, -0.6974,  ..., -1.4376, -1.2811, -1.0536],
        [ 0.2953,  0.6749,  0.9230,  ..., -1.1816, -1.2811, -1.3807],
        [-0.3470, -0.3908, -0.4784,  ..., -1.3380, -1.3522, -1.3380],
        ...,
        [ 0.7917,  0.8209,  0.8355,  ...,  0.0555,  0.0413,  0.0698],
        [ 0.9084,  0.8647,  0.8063,  ..., -0.1151, -0.1151, -0.1293],
        [ 0.2077,  0.1639,  0.1493,  ...,  0.1124,  0.1266,  0.1266]],
       device='cuda:0'), 'image_grid_thw': tensor([[ 1, 70, 70]], device='cuda:0')}


INFO:werkzeug:127.0.0.1 - - [18/Nov/2024 09:58:56] "POST /generate_caption HTTP/1.1" 200 -
