In [1]:
pip install numpy==1.24.4 Pillow==10.3.0 Requests==2.31.0 torch torchvision git+https://github.com/huggingface/transformers.git accelerate qwen-vl-utils av gradio spaces

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-nmz8w3m6
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-nmz8w3m6
  Resolved https://github.com/huggingface/transformers.git to commit 4d5b45870411053c9c72d24a8e1052e00fe62ad6
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting Pillow==10.3.0
  Downloading pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Collecting Requests==2.31.0
  Downloading requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.8-py3-none-any.whl.metadata (3.6 kB)
Collecting

In [None]:
pip install gradio spaces

Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting spaces
  Downloading spaces-0.30.2-py3-none-any.whl.metadata (1.0 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-

In [None]:
import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
import subprocess
import numpy as np
import os
from threading import Thread
import uuid
import io

# Model and Processor Loading (Done once at startup)
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to("cuda").eval()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

DESCRIPTION = "[Qwen2-VL-2B Demo](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)"

image_extensions = Image.registered_extensions()
video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")


def identify_and_save_blob(blob_path):
    """Identifies if the blob is an image or video and saves it accordingly."""
    try:
        with open(blob_path, 'rb') as file:
            blob_content = file.read()

            # Try to identify if it's an image
            try:
                Image.open(io.BytesIO(blob_content)).verify()  # Check if it's a valid image
                extension = ".png"  # Default to PNG for saving
                media_type = "image"
            except (IOError, SyntaxError):
                # If it's not a valid image, assume it's a video
                extension = ".mp4"  # Default to MP4 for saving
                media_type = "video"

            # Create a unique filename
            filename = f"temp_{uuid.uuid4()}_media{extension}"
            with open(filename, "wb") as f:
                f.write(blob_content)

            return filename, media_type

    except FileNotFoundError:
        raise ValueError(f"The file {blob_path} was not found.")
    except Exception as e:
        raise ValueError(f"An error occurred while processing the file: {e}")


@spaces.GPU
def qwen_inference(media_input, text_input=None):
    if isinstance(media_input, str):  # If it's a filepath
        media_path = media_input
        if media_path.endswith(tuple([i for i, f in image_extensions.items()])):
            media_type = "image"
        elif media_path.endswith(video_extensions):
            media_type = "video"
        else:
            try:
                media_path, media_type = identify_and_save_blob(media_input)
                print(media_path, media_type)
            except Exception as e:
                print(e)
                raise ValueError(
                    "Unsupported media type. Please upload an image or video."
                )


    print(media_path)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": media_type,
                    media_type: media_path,
                    **({"fps": 8.0} if media_type == "video" else {}),
                },
                {"type": "text", "text": text_input},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    streamer = TextIteratorStreamer(
        processor, skip_prompt=True, **{"skip_special_tokens": True}
    )
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

css = """
  #output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
  }
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Tab(label="Image/Video Input"):
        with gr.Row():
            with gr.Column():
                input_media = gr.File(
                    label="Upload Image or Video", type="filepath"
                )
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        submit_btn.click(
            qwen_inference, [input_media, text_input], [output_text]
        )

demo.launch(debug=True)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://1d74d07e9fc3dfee82.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


/tmp/gradio/66f8882b50e2690bc230b5dcafc60c0167abebed47ef115c608533f20988be4e/Screenshot 2024-09-23 001725.png
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://1d74d07e9fc3dfee82.gradio.live




In [None]:
#!pip install pyxlsb

import pandas as pd


# Load the XLSB file (replace 'your_dataset.xlsb' with your file path)
xlsb_file = pd.ExcelFile('/content/dataset.xlsb')

# Read the data into a DataFrame (replace 'Sheet1' with your actual sheet name)
df = pd.read_excel(xlsb_file, sheet_name='Sheet1')

# Save as CSV
df.to_csv('dataset.csv', index=False)

In [None]:
import pandas as pd

# Load your current CSV
df = pd.read_csv("dataset.csv")

# Assuming images are named as 1.jpg, 2.jpg, ..., and so on
base_path = '/content/images/image'  # Change this to the path where your images are stored

# Create new image paths
df['images'] = [f"{base_path}{index + 1}.png" for index in range(len(df))]

# Save the updated CSV
df.to_csv("dataset.csv", index=False)

print("CSV updated with image paths!")


CSV updated with image paths!


Fine tune using Llama Factory

In [None]:
!git clone https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory
!pip install -r requirements.txt
!pip install bitsandbytes
!pip install git+https://github.com/huggingface/transformers.git
!pip install -e ".[torch, metrics]"
!pip install liger-kernel


In [None]:
!pip uninstall -y tensorflow
!pip install tensorflow-cpu


Found existing installation: tensorflow 2.17.0
Uninstalling tensorflow-2.17.0:
  Successfully uninstalled tensorflow-2.17.0
Collecting tensorflow-cpu
  Downloading tensorflow_cpu-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading tensorflow_cpu-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (221.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.2/221.2 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow-cpu
Successfully installed tensorflow-cpu-2.17.0


In [None]:
import json

# Your dataset
data =      [
    {
        "image_path": "/content/images/image1.png",
        "text": "मुझे इस city बहुत पसंद है"
    },
    {
        "image_path": "/content/images/image2.png",
        "text": "The weather यहाँ बहुत अच्छा है"
    },
    {
        "image_path": "/content/images/image3.png",
        "text": "यह dish सच में amazing है"
    },
    {
        "image_path": "/content/images/image4.png",
        "text": "क्या आपने यह movie देखी"
    },
    {
        "image_path": "/content/images/image5.png",
        "text": "इस place का view breathtaking है"
    },
    {
        "image_path": "/content/images/image6.png",
        "text": "The event कल है, don't forget"
    },
    {
        "image_path": "/content/images/image7.png",
        "text": "इस road पे बहुत traffic है"
    },
    {
        "image_path": "/content/images/image8.png",
        "text": "I love this मिठाई, it's delicious"
    },
    {
        "image_path": "/content/images/image9.png",
        "text": "The food यहाँ काफी tasty था"
    }
]

# Save to a JSON file
with open('train_dataset.json', 'w') as f:
    json.dump(data, f)

In [None]:
cd ..

/content/LLaMA-Factory


In [None]:
import json

config = {
    "model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
    "do_train": True,
    "dataset": "/content/LLaMA-Factory/train_dataset.json",  # Update this path to your JSON dataset
    "template": "qwen2_vl",
    "finetuning_type": "lora",
    "lora_target": "all",
    "output_dir": "/content/qwen2vl_lora",  # Ensure this directory is writable
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "learning_rate": 5e-5,
    "num_train_epochs": 3
}

with open('/content/LLaMA-Factory/train_qwen2vl.json', 'w') as f:
    json.dump(config, f)


In [None]:
!llamafactory-cli train /content/LLaMA-Factory/train_qwen2vl.json


Traceback (most recent call last):
  File "/usr/local/bin/llamafactory-cli", line 8, in <module>
    sys.exit(main())
  File "/content/LLaMA-Factory/src/llamafactory/cli.py", line 111, in main
    run_exp()
  File "/content/LLaMA-Factory/src/llamafactory/train/tuner.py", line 45, in run_exp
    model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
  File "/content/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 162, in get_train_args
    model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
  File "/content/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 148, in _parse_train_args
    return _parse_args(parser, args)
  File "/content/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 64, in _parse_args
    return parser.parse_json_file(os.path.abspath(sys.argv[1]))
  File "/usr/local/lib/python3.10/dist-packages/transformers/hf_argparser.py", line 401, in parse_json_file
    outputs =

In [None]:
from PIL import Image

# Load your image
image_path = '/content/images/image3.png'  # Update this with your actual image path
image = Image.open(image_path)


In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq

# Load your fine-tuned model and processor
processor = AutoProcessor.from_pretrained("qwen2vl_lora")  # Adjust path if needed
model = AutoModelForVision2Seq.from_pretrained("qwen2vl_lora")  # Adjust path if needed

# Move model to GPU
model = model.to(device)

# Process the image
inputs = processor(images=image, return_tensors="pt").to(device)

# Run inference
outputs = model.generate(**inputs)
decoded_text = processor.decode(outputs[0], skip_special_tokens=True)

print("Extracted Text:", decoded_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: qwen2vl_lora is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`