In [3]:
# Log in with Hugging Face
from huggingface_hub import HfApi

# Create an instance of the HfApi class
api = HfApi()

# Check if the token is saved

user_info = api.whoami()
if user_info:
    print(f"Logged in as: {user_info['name']}")
else:
    # Hugging Face Login
    
    # Install the huggingface_hub library if not already installed
    %pip install huggingface_hub
    %pip install ipywidgets
    # Import the login function from the huggingface_hub library
    from huggingface_hub import login
    
    # Log in to Hugging Face account
    login()

Logged in as: NFerreira98


In [None]:
# Install the inference and fine-tuning dependencies into the kernel's environment.
# The installs run top to bottom, so later commands can replace versions
# resolved by earlier ones.
# NOTE(review): vllm and mistral_common are upgraded without a version pin while
# torch is pinned below — confirm the resolved versions are compatible.
%pip install --upgrade vllm
    
%pip install --upgrade mistral_common
# Install Pytorch & other libraries
%pip install "torch==2.4.0" tensorboard pillow
 
# Install Hugging Face libraries
%pip install  --upgrade \
  "transformers==4.45.1" \
  "datasets==3.0.1" \
  "accelerate==0.34.2" \
  "evaluate==0.4.3" \
  "bitsandbytes==0.44.0" \
  "trl==0.11.1" \
  "peft==0.13.0" \
  "qwen-vl-utils"

In [None]:
# Snippet from guide which only does inference (not used in finetuning process)

from vllm import LLM
from vllm.sampling_params import SamplingParams

model_name = "mistralai/Pixtral-12B-2409"
# Upper bound on the number of tokens generated for the reply.
sampling_params = SamplingParams(max_tokens=8192)

# Specify the device type (e.g., "cuda" for GPU, "cpu" for CPU)
llm = LLM(model=model_name, tokenizer_mode="mistral", max_model_len=70000, device="cpu")

prompt = "Describe this image"
image_url = "https://images.news18.com/ibnlive/uploads/2024/07/suryakumar-yadav-catch-1-2024-07-4a496281eb830a6fc7ab41e92a0d295e-3x2.jpg"

# One user turn that carries both the text prompt and the image reference.
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": image_url}}]
    },
]

# Actually run the request: without this call the model is loaded but
# `messages` and `sampling_params` are never used.
outputs = llm.chat(messages, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)

In [8]:
# Data formatting
from datasets import load_dataset

# System prompt placed at the start of every formatted conversation.
system_message = "You are a helpful assistant."


# Convert dataset to OAI messages
def format_data(sample):
    """Turn one dataset row into an OpenAI-style chat transcript.

    Args:
        sample: Mapping that provides a "url" entry (used as the image
            reference in the user turn) and a "text" entry (used as the
            assistant reply).

    Returns:
        A dict with a single "messages" key holding, in order, the system
        turn, the user turn (fixed text prompt plus image URL) and the
        assistant turn.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }

    prompt_part = {"type": "text", "text": "Describe this image"}
    image_part = {"type": "image_url", "image_url": {"url": sample["url"]}}
    user_turn = {"role": "user", "content": [prompt_part, image_part]}

    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["text"]}],
    }

    return {"messages": [system_turn, user_turn, assistant_turn]}

# Fetch the first 5% of the train split from the Hub.
dataset_id = "kakaobrain/coyo-700m"
dataset = load_dataset(dataset_id, split="train[:5%]")

# Replace the loaded rows with their OAI-style message records (a plain list).
dataset = list(map(format_data, dataset))

# Show the first formatted conversation as a sanity check.
print(dataset[0]["messages"])

Resolving data files:   0%|          | 0/128 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

[{'role': 'system', 'content': [{'type': 'text', 'text': 'You are a helpful assistant.'}]}, {'role': 'user', 'content': [{'type': 'text', 'text': 'Describe this image'}, {'type': 'image_url', 'image_url': {'url': 'https://cdn.shopify.com/s/files/1/0286/3900/2698/products/TVN_Huile-olive-infuse-et-s-227x300_e9a90ffd-b6d2-4118-95a1-29a5c7a05a49_800x.jpg?v=1616684087'}}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': 'Olive oil infused with Tuscany herbs'}]}]


In [9]:
import random
from datasets import load_dataset
from huggingface_hub import HfFolder
from transformers import AutoProcessor, TrainingArguments, Trainer, AutoModelForImageTextToText
from PIL import Image, UnidentifiedImageError
import requests
from io import BytesIO

# Reuse the Hugging Face token stored by a previous login.
token = HfFolder.get_token()

# Load the first 5% of the train split. No `streaming=True` is passed, so this
# is a regular (non-streaming) load that uses ./dataset_cache as its cache.
ds = load_dataset(
    "kakaobrain/coyo-700m",
    split="train[:5%]",
    token=token,
    cache_dir="./dataset_cache",
)

# Reservoir sampling: draw a fixed-size random sample in one pass over an iterable
def reservoir_sampling(stream, sample_size):
    """Return up to `sample_size` items chosen at random from `stream`.

    The first `sample_size` items fill the reservoir. Each later item at
    zero-based position `i` draws a random slot in `[0, i]` and overwrites
    the reservoir entry at that slot when the slot falls inside the reservoir.

    Args:
        stream: Any iterable; it is consumed exactly once.
        sample_size: Maximum number of items to keep.

    Returns:
        A list with `min(sample_size, number of items)` elements for a
        non-negative `sample_size`.
    """
    reservoir = []
    for position, element in enumerate(stream):
        # Fill phase: keep everything until the reservoir is full.
        if position < sample_size:
            reservoir.append(element)
            continue
        # Replacement phase: randint is inclusive on both ends.
        slot = random.randint(0, position)
        if slot < sample_size:
            reservoir[slot] = element
    return reservoir

# Calculate the sample size (5% of the loaded rows).
# `ds` is the loaded "train[:5%]" split, so its length is taken from the data
# itself rather than a hard-coded count; the value then stays correct if the
# split expression is changed.
total_size = len(ds)  # Number of rows in the loaded split
sample_size = int(0.05 * total_size)

# Get a random 5% sample using reservoir sampling
ds_sample = reservoir_sampling(ds, sample_size)

# Verify the size of the sampled dataset
print(f"Number of samples in the dataset: {len(ds_sample)}")

# Load the processor from the Unsloth 4-bit Pixtral checkpoint
processor = AutoProcessor.from_pretrained("unsloth/Pixtral-12B-2409-unsloth-bnb-4bit")

# Function to load an image from a URL with enhanced error handling
def load_image_from_url(url, timeout=10):
    """Download `url` and return it as a decoded PIL image, or None on failure.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds passed to `requests.get` so a stalled server cannot
            block the notebook indefinitely.

    Returns:
        A loaded `PIL.Image.Image`, or None when the download fails or the
        payload is not a readable image (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        payload = response.content

        # verify() checks file integrity but leaves the image object unusable
        # afterwards, so it runs on a throwaway handle and the image is opened
        # a second time for the caller.
        Image.open(BytesIO(payload)).verify()

        img = Image.open(BytesIO(payload))
        img.load()  # Decode now so truncated or corrupt data fails here, not later
        return img
    except (
        requests.exceptions.RequestException,
        UnidentifiedImageError,
        OSError,      # truncated or otherwise unreadable image data
        SyntaxError,  # raised by some PIL format plugins for malformed files
        ValueError,
    ) as e:
        print(f"Error loading image from {url}: {e}")
        return None  # Return None if the image could not be loaded

# Define the preprocessing function
def preprocess_function(examples):
    """Download each example's image and run text and images through the processor.

    Examples whose image cannot be loaded are dropped as a whole, so the
    caption at position i always belongs to the image at position i.

    Args:
        examples: Iterable of mappings with a "text" entry (caption) and a
            "url" entry (image location).

    Returns:
        The output of `processor(...)` for the examples whose image loaded.

    Raises:
        ValueError: If no image could be loaded at all.
    """
    texts = []
    images = []
    for ex in examples:
        img = load_image_from_url(ex["url"])
        if img is None:
            # Skip the caption together with its failed image to keep the
            # two lists aligned.
            continue
        texts.append(ex["text"])
        images.append(img)

    if not images:
        raise ValueError("No images could be loaded; nothing to preprocess.")

    model_inputs = processor(text=texts, images=images, return_tensors="pt", padding=True, truncation=True)
    return model_inputs

# Apply the preprocessing function to the sampled dataset
# NOTE(review): the whole sample is processed in a single call and the result
# is wrapped in a one-element list, so len(tokenized_datasets) == 1 — confirm
# this is the dataset layout the Trainer below is meant to receive.
tokenized_datasets = [preprocess_function(ds_sample)]

# Load the model
# Uses the same checkpoint id that the processor was loaded from.
model = AutoModelForImageTextToText.from_pretrained("unsloth/Pixtral-12B-2409-unsloth-bnb-4bit")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Where Trainer writes its outputs
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated.
    eval_strategy="epoch",           # Evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the Trainer
# NOTE(review): eval_dataset is the same object as train_dataset, so the
# evaluation runs on the training data — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
# The model and the processor are both written to ./fine-tuned-pixtral.
model.save_pretrained("./fine-tuned-pixtral")
processor.save_pretrained("./fine-tuned-pixtral")

Resolving data files:   0%|          | 0/128 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

KeyboardInterrupt: 