In [None]:
import warnings
warnings.filterwarnings("ignore")
import torch
import os
from transformers import AutoTokenizer, AutoProcessor, TrainingArguments, LlavaForConditionalGeneration, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig
import json
from PIL import Image

# Load the base LLaVA checkpoint from a local directory, quantized to 4 bits.
# The location can be overridden through the LLAVA_MODEL_PATH environment variable;
# when it is unset the original hard-coded path is used.
model_path = os.environ.get("LLAVA_MODEL_PATH", "/home/jj/llava-1.5-7b-hf")
# bitsandbytes runs 4-bit matmuls in float32 unless a compute dtype is given; use
# float16 so it matches the torch_dtype below and the fp16 training configuration.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model = LlavaForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
)

# Define a chat template and set the tokenizer and processor.
# The template renders a fixed system sentence, then for every message a "USER: " or
# "ASSISTANT: " prefix followed by its content items (text items verbatim, image items
# as the literal "<image>" placeholder). A user turn is closed with a single space;
# any other turn is closed with the tokenizer's eos_token.
LLAVA_CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""

# Attach the template to the tokenizer, then make the processor use that same tokenizer
# instance, so code that goes through processor.tokenizer sees the template as well.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.chat_template = LLAVA_CHAT_TEMPLATE
processor = AutoProcessor.from_pretrained(model_path)
processor.tokenizer = tokenizer

# Data collator that turns raw dataset records into a padded text + image training batch
class LLavaDataCollator:
    """Collate ``{"image_path", "response"}`` records into model inputs with labels.

    Every record becomes a two-turn conversation: a user turn (the question text
    followed by an image placeholder) and an assistant turn holding the record's
    ``response``. The conversation is rendered with the tokenizer's chat template
    and batched together with the loaded images by the processor.

    Args:
        processor: Callable accepting ``text=``, ``images=``, ``return_tensors=``
            and ``padding=`` and exposing a ``tokenizer`` attribute.
        img_dir: Directory that contains the images. Only the basename of each
            record's ``image_path`` is used; it is joined onto this directory.
        prompt: Question placed in the user turn of every example. Defaults to
            the clothing-style question.
    """

    # Value written into label positions that must not be trained on; PyTorch's
    # cross-entropy loss skips targets equal to -100 by default.
    IGNORE_INDEX = -100

    def __init__(self, processor, img_dir, prompt="What is the clothing style in this image?"):
        self.processor = processor
        self.img_dir = img_dir
        self.prompt = prompt

    def _ignored_token_ids(self):
        """Return the token ids whose label positions are excluded from the loss."""
        tokenizer = self.processor.tokenizer
        ignored = []
        if tokenizer.pad_token_id is not None:
            ignored.append(tokenizer.pad_token_id)

        # The image placeholder is an input slot, not text the model should learn to
        # emit. Skip it if the tokenizer does not know the token (None or the unk id).
        image_token = getattr(self.processor, "image_token", "<image>")
        image_token_id = tokenizer.convert_tokens_to_ids(image_token)
        unk_token_id = getattr(tokenizer, "unk_token_id", None)
        if image_token_id is not None and image_token_id != unk_token_id:
            ignored.append(image_token_id)
        return ignored

    def __call__(self, examples):
        """Build one batch from a list of records.

        Args:
            examples: Iterable of mappings with an ``"image_path"`` and a
                ``"response"`` entry.

        Returns:
            The processor output with an added ``"labels"`` entry: a copy of
            ``input_ids`` in which padding and image-placeholder positions are
            replaced by ``IGNORE_INDEX``.
        """
        tokenizer = self.processor.tokenizer
        texts = []
        images = []
        for example in examples:
            # Prepare the conversation as a template
            messages = [
                {"role": "user", "content": [{"type": "text", "text": self.prompt}, {"type": "image"}]},
                {"role": "assistant", "content": [{"type": "text", "text": example["response"]}]}
            ]
            texts.append(
                tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
            )

            # Load the image; convert() returns a separate, fully loaded copy, so the
            # underlying file can be closed immediately instead of leaking a handle.
            full_image_path = os.path.join(self.img_dir, os.path.basename(example["image_path"]))
            with Image.open(full_image_path) as raw_image:
                images.append(raw_image.convert('RGB'))

        # Process the batch
        inputs = self.processor(text=texts, images=images, return_tensors="pt", padding=True)

        # Prepare the labels for supervised fine-tuning
        labels = inputs["input_ids"].clone()
        for token_id in self._ignored_token_ids():
            labels[labels == token_id] = self.IGNORE_INDEX
        inputs["labels"] = labels

        return inputs

# Directory holding the training images, and the collator that reads from it
img_dir = "./llava/img"
data_collator = LLavaDataCollator(processor=processor, img_dir=img_dir)

# Function to load a JSON Lines (JSONL) dataset
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The parsed value of every non-blank line, in file order. An empty
        file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank or whitespace-only lines (e.g. a trailing empty line) carry no record.
        return [json.loads(line) for line in file if line.strip()]

# Load the training records into a plain Python list (one parsed JSON object per line).
# The data collator reads the "image_path" and "response" keys of every record.
train_dataset = load_jsonl("./llava/clothing_style_dataset.jsonl")

# Set training arguments
training_args = TrainingArguments(
    output_dir="llava-clothing-style-identification",
    learning_rate=1.4e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    logging_steps=5,
    num_train_epochs=100,  # NOTE(review): 100 passes over the dataset — confirm this is intended
    push_to_hub=False,
    gradient_checkpointing=True,
    remove_unused_columns=False,  # presumably keeps the raw record fields available to the custom collator — TODO confirm
    fp16=True,  # float16 mixed precision, the same dtype the model is loaded with
    bf16=False
)

# LoRA configuration for efficient fine-tuning
# NOTE(review): lora_alpha (16) is smaller than the rank r (64); confirm the resulting
# adapter scaling is what was intended.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules="all-linear"  # Applies LoRA to all linear layers
)

# Create the SFTTrainer for supervised fine-tuning.
# Batches are built by the custom data_collator directly from the raw records, and the
# trainer's own dataset preparation is switched off through skip_prepare_dataset.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    peft_config=lora_config,
    dataset_text_field="text",  # Dummy field as required by SFTTrainer
    tokenizer=tokenizer,
    data_collator=data_collator,
    dataset_kwargs={"skip_prepare_dataset": True},
)

# Start the training process
trainer.train()

# Save the fine-tuned model locally.
# NOTE(review): the inference cell loads this directory with PeftModel.from_pretrained,
# i.e. it expects LoRA adapter weights here — confirm that is what save_model writes.
trainer.save_model("./fine_tuned_llava")
print("Fine-tuned model saved to ./fine_tuned_llava")


In [None]:
import torch
from transformers import AutoTokenizer, AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig
from peft import PeftModel
from PIL import Image

# Paths to the base model, fine-tuned model, and image
# NOTE(review): the training cell loads the base checkpoint from
# "/home/jj/llava-1.5-7b-hf", while this cell uses "./llava" (the directory that also
# holds the dataset and images). Confirm the same base checkpoint is available here,
# otherwise the adapter is applied to different weights than it was trained on.
base_model_path = "./llava"
fine_tuned_adapter_path = "./fine_tuned_llava"
image_path = "./llava/img/1.jpg"

# Load the base (untrained) model WITHOUT 4-bit quantization
# NOTE(review): this model and the quantized copy below are both kept in memory at the
# same time — confirm the available GPU memory allows that.
base_model = LlavaForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,  # Using float16 for GPU efficiency
    device_map="auto"
)
base_model.eval()  # Set to evaluation mode

# Load the fine-tuned model WITH 4-bit quantization
# (a second copy of the same base checkpoint; the adapter is attached to it below)
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
fine_tuned_model = LlavaForConditionalGeneration.from_pretrained(
    base_model_path,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,  # Using float16 with 4-bit quantization for memory efficiency
    device_map="auto"
)

# Load the fine-tuned adapter into the quantized model
fine_tuned_model = PeftModel.from_pretrained(fine_tuned_model, fine_tuned_adapter_path)
fine_tuned_model.eval()  # Set to evaluation mode

# Load tokenizer and processor for the model
# NOTE(review): unlike the training cell, no custom chat template is assigned to this
# tokenizer; prompts in this cell are assembled by hand.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
processor = AutoProcessor.from_pretrained(base_model_path)

# Function to generate response from the model based on the image and prompt
def generate_response(model, image_path, prompt="What is the clothing style in this image?"):
    """Ask ``model`` a question about one image and return the generated answer.

    Uses the module-level ``processor`` (to build the inputs) and ``tokenizer``
    (to decode the output). Generation samples (``do_sample=True``), so repeated
    calls may return different text.

    Args:
        model: Model exposing ``generate(**inputs, ...)``. Inputs are moved to
            ``model.device`` when that attribute exists, otherwise to ``'cuda'``.
        image_path: Path of the image file to describe.
        prompt: Question asked about the image.

    Returns:
        str: The decoded text of the newly generated tokens only, stripped of
        surrounding whitespace and of any trailing end-of-sequence marker text.
    """
    # Load the image; convert() yields a fully loaded copy, so close the file right away
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    # Use the same conversation layout the model was fine-tuned on (system sentence,
    # "USER: <text><image> ", then "ASSISTANT:") so the model is asked to continue
    # an assistant turn instead of an unmarked piece of text.
    prompt_with_image = (
        "A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions. "
        f"USER: {prompt}<image> ASSISTANT:"
    )

    # Send the inputs to wherever the model lives rather than assuming a device
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = 'cuda'
    inputs = processor(text=prompt_with_image, images=image, return_tensors="pt").to(target_device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate the response without computing gradients (no training)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )

    # generate() returns the prompt tokens followed by the new ones; decode only the
    # new tokens. Slicing by length is robust even when the answer repeats the prompt,
    # which splitting the decoded text on the prompt string is not.
    generated_tokens = output[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # Remove any common end tokens that were emitted as plain text
    end_tokens = ["</s>", "<|endoftext|>", "<|end|>"]
    for token in end_tokens:
        response = response.split(token)[0].strip()

    return response

# Example prompt for generating responses from both models.
# The same question is passed to the base and the fine-tuned model in the cells below,
# so their answers can be compared.
prompt = "What is the clothing style in this image?"



In [None]:
# Ask the unquantized float16 base model and show its answer
base_model_response = generate_response(
    model=base_model,
    image_path=image_path,
    prompt=prompt,
)
print(f"Base Model Response: {base_model_response}")

In [None]:
# Ask the 4-bit quantized model with the fine-tuned adapter and show its answer
fine_tuned_model_response = generate_response(
    model=fine_tuned_model,
    image_path=image_path,
    prompt=prompt,
)
print(f"Fine-tuned Model Response: {fine_tuned_model_response}")


In [None]:
from PIL import Image
import matplotlib.pyplot as plt

# Image to preview; this is the same file path the inference cell assigns to image_path
image_path = "./llava/img/1.jpg"

def display_image(image_path):
    """Show the image stored at ``image_path`` in a 5x5-inch figure without axes.

    Args:
        image_path: Path of the image file to display.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    # Keep the file open only while matplotlib takes over the pixel data; imshow
    # converts the image to an array, so the handle can be released afterwards.
    with Image.open(image_path) as img:
        # Set up the display with matplotlib
        plt.figure(figsize=(5, 5))
        plt.imshow(img)

    plt.axis('off')  # Hide axis

    # Show the image
    plt.show()

# Render the preview image selected above
display_image(image_path=image_path)
