Kernel: huggingface1

### Imports

In [None]:
import torch
from transformers import AutoProcessor, AutoModelForCausalLM  
from PIL import Image
import requests
import copy

%matplotlib inline

In [None]:
import io
import sys
import os
import re
import json
import torch
import html
import base64
import itertools

import numpy as np
# import supervision as sv

from IPython.core.display import display, HTML #DeprecationWarning    
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,
    AutoModelForCausalLM,
    AutoProcessor,
    get_scheduler
)
from tqdm import tqdm
from typing import List, Dict, Any, Tuple, Generator
from peft import LoraConfig, get_peft_model
from PIL import Image

import matplotlib.pyplot as plt  

In [None]:
# Make the project-local `dataset` and `util` directories importable.
# NOTE: both paths are relative, so os.path.abspath resolves them against the
# kernel's current working directory.

dataset_path = '../dataset'
util_path = '../util'

for path_to_add in (dataset_path, util_path):

    # Absolute form of the directory, used both for the membership check and the append
    child_dir = os.path.abspath(path_to_add)

    # Append only once so that re-running this cell does not grow sys.path
    if child_dir not in sys.path:
        sys.path.append(child_dir)
        # Report the actual directory (the message used to print the literal text 'child_dir')
        print(f'{child_dir} added to sys.path')
    else:
        print(f'{child_dir} already in sys.path')

In [None]:
from sklearn.model_selection import train_test_split  
from dataset import CaptionsDataset

### Config

In [None]:
# Root directory of the face-mask images.
# The default below is specific to one compute instance; set the
# FACE_MASK_IMAGES_DIR environment variable to use another location without
# editing this cell.
# Previously used location:
# images_base_path = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/computeinstance10-gpu/code/datasets/face_mask/images'
images_base_path = os.environ.get(
    'FACE_MASK_IMAGES_DIR',
    '/mnt/batch/tasks/shared/LS_root/mounts/clusters/computeinstance10-1-gpu/code/datasets/face_mask/images',
)
annotations_coco_path = '../annotations/face_bbox_annotations_sample.json' # File downloaded from AzureML as COCO file

annotations_captions_jsonl_base_path = '../annotations' # Directory of the train and test caption JSONL files
task = 'more_detailed_caption' # Used as part of the checkpoint directory name

In [None]:
# Hugging Face model id to fine-tune; keep exactly one assignment uncommented.
model_id = 'microsoft/Florence-2-base' # <-- currently being tested

# model_id = 'microsoft/Florence-2-large'

# model_id = 'microsoft/Florence-2-large-ft'

# model_id = 'microsoft/Florence-2-base-ft' # <-- already tested

# Optional model revision (e.g. 'refs/pr/6'); the model-loading cell only
# forwards it to from_pretrained when it is truthy.
revision = None

print(f'model_id:{model_id}, revision:{revision}')

In [None]:
# Per-model training hyperparameters.
# Every selectable model id must land in one branch: without the final `else`
# an unlisted id would leave BATCH_SIZE / EPOCHS / LR undefined and only fail
# later, in the DataLoader or training cells, with a NameError.
if model_id in ('microsoft/Florence-2-base-ft', 'microsoft/Florence-2-base'):
    test_size = 0.1
    BATCH_SIZE = 4
    NUM_WORKERS = 0
    EPOCHS = 50 #200 #400 #200 #100
    LR = 2e-6 #1e-6 #5e-6
# NOTE(review): 'large-ft' reuses the 'large' settings, mirroring how the two
# base variants share one group -- confirm these values suit it.
elif model_id in ('microsoft/Florence-2-large', 'microsoft/Florence-2-large-ft'):
    test_size = 0.1
    BATCH_SIZE = 2 # Smaller batch size for bigger model
    NUM_WORKERS = 0
    EPOCHS = 15 #200 #400 #200 #100
    LR = 2e-6 #1e-6 #5e-6
else:
    raise ValueError(f'No training hyperparameters defined for model_id: {model_id!r}')

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f'device:{device}')

The annotation files loaded below were created by the 'generate-annotations.ipynb' notebook.

In [None]:
# When True, the following cell selects the '*_train_trimmed' / '*_test_trimmed'
# annotation files instead of the '*_train' / '*_test' ones.
use_trimmed_files = True

In [None]:
# Resolve the train/test caption annotation files (created previously).
# The trimmed variants differ only by a '_trimmed' suffix before the extension.
trim_suffix = '_trimmed' if use_trimmed_files else ''

annotations_jsonl_path_train, annotations_jsonl_path_test = (
    os.path.join(
        annotations_captions_jsonl_base_path,
        f'face_caption_annotations_sample_{split_name}{trim_suffix}.jsonl',
    )
    for split_name in ('train', 'test')
)

print(f'annotations_jsonl_path_train:{annotations_jsonl_path_train}')
print(f'annotations_jsonl_path_test:{annotations_jsonl_path_test}')

### Common

In [None]:
def load_json(json_file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode as UTF-8 explicitly: without `encoding`, open() falls back to the
    # locale's preferred encoding, which can mis-read non-ASCII text.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [None]:
def run_example(model, task_prompt, text_input=None):
    """Generate and post-process the model output for one task prompt.

    Relies on the notebook-level globals ``processor``, ``image`` and ``device``.

    Args:
        model: Model whose ``generate`` method is called.
        task_prompt: Prompt string; also passed as ``task`` to the post-processing step.
        text_input: Optional text appended to ``task_prompt``.

    Returns:
        The value returned by ``processor.post_process_generation``.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input

    inputs = processor(text=prompt, images=image, return_tensors="pt")

    # Move the input data to the GPU when one is in use
    if device == 'cuda':
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Beam search without sampling
    generation_options = dict(
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        **generation_options,
    )

    # Only the first decoded sequence is post-processed; special tokens are kept
    decoded_sequences = processor.batch_decode(generated_ids, skip_special_tokens=False)

    return processor.post_process_generation(
        decoded_sequences[0],
        task=task_prompt,
        image_size=(image.width, image.height),
    )

In [None]:
def plot_loss(avg_train_loss_list, avg_val_loss_list, size=(10, 5), title='Training vs Validation Loss',
              x_label='Epochs', y_label='Loss', train_legend='Training Loss', val_legend='Validation Loss'):
    """Draw the training and validation loss curves on one figure and show it.

    Args:
        avg_train_loss_list: Training loss values, plotted against their index.
        avg_val_loss_list: Validation loss values, plotted against their index.
        size: Figure size passed to matplotlib as ``figsize``.
        title: Plot title.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        train_legend: Legend entry of the training curve.
        val_legend: Legend entry of the validation curve.
    """
    _, ax = plt.subplots(figsize=size)

    # One line per loss series, each with its own legend entry
    for series, legend_label in ((avg_train_loss_list, train_legend),
                                 (avg_val_loss_list, val_legend)):
        ax.plot(series, label=legend_label)

    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend()

    plt.show()

### Model

In [None]:
# Start from a clean slate so a failed load does not leave stale objects bound
model = None
processor = None

# Arguments shared by the model and the processor; a revision is forwarded
# only when one is configured.
pretrained_kwargs = {'trust_remote_code': True}
if revision:
    pretrained_kwargs['revision'] = revision

# model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).eval().to(device)
model = AutoModelForCausalLM.from_pretrained(model_id, **pretrained_kwargs).to(device)
processor = AutoProcessor.from_pretrained(model_id, **pretrained_kwargs)

print(f'Model loaded')

### Dataset

In [None]:
# Initiate Dataset and DataLoader for train and validation subsets

def collate_fn(batch):
    """Turn a list of (question, answer, image) samples into model inputs.

    Uses the notebook-level ``processor`` and ``device``.

    Args:
        batch: Iterable of 3-item samples as yielded by the dataset.

    Returns:
        A pair ``(inputs, answers)``: the processor output moved to ``device``
        and the tuple of answers, left untokenized.
    """
    prompts, targets, pil_images = zip(*batch)
    encoded = processor(
        text=list(prompts),
        images=list(pil_images),
        return_tensors="pt",
        padding=True,
    )
    return encoded.to(device), targets

# Both splits read images from the same directory; only the JSONL file differs.
train_dataset, val_dataset = (
    CaptionsDataset(jsonl_file_path=jsonl_path, image_directory_path=images_base_path)
    for jsonl_path in (annotations_jsonl_path_train, annotations_jsonl_path_test)
)

# Options common to both loaders; only the training loader shuffles.
loader_options = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=NUM_WORKERS)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, **loader_options)

In [None]:
# Wrap the base Florence-2 model with LoRA adapters

# Module names that receive adapters
lora_target_modules = [
    "q_proj", "o_proj", "k_proj", "v_proj",
    "linear", "Conv2d", "lm_head", "fc2",
]

config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32, #16, #8
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules=lora_target_modules,
    bias="none",
    use_rslora=True,
    init_lora_weights="gaussian",
    inference_mode=False,
    revision=revision,
)

peft_model = get_peft_model(model, config)

# Print how many parameters are trainable after wrapping
peft_model.print_trainable_parameters()

In [None]:
# Release cached GPU memory that PyTorch is not currently using, before training starts.
torch.cuda.empty_cache()

### Fine-tune

In [None]:
# training loop
def _caption_batch_loss(model, processor, inputs, answers):
    """Return the model loss for one collated batch.

    Args:
        model: Model called with ``input_ids``, ``pixel_values`` and ``labels``.
        processor: Processor whose ``tokenizer`` encodes ``answers`` into labels.
        inputs: Mapping that provides ``input_ids`` and ``pixel_values``.
        answers: Target texts of the batch.

    Returns:
        The ``loss`` attribute of the model output.
    """
    input_ids = inputs["input_ids"]
    pixel_values = inputs["pixel_values"]

    # NOTE(review): padded label positions are passed to the model unchanged, so
    # they presumably contribute to the loss -- confirm this is intended.
    labels = processor.tokenizer(
        text=answers,
        return_tensors="pt",
        padding=True,
        return_token_type_ids=False
    ).input_ids.to(input_ids.device)  # keep labels on the same device as the inputs

    outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
    return outputs.loss


def train_model(train_loader, val_loader, model, processor, epochs=10, lr=1e-6):
    """Fine-tune ``model`` and save the result after the last epoch.

    Each epoch runs one pass over ``train_loader`` with an AdamW optimizer and
    a linear learning-rate schedule (no warm-up), followed by one gradient-free
    pass over ``val_loader``. After the final epoch the model and processor
    are saved under ``../model_checkpoints/`` in a directory named from the
    notebook-level ``model_id`` and ``task`` and from ``epochs``.

    Args:
        train_loader: Loader yielding ``(inputs, answers)`` training batches.
        val_loader: Loader yielding ``(inputs, answers)`` validation batches.
        model: Model to train; must provide ``save_pretrained``.
        processor: Processor providing ``tokenizer`` and ``save_pretrained``.
        epochs: Number of passes over ``train_loader``.
        lr: Initial learning rate of the optimizer.

    Returns:
        ``(avg_train_loss_list, avg_val_loss_list)``: one average loss per
        epoch for the training and the validation split respectively.
    """
    avg_train_loss_list = []
    avg_val_loss_list = []

    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = epochs * len(train_loader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for inputs, answers in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
            loss = _caption_batch_loss(model, processor, inputs, answers)

            # One optimisation step; the scheduler advances once per batch
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Average Training Loss: {avg_train_loss}")
        avg_train_loss_list.append(avg_train_loss)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for inputs, answers in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}/{epochs}"):
                val_loss += _caption_batch_loss(model, processor, inputs, answers).item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Average Validation Loss: {avg_val_loss}")
        avg_val_loss_list.append(avg_val_loss)

    # Save the state reached after the last epoch.
    # The directory name uses the `epochs` argument (not the notebook-level
    # EPOCHS constant) so that it always reflects the run that produced it.
    checkpoint_path = (
        '../model_checkpoints/'
        + model_id.replace('/','_').replace('-','_').replace(' ','_')
        + '_' + task + '_epoch_' + str(epochs)
    )
    os.makedirs(checkpoint_path, exist_ok=True)
    model.save_pretrained(checkpoint_path)
    processor.save_pretrained(checkpoint_path)

    return avg_train_loss_list, avg_val_loss_list

In [None]:
%%time
# Fine-tune the LoRA-wrapped model with the configured EPOCHS / LR, then plot
# the per-epoch average training and validation losses that train_model returns.
avg_train_loss_list, avg_val_loss_list = train_model(train_loader, val_loader, peft_model, processor, epochs=EPOCHS, lr=LR)

plot_loss(avg_train_loss_list, avg_val_loss_list, size=(6, 3), title='Training vs Validation Loss',   
              x_label='Epochs', y_label='Loss', train_legend='Training Loss', val_legend='Validation Loss')

### Load fine-tuned model

In [None]:
# Rebuild the directory name used when the checkpoint was saved:
# '/', '-' and ' ' in the model id are each replaced by '_'.
checkpoint_model_tag = model_id.translate(str.maketrans('/- ', '___'))
checkpoint_path = f'../model_checkpoints/{checkpoint_model_tag}_{task}_epoch_{EPOCHS}'

checkpoint_load_kwargs = dict(trust_remote_code=True, revision=revision)

model_ft = AutoModelForCausalLM.from_pretrained(checkpoint_path, **checkpoint_load_kwargs).to(device)
# Note: this rebinds the notebook-level `processor` to the one stored with the checkpoint
processor = AutoProcessor.from_pretrained(checkpoint_path, **checkpoint_load_kwargs)

print(f'Loaded checkpoint_path:{checkpoint_path}')

In [None]:
# Choose the image to caption: a file from the dataset directory (default)
# or a sample picture downloaded from the Hugging Face documentation images.
use_own_image = True
image_path = os.path.join(images_base_path,'maksssksksss712.png')

print(f'use_own_image: {use_own_image}')

if not use_own_image:
    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
    # A timeout keeps the cell from hanging on a stalled connection, and
    # raise_for_status surfaces HTTP errors before PIL tries to parse the body.
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    # Convert to RGB so both branches yield the same image mode
    image = Image.open(response.raw).convert('RGB')
else:
    image = Image.open(image_path).convert('RGB')

# See input image
image

In [None]:
# %%time

# task_prompt = '<MORE_DETAILED_CAPTION_CUSTOM>'

# print(f'use_own_image:{use_own_image}')

# results = run_example(model_ft,task_prompt)
# print(results)

In [None]:
%%time

# Task prompt handed to run_example, which also uses it as the post-processing task
task_prompt = '<MORE_DETAILED_CAPTION>'

print(f'use_own_image:{use_own_image}')

# Run the fine-tuned model on the image selected above and print the parsed result
results = run_example(model_ft,task_prompt)
print(results)

DONE:
- LoRA fine-tune (base-ft model) - CAPTION + {} - 100 epochs, ~6 Hrs, microsoft_Florence_2_base_ft_more_detailed_caption_epoch_100
- LoRA fine-tune (large model) - CAPTION + {} - 50 epochs, ~8 Hrs, microsoft_Florence_2_large_more_detailed_caption_epoch_50
- LoRA fine-tune (base-ft model) - {} only - 25 epochs, ~42 min (V100), microsoft_Florence_2_base_ft_more_detailed_caption_epoch_25
- LoRA fine-tune (large model) - {} only - 15 epochs, ~60 min (V100), microsoft_Florence_2_large_more_detailed_caption_epoch_15

- LoRA fine-tune (base model) - {} only - 50 epochs, ~1 Hr 23 min (V100), microsoft_Florence_2_base_more_detailed_caption_epoch_50, LoRA r = 32

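TO DO (use an epoch count not listed above: the checkpoint directory name is built from the model id, the task and the epoch count, so reusing a count overwrites an existing checkpoint):

- LoRA fine-tune (base-ft model) - {} only - New task: '<MORE_DETAILED_CAPTION_CUSTOM>'
- LoRA fine-tune (large model) - {} only - New task: '<MORE_DETAILED_CAPTION_CUSTOM>'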
