In [10]:
from PIL import Image
from transformers import AutoModelForCausalLM 
from transformers import AutoProcessor
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    pipeline
)
import math
from peft import LoraConfig, get_peft_model

model_id = "microsoft/Phi-3-vision-128k-instruct" 

model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda", trust_remote_code=True, torch_dtype="auto", _attn_implementation='eager')

processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) 

messages = [ 
    {"role": "user", "content": """ <|image_1|> Your task is to extract the information for the fields provided below. Extract the information in JSON format according to the following JSON schema:{
  "$defs": {
    "InvoiceLineItem": {
      "properties": {
        "name": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The name of the menu item",
          "title": "Name"
        },
        "net_unit_price": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The unit price before tax",
          "title": "Net Unit Price"
        },
        "unit_tax": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Tax amount per unit",
          "title": "Unit Tax"
        },
        "gross_unit_price": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Unit price including tax",
          "title": "Gross Unit Price"
        },
        "quantity": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Quantity ordered (can be decimal for weights/volumes/litres)",
          "title": "Quantity"
        },
        "net_price": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Total price before tax (quantity \u00d7 net_unit_price)",
          "title": "Net Price"
        },
        "tax_amount": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Total tax amount (quantity \u00d7 unit_tax)",
          "title": "Tax Amount"
        },
        "gross_price": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Total price including tax",
          "title": "Gross Price"
        },
        "sub_items": {
          "anyOf": [
            {
              "items": {
                "$ref": "#/$defs/InvoiceSubLineItem"
              },
              "type": "array"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Additional components or modifications",
          "identifier_field_name": "nm",
          "title": "Sub Items"
        },
        "net_sub_items_total": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Total price of all sub-items before tax",
          "title": "Net Sub Items Total"
        },
        "gross_sub_items_total": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Total price of all sub-items including tax",
          "title": "Gross Sub Items Total"
        },
        "net_total": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Combined net price of item and sub-items before discounts",
          "title": "Net Total"
        },
        "net_discounts": {
          "anyOf": [
            {
              "items": {
                "type": "string"
              },
              "type": "array"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Discounts applied to net total of this item",
          "title": "Net Discounts",
          "unordered": true
        },
        "total_tax": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Combined tax amount for item and sub-items",
          "title": "Total Tax"
        },
        "gross_discounts": {
          "anyOf": [
            {
              "items": {
                "type": "string"
              },
              "type": "array"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Discounts applied to the gross total of this item",
          "title": "Gross Discounts",
          "unordered": true
        },
        "gross_total": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "Final price including tax and after discounts",
          "title": "Gross Total"
        }
      },
      "title": "InvoiceLineItem",
      "type": "object"
    },
    "InvoiceSubLineItem": {
      "properties": {
        "name": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The name of the sub-item or modification",
          "title": "Name"
        },
        "net_unit_price": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The unit price of the sub-item before tax",
          "title": "Net Unit Price"
        },
        "unit_tax": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The tax amount per unit of the sub-item",
          "title": "Unit Tax"
        },
        "gross_unit_price": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The unit price of the sub-item including tax",
          "title": "Gross Unit Price"
        },
        "quantity": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The quantity of the sub-item (can be a decimal for items sold by weight or volume)",
          "title": "Quantity"
        },
        "net_price": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The total price of the sub-item before tax",
          "title": "Net Price"
        },
        "tax_amount": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The total tax amount for the sub-item",
          "title": "Tax Amount"
        },
        "gross_price": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "description": "The total price of the sub-item including tax",
          "title": "Gross Price"
        }
      },
      "title": "InvoiceSubLineItem",
      "type": "object"
    }
  },
  "properties": {
    "base_taxable_amount": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The base amount that is subject to tax",
      "title": "Base Taxable Amount"
    },
    "net_discounts": {
      "anyOf": [
        {
          "items": {
            "type": "string"
          },
          "type": "array"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Discounts applied to taxable amount before tax calculation",
      "title": "Net Discounts",
      "unordered": true
    },
    "net_service_charge": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Service charge applied to taxable amount before tax calculation",
      "title": "Net Service Charge"
    },
    "taxable_amount": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The amount that is subject to tax. This is the base amount plus net discounts and net service charges",
      "title": "Taxable Amount"
    },
    "non_taxable_amount": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The base amount that is not subject to tax",
      "title": "Non Taxable Amount"
    },
    "net_total": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Sum of taxable and non-taxable amounts with their modifiers",
      "title": "Net Total"
    },
    "tax_rate": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Tax rate percentage applied to taxable amount",
      "title": "Tax Rate"
    },
    "tax_amount": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Total amount of tax on the invoice",
      "title": "Tax Amount"
    },
    "base_gross_total": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The base amount that is subject to gross discounts and service charges",
      "title": "Base Gross Total"
    },
    "gross_discounts": {
      "anyOf": [
        {
          "items": {
            "type": "string"
          },
          "type": "array"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Discounts applied to entire net total after tax",
      "title": "Gross Discounts",
      "unordered": true
    },
    "gross_service_charge": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Service charge applied to entire net total after tax",
      "title": "Gross Service Charge"
    },
    "gross_total": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Final amount after all taxes and modifications",
      "title": "Gross Total"
    },
    "rounding_adjustment": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Amount added/subtracted to round to desired precision",
      "title": "Rounding Adjustment"
    },
    "commission_fee": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Commission amount deducted from total",
      "title": "Commission Fee"
    },
    "due_amount": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The amount due for the transaction before considering prior balance",
      "title": "Due Amount"
    },
    "prior_balance": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Previous balance or credit applied to the current transaction",
      "title": "Prior Balance"
    },
    "net_due_amount": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The final amount due after applying prior balance",
      "title": "Net Due Amount"
    },
    "paid_amount": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The total amount paid by the customer",
      "title": "Paid Amount"
    },
    "change_amount": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The amount returned to the customer if overpayment occurred",
      "title": "Change Amount"
    },
    "cash_amount": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The amount paid in cash",
      "title": "Cash Amount"
    },
    "creditcard_amount": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The amount paid by credit card",
      "title": "Creditcard Amount"
    },
    "emoney_amount": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The amount paid using electronic money",
      "title": "Emoney Amount"
    },
    "other_payments": {
      "anyOf": [
        {
          "items": {
            "type": "string"
          },
          "type": "array"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Amounts paid using other methods (e.g., coupons, vouchers)",
      "title": "Other Payments",
      "unordered": true
    },
    "menutype_count": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The number of distinct menu item types in the order",
      "title": "Menutype Count"
    },
    "menuquantity_sum": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "The total quantity of all menu items ordered",
      "title": "Menuquantity Sum"
    },
    "line_items": {
      "anyOf": [
        {
          "items": {
            "$ref": "#/$defs/InvoiceLineItem"
          },
          "type": "array"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "description": "Detailed list of individual items in the order",
      "identifier_field_name": "nm",
      "title": "Line Items"
    }
  },
  "title": "Invoice",
  "type": "object"
}"""}
]


print(model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.11s/it]


Phi3VForCausalLM(
  (model): Phi3VModel(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (vision_embed_tokens): Phi3ImageEmbedding(
      (drop): Dropout(p=0.0, inplace=False)
      (wte): Embedding(32064, 3072, padding_idx=32000)
      (img_processor): CLIPVisionModel(
        (vision_model): CLIPVisionTransformer(
          (embeddings): CLIPVisionEmbeddings(
            (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
            (position_embedding): Embedding(577, 1024)
          )
          (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (encoder): CLIPEncoder(
            (layers): ModuleList(
              (0-23): 24 x CLIPEncoderLayer(
                (self_attn): CLIPSdpaAttention(
                  (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                  (v_proj): Linear(in_features=1024, out_features=1024, bias=

In [3]:
import json
def load_json_lines(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    json_objects = []
    for line in lines:
        obj = json.loads(line.strip())
        json_objects.append(obj)
    
    target = []
    image = []
    for label in json_objects:    
        target.append(label["target"])
        image.append(label["id"])
    
    return image, target

label_path = r'C:\Users\Admin\Documents\Internship\sroie\train-documents.jsonl'
label_val = r'C:\Users\Admin\Documents\Internship\sroie\validation-documents.jsonl'
label_test = r'C:\Users\Admin\Documents\Internship\sroie\test-documents.jsonl'
image, labels = load_json_lines(label_path)
image_val, labels_val = load_json_lines(label_val)
image_test, labels_test = load_json_lines(label_test)

In [4]:
from datasets import Dataset, DatasetDict
from PIL import Image
import os

def load_data(images, target, path):
    images_files = []
    labels = []
    i = 0
    for image in images:
        image_with_extension = f"{image}.jpg"
        file_path = os.path.join(path, image_with_extension)
        img = Image.open(file_path)
        images_files.append(img)
        string_data = json.dumps(target[i])
        labels.append(string_data)
        i+=1  
                
    return {'image': images_files, 'label': labels}

data_path = r"C:\Users\Admin\Documents\Internship\sroie\images"


data_dict = load_data(image, labels, data_path)
dict_val = load_data(image_val,labels_val, data_path)
dict_test = load_data(image_test,labels_test,data_path)


data_train = Dataset.from_dict(data_dict)
data_val = Dataset.from_dict(dict_val)
data_test = Dataset.from_dict(dict_val)

print(data_train)

Dataset({
    features: ['image', 'label'],
    num_rows: 499
})


In [5]:
def get_full_dataset(train,val,test):
    dataset = DatasetDict({
        'train': train,
        'validation': val,
        'test': test
    })
    return dataset

dataset = get_full_dataset(data_train,data_val,data_test)

In [6]:
def tokenize_function(example):
    images = example['image']  
    labels = example['label']  
    prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=prompt,
        images=images,
        return_tensors="pt", 
        padding=True,
        truncation=True
    )

    if isinstance(labels[0], str): 
        label_inputs = processor.tokenizer(
            labels,
            return_tensors="pt",
            padding=True,
            truncation=True
        )
        labels = label_inputs["input_ids"] 
    inputs["labels"] = labels

    return inputs
tokenized_dataset = dataset.map(tokenize_function,batched=True,num_proc=1, batch_size=1,remove_columns=["image", "label"], writer_batch_size=100, load_from_cache_file=False,)

print(tokenized_dataset)

Map: 100%|██████████| 499/499 [03:30<00:00,  2.37 examples/s]
Map: 100%|██████████| 124/124 [00:37<00:00,  3.27 examples/s]
Map: 100%|██████████| 124/124 [00:35<00:00,  3.52 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'pixel_values', 'image_sizes', 'labels'],
        num_rows: 499
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'pixel_values', 'image_sizes', 'labels'],
        num_rows: 124
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'pixel_values', 'image_sizes', 'labels'],
        num_rows: 124
    })
})





In [7]:
lora_config = LoraConfig(
    r=128, 
    lora_alpha=16, 
    lora_dropout=0.1,  
    target_modules=["k_proj", "v_proj", "q_proj", "out_proj"], 
    bias = 'none'
)

In [11]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import get_peft_model

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  

trainable params: 25,165,824 || all params: 4,171,787,264 || trainable%: 0.6032


In [12]:
import wandb
from transformers import Trainer, TrainingArguments
import torch
model = model.to(torch.device("cuda"))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",              
    per_device_train_batch_size=1,      
    gradient_accumulation_steps=4,      
    num_train_epochs=3,                 
    learning_rate=3e-5,                 
    weight_decay=0.01,                  
    logging_dir="./logs",               
    logging_steps=10,                   
    save_strategy="epoch",              
    evaluation_strategy="epoch",        
    push_to_hub=False,                  
    report_to="wandb",                  
    run_name="testing"    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],       # Pre-tokenized training dataset
    eval_dataset=tokenized_dataset["validation"],   # Pre-tokenized validation dataset
)

# Start training
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlambd-22bi13234[0m ([33mlambd-22bi13234-university-of-science-and-technology-of-[0m). Use [1m`wandb login --relogin`[0m to force relogin


  attn_output = torch.nn.functional.scaled_dot_product_attention(


OutOfMemoryError: CUDA out of memory. Tried to allocate 78.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 13.96 GiB is allocated by PyTorch, and 224.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)