## Libraries

In [3]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel, AutoModelForImageClassification, ViTModel
from transformers import AutoTokenizer, AutoModelForCausalLM, DonutProcessor
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import Trainer, TrainingArguments
from PIL import Image
import numpy as np
from torchvision import transforms
from datasets import load_dataset
from unids import DonutDataset
from torch.utils.data import DataLoader
import torch.optim as optim
from peft import get_peft_model, LoraConfig, TaskType
from peft import prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
import os

## Paths

In [4]:
base_dir = '/'.join(os.getcwd().split('/')[:3])

vit_path = f'{base_dir}/work/models/vit'
llama_path = f'{base_dir}/work/models/llama2'
mistral_path = f'{base_dir}/work/models/mistral'
gemma2_path = f'{base_dir}/work/models/gemma2'
unichart_path = f'{base_dir}/work/models/unichart/Encoder'
ds_path = f'{base_dir}/uniptds'
gemma22_path = f'{base_dir}/work/models/gemma2-2B'

image_folder = f'{base_dir}/content/tmp'

## Setting Device

In [3]:
device = "cuda:0"

In [4]:
!export TOKENIZERS_PARALLELISM=false

## Custom models

In [5]:
class test_processor():
    def __init__(self, vision_processor, text_tokenizer):
        self.processor = vision_processor
        self.tokenizer = text_tokenizer

    def __call__(self, image):
        image_tensor = vision_transform(image).unsqueeze(0)
        return image_tensor

In [6]:
class AlignmentMLP(nn.Module):
    def __init__(self, vision_dim, text_dim, hidden_dim):
        super(AlignmentMLP, self).__init__()
        self.fc1 = nn.Linear(vision_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, text_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [7]:
class MultimodalModel(nn.Module):
    def __init__(self, vision_model, alignment_mlp, text_model):
        super(MultimodalModel, self).__init__()
        self.vision_model = vision_model
        self.alignment_mlp = alignment_mlp
        self.text_model = text_model

    def forward(self, pixel_values, input_ids):
        tmp = self.vision_model(pixel_values)
        vision_outputs = tmp.last_hidden_state
        print(vision_outputs.shape, " !! VIT ")
        aligned_features = self.alignment_mlp(vision_outputs)

        text_embeddings = self.text_model.get_input_embeddings()(input_ids)
        print(aligned_features.shape, " @@ aligned ")
        print(text_embeddings.shape, " ## embeddings ")
        combined_features = torch.cat((aligned_features, text_embeddings), dim=1)

        outputs = self.text_model(inputs_embeds=combined_features)

        return outputs.logits
        
        # print(pixel_values.shape, " #@#@#! ")
        # tmp = self.vision_model(pixel_values)
        # vision_outputs = tmp.last_hidden_state[0]
        # print(vision_outputs.shape, " ^^^^^ ")
        # aligned_features = self.alignment_mlp(vision_outputs)
        # print(aligned_features.shape, " nomnomnom ")
        # text_embeddings = self.text_model.get_input_embeddings()(input_ids)
        # combined_features = torch.cat((aligned_features.unsqueeze(0), text_embeddings), dim=1)

        # outputs = self.text_model(inputs_embeds=combined_features)
        # ######## INCLUDE IF USING TRAINER #############
        # # logits = outputs.logits[:,577:]
        # # logits = logits.view(-1, 32000)  # Shape: [1024, 128256]
        # # labels = labels.view(-1)
        # # criterion = nn.CrossEntropyLoss()
        # # loss = criterion(logits, labels)
        # # #return {"logits": outputs.logits, "loss": loss}
        # # return loss
        # #####################################################

        # return {"logits": outputs.logits}

## Loading Models

In [8]:
!sq

            JOBID     USER              ACCOUNT           NAME  ST  TIME_LEFT NODES CPUS TRES_PER_N MIN_MEM NODELIST (REASON) 
         23837177    msm97       ctb-enamul_gpu    interactive   R    4:14:57     1   32 gres:gpu:a    125G gra1361 (None) 


In [9]:
vision_model = ViTModel.from_pretrained(vit_path)
vision_transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

Some weights of ViTModel were not initialized from the model checkpoint at /local/msm97.23837177.0/work/models/vit and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# 1024 for LLama, 2048 for gemma
alignment_mlp = AlignmentMLP(vision_dim=768, text_dim=2304, hidden_dim=4096)

In [11]:
# text_tokenizer = AutoTokenizer.from_pretrained(llama_path)
# text_model = LlamaForCausalLM.from_pretrained(llama_path)
text_tokenizer = AutoTokenizer.from_pretrained(gemma22_path)
text_model = AutoModelForCausalLM.from_pretrained(gemma22_path)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
text_tokenizer.pad_token

'<pad>'

In [13]:
text_tokenizer.eos_token

'<eos>'

In [14]:
text_tokenizer.pad_token = text_tokenizer.eos_token
text_tokenizer.padding_side = "right"

In [15]:
processor = test_processor(vision_transform, text_tokenizer)

In [16]:
multimodal_model = MultimodalModel(vision_model, alignment_mlp, text_model)
multimodal_model.to(device)

MultimodalModel(
  (vision_model): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fea

In [17]:
def freeze_model_weights(model):
    for param in model.parameters():
        param.requires_grad = False


In [18]:
def print_model_summary(model):
    print(f"{'Layer':<20} {'Output Shape':<25} {'Param #':<15}")
    print("=" * 60)
    total_params = 0
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        param_shape = list(param.shape)
        param_count = param.numel()
        total_params += param_count
        print(f"{name:<20} {str(param_shape):<25} {param_count:<15}")
    print("=" * 60)
    print(f"Total Trainable Params: {total_params}")

# Print the model summary
print_model_summary(multimodal_model)

Layer                Output Shape              Param #        
vision_model.embeddings.cls_token [1, 1, 768]               768            
vision_model.embeddings.position_embeddings [1, 577, 768]             443136         
vision_model.embeddings.patch_embeddings.projection.weight [768, 3, 16, 16]          589824         
vision_model.embeddings.patch_embeddings.projection.bias [768]                     768            
vision_model.encoder.layer.0.attention.attention.query.weight [768, 768]                589824         
vision_model.encoder.layer.0.attention.attention.query.bias [768]                     768            
vision_model.encoder.layer.0.attention.attention.key.weight [768, 768]                589824         
vision_model.encoder.layer.0.attention.attention.key.bias [768]                     768            
vision_model.encoder.layer.0.attention.attention.value.weight [768, 768]                589824         
vision_model.encoder.layer.0.attention.attention.value.bias [76

In [19]:
freeze_model_weights(multimodal_model.vision_model)
#freeze_model_weights(multimodal_model.text_model)

In [20]:
for name, param in multimodal_model.named_parameters():
    print(f"{name}: {param.requires_grad}")

vision_model.embeddings.cls_token: False
vision_model.embeddings.position_embeddings: False
vision_model.embeddings.patch_embeddings.projection.weight: False
vision_model.embeddings.patch_embeddings.projection.bias: False
vision_model.encoder.layer.0.attention.attention.query.weight: False
vision_model.encoder.layer.0.attention.attention.query.bias: False
vision_model.encoder.layer.0.attention.attention.key.weight: False
vision_model.encoder.layer.0.attention.attention.key.bias: False
vision_model.encoder.layer.0.attention.attention.value.weight: False
vision_model.encoder.layer.0.attention.attention.value.bias: False
vision_model.encoder.layer.0.attention.output.dense.weight: False
vision_model.encoder.layer.0.attention.output.dense.bias: False
vision_model.encoder.layer.0.intermediate.dense.weight: False
vision_model.encoder.layer.0.intermediate.dense.bias: False
vision_model.encoder.layer.0.output.dense.weight: False
vision_model.encoder.layer.0.output.dense.bias: False
vision_model

In [21]:
!nvidia-smi

Sun Aug  4 13:23:23 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   42C    P0              70W / 300W |  10959MiB / 81920MiB |     13%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:65:00.0 Off |  

In [22]:
# Read the memory info from /proc/meminfo
with open('/proc/meminfo', 'r') as meminfo:
    lines = meminfo.readlines()

# Extract total and available memory in kB
mem_total = int([x for x in lines if 'MemTotal:' in x][0].split()[1])
mem_available = int([x for x in lines if 'MemAvailable:' in x][0].split()[1])

print(f"Total Memory: {mem_total / 1024:.2f} MB")
print(f"Available Memory: {mem_available / 1024:.2f} MB")


Total Memory: 257419.61 MB
Available Memory: 244247.91 MB


## Loading Data

In [23]:
# Summarization Start Index: 2768876
# Summarization End Index: 3222856 (Inclusive)

# Chart2table Start Index: 5637216

#dataset size: 6898333

In [6]:
huh_tr = DonutDataset(ds_path, image_folder, 1024, processor, split = 'train', prompt_end_token = '<s_answer>', indices = list(range(len(6898333))))
#huh_ev = DonutDataset(ds_path, image_folder, 1024, processor, prompt_end_token = '<s_answer>', indices = list(range(16,20)))

NameError: name 'processor' is not defined

In [25]:
dataloader = DataLoader(huh_tr, batch_size=2, shuffle=True, num_workers=4)

In [26]:
traindl, evaldl = DataLoader(huh_tr, batch_size=1, shuffle=True), DataLoader(huh_ev, batch_size=2, shuffle=True)

## Hyperparameters

In [27]:
num_epochs = 1

In [28]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(multimodal_model.parameters(), lr=0.001)

In [29]:
!nvidia-smi

Sun Aug  4 13:23:24 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   42C    P0              70W / 300W |  10959MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:65:00.0 Off |  

In [30]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=5,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     warmup_steps=0,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=1,
#     evaluation_strategy="epoch",
#     # save_strategy="epoch",
#     # load_best_model_at_end=True,
#     # save_total_limit=3,
#     # # Use multiple GPUs
#     dataloader_num_workers=4,
#     fp16=True,  # Enable 16-bit precision training if applicable
#     # report_to="none",
#     gradient_accumulation_steps=2,
# )
# for i in range(torch.cuda.device_count()):
#     print(f"GPU {i}:")
#     print(f"  Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
#     print(f"  Reserved: {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB")
#     print(f"  Total: {torch.cuda.get_device_properties(i).total_memory / 1024**2:.2f} MB")

# trainer = Trainer(
#     model=multimodal_model,
#     args=training_args,
#     train_dataset=huh_tr,
#     eval_dataset=huh_ev,
# )

# # Start training
# trainer.train()

In [33]:
for epoch in range(num_epochs):
    # Training phase
    multimodal_model.train()
    running_loss = 0.0
    print(f" ============ Epoch {epoch+1} of {num_epochs} ========== ")
    for batch in dataloader:
        #image_input, text_ids, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        pixel_values, input_ids, labels = batch['pixel_values'].to(device), batch['input_ids'].to(device), batch['labels'].to(device)
        #print(text_ids.shape, " !!! ", image_input.shape, " ^^^ ", labels.shape)
        optimizer.zero_grad()
        # Forward pass
        outputs = multimodal_model(pixel_values,input_ids)
        print(outputs.shape, " !!! ")
        #loss = criterion(outputs, labels)

        logits = outputs[:,577:]
        logits = logits.contiguous().view(-1, logits.size(-1))  # Shape: [1024, 128256]
        labels = labels.contiguous().view(-1)
        print(logits.shape, " !!! ", labels.shape)
        loss = 0
        for enumeraterow in 
        loss = criterion(logits, labels)
        print("Loss: ", loss, " !! ")        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate loss
        running_loss += loss.item()
    
    train_loss = running_loss / len(dataloader)
    
    multimodal_model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in evaldl:
            #image_input, text_ids, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)

            # logits = self.forward(pixel_values, input_ids)
            # logits = logits[:, 577:]  # logits shape: [4, 1024, 32000]
            # logits = logits.contiguous().view(-1, logits.size(-1))  # Reshape to [4 * 1024, 32000]        
            # labels = labels.contiguous().view(-1)  # Reshape to [4 * 1024]
            # #loss = criterion(logits, labels)      
            # loss = self.criterion(logits, labels)
        
            pixel_values, input_ids, labels = batch['pixel_values'].to(device), batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = multimodal_model(pixel_values,input_ids)
            #loss = criterion(outputs['logits'], labels)
            logits = outputs[:,577:]
            logits = logits.contiguous().view(-1, logits.size(-1))  # Shape: [1024, 128256]
            labels = labels.contiguous().view(-1)
            loss = criterion(logits, labels)
            print("Validation Loss: ", loss, " !! ")  
            val_loss += loss.item()
            
            # # Get the predicted class
            # _, predicted = torch.max(outputs, 1)
            # total += labels.size(0)
            # correct += (predicted == labels).sum().item()
    
    val_loss = val_loss / len(evaldl)
    # val_accuracy = correct / total



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


torch.Size([2, 577, 768])  !! VIT 
torch.Size([2, 577, 2304])  @@ aligned 
torch.Size([2, 1024, 2304])  ## embeddings 


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


torch.Size([2, 1601, 256000])  !!! 
torch.Size([2048, 256000])  !!!  torch.Size([2048])
Loss:  tensor(2674.6038, device='cuda:0', grad_fn=<AddBackward0>)  !! 
torch.Size([2, 577, 768])  !! VIT 
torch.Size([2, 577, 2304])  @@ aligned 
torch.Size([2, 1024, 2304])  ## embeddings 
torch.Size([2, 1601, 256000])  !!! 
torch.Size([2048, 256000])  !!!  torch.Size([2048])
Loss:  tensor(2536.2090, device='cuda:0', grad_fn=<AddBackward0>)  !! 
torch.Size([2, 577, 768])  !! VIT 
torch.Size([2, 577, 2304])  @@ aligned 
torch.Size([2, 1024, 2304])  ## embeddings 
torch.Size([2, 1601, 256000])  !!! 
torch.Size([2048, 256000])  !!!  torch.Size([2048])
Loss:  tensor(2366.6509, device='cuda:0', grad_fn=<AddBackward0>)  !! 
torch.Size([2, 577, 768])  !! VIT 
torch.Size([2, 577, 2304])  @@ aligned 
torch.Size([2, 1024, 2304])  ## embeddings 
torch.Size([2, 1601, 256000])  !!! 
torch.Size([2048, 256000])  !!!  torch.Size([2048])
Loss:  tensor(2263.1768, device='cuda:0', grad_fn=<AddBackward0>)  !! 
torch.S

# Test

In [None]:
# Read the memory info from /proc/meminfo
with open('/proc/meminfo', 'r') as meminfo:
    lines = meminfo.readlines()

# Extract total and available memory in kB
mem_total = int([x for x in lines if 'MemTotal:' in x][0].split()[1])
mem_available = int([x for x in lines if 'MemAvailable:' in x][0].split()[1])

print(f"Total Memory: {mem_total / 1024:.2f} MB")
print(f"Available Memory: {mem_available / 1024:.2f} MB")

In [None]:
!python pl_pretrain.py

In [None]:
!nvidia-smi