## Libraries

In [1]:
print(1)

1


In [2]:
!cp -r /home/msm97/scratch/models/llama2-7b/* $SLURM_TMPDIR/work/models/llama2

In [3]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel, AutoModelForImageClassification, ViTModel
from transformers import AutoTokenizer, AutoModelForCausalLM, DonutProcessor
from transformers import DonutProcessor, VisionEncoderDecoderModel, DonutSwinModel
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import Trainer, TrainingArguments
from PIL import Image
import numpy as np
from torchvision import transforms
from datasets import load_dataset, load_from_disk
from unids_unichart import DonutDataset
from torch.utils.data import DataLoader
import torch.optim as optim
from peft import get_peft_model, LoraConfig, TaskType
from peft import prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
import os

## Paths

In [4]:
base_dir = '/'.join(os.getcwd().split('/')[:3])

vit_path = f'{base_dir}/work/models/vit'
llama_path = f'{base_dir}/work/models/llama2'
subset_path = '/home/msm97/pretrain/uni_subset'
mistral_path = f'{base_dir}/work/models/mistral'
gemma2_path = f'{base_dir}/work/models/gemma2'
unichart_path = f'{base_dir}/work/models/unichart/Encoder'
ds_path = f'{base_dir}/uniptds'
gemma22_path = f'{base_dir}/work/models/gemma2-2B'

image_folder = f'{base_dir}/content/tmp'

## Setting Device

In [5]:
device = "cuda:0"

In [6]:
!export TOKENIZERS_PARALLELISM=false

In [7]:
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

## Custom models

In [8]:
class vit_processor():
    def __init__(self, vision_processor, text_tokenizer):
        self.processor = vision_processor
        self.tokenizer = text_tokenizer

    def __call__(self, image):
        image_tensor = vision_transform(image).unsqueeze(0)
        return image_tensor

In [9]:
class unichart_processor():
    def __init__(self, vision_processor, text_tokenizer):
        self.processor = vision_processor
        self.tokenizer = text_tokenizer

    def __call__(self, image):
        image_tensor = self.processor(image, return_tensors="pt")
        return image_tensor

In [10]:
class AlignmentMLP(nn.Module):
    def __init__(self, vision_dim, text_dim, hidden_dim):
        super(AlignmentMLP, self).__init__()
        self.fc1 = nn.Linear(vision_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, text_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [11]:
class MultimodalModel(nn.Module):
    def __init__(self, vision_model, alignment_mlp, text_model):
        super(MultimodalModel, self).__init__()
        self.vision_model = vision_model
        self.alignment_mlp = alignment_mlp
        self.text_model = text_model

    def forward(self, pixel_values, input_ids):
        tmp = self.vision_model(pixel_values)
        vision_outputs = tmp.last_hidden_state
        aligned_features = self.alignment_mlp(vision_outputs)

        text_embeddings = self.text_model.get_input_embeddings()(input_ids)
        combined_features = torch.cat((aligned_features, text_embeddings), dim=1)

        outputs = self.text_model(inputs_embeds=combined_features)

        return outputs.logits
        
        # ######## INCLUDE IF USING TRAINER #############
        # logits = outputs.logits[:,900:]
        # logits = logits.contiguous().view(-1, logits.size(-1))
        # labels = labels.contiguous().view(-1)
        # criterion = nn.CrossEntropyLoss()
        # loss = criterion(logits, labels)
        # #return {"logits": outputs.logits, "loss": loss}
        # return loss
        # #####################################################

        # return {"logits": outputs.logits}

## Loading Models

In [12]:
!sq

            JOBID     USER              ACCOUNT           NAME  ST  TIME_LEFT NODES CPUS TRES_PER_N MIN_MEM NODELIST (REASON) 
         24172827    msm97       ctb-enamul_gpu    interactive   R    1:33:55     1   32 gres:gpu:a    125G gra1361 (None) 


In [13]:
# vision_model = ViTModel.from_pretrained(vit_path)
# image_processor = transforms.Compose([
#     transforms.Resize((384, 384)),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
# ])

In [14]:
processor = DonutProcessor.from_pretrained(f'{base_dir}/work/models/unichart/Encoder')
image_processor = processor.image_processor
vision_model = DonutSwinModel.from_pretrained(f'{base_dir}/work/models/unichart/Encoder')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# outputdim 1024 for LLama, 2048 for gemma
# inputdim 768 for vit, 1024 for unichart
alignment_mlp = AlignmentMLP(vision_dim=1024, text_dim=4096, hidden_dim=8192)

In [16]:
text_tokenizer = AutoTokenizer.from_pretrained(llama_path)
text_model = LlamaForCausalLM.from_pretrained(llama_path)
# text_tokenizer = AutoTokenizer.from_pretrained(gemma22_path)
# text_model = AutoModelForCausalLM.from_pretrained(gemma22_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
text_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head):

In [18]:
text_tokenizer.pad_token

In [19]:
text_tokenizer.eos_token

'</s>'

In [20]:
text_tokenizer.pad_token = text_tokenizer.eos_token
text_tokenizer.padding_side = "right"

In [21]:
processor = unichart_processor(image_processor, text_tokenizer)

In [22]:
multimodal_model = MultimodalModel(vision_model, alignment_mlp, text_model)
multimodal_model.to(torch.bfloat16)
multimodal_model.to(device)

MultimodalModel(
  (vision_model): DonutSwinModel(
    (embeddings): DonutSwinEmbeddings(
      (patch_embeddings): DonutSwinPatchEmbeddings(
        (projection): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DonutSwinEncoder(
      (layers): ModuleList(
        (0): DonutSwinStage(
          (blocks): ModuleList(
            (0-1): 2 x DonutSwinLayer(
              (layernorm_before): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
              (attention): DonutSwinAttention(
                (self): DonutSwinSelfAttention(
                  (query): Linear(in_features=128, out_features=128, bias=True)
                  (key): Linear(in_features=128, out_features=128, bias=True)
                  (value): Linear(in_features=128, out_features=128, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                

In [23]:
!nvidia-smi

Sat Aug 17 16:03:01 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   47C    P0              73W / 300W |  13515MiB / 81920MiB |     23%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:65:00.0 Off |  

In [24]:
def freeze_model_weights(model):
    for param in model.parameters():
        param.requires_grad = False


In [25]:
def print_model_summary(model):
    print(f"{'Layer':<20} {'Output Shape':<25} {'Param #':<15}")
    print("=" * 60)
    total_params = 0
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        param_shape = list(param.shape)
        param_count = param.numel()
        total_params += param_count
        print(f"{name:<20} {str(param_shape):<25} {param_count:<15}")
    print("=" * 60)
    print(f"Total Trainable Params: {total_params}")

# Print the model summary
print_model_summary(multimodal_model)

Layer                Output Shape              Param #        
vision_model.embeddings.patch_embeddings.projection.weight [128, 3, 4, 4]            6144           
vision_model.embeddings.patch_embeddings.projection.bias [128]                     128            
vision_model.embeddings.norm.weight [128]                     128            
vision_model.embeddings.norm.bias [128]                     128            
vision_model.encoder.layers.0.blocks.0.layernorm_before.weight [128]                     128            
vision_model.encoder.layers.0.blocks.0.layernorm_before.bias [128]                     128            
vision_model.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table [361, 4]                  1444           
vision_model.encoder.layers.0.blocks.0.attention.self.query.weight [128, 128]                16384          
vision_model.encoder.layers.0.blocks.0.attention.self.query.bias [128]                     128            
vision_model.encoder.layers.0.bloc

In [26]:
freeze_model_weights(multimodal_model.vision_model)
freeze_model_weights(multimodal_model.text_model)

In [27]:
for name, param in multimodal_model.named_parameters():
    print(f"{name}: {param.requires_grad}")

vision_model.embeddings.patch_embeddings.projection.weight: False
vision_model.embeddings.patch_embeddings.projection.bias: False
vision_model.embeddings.norm.weight: False
vision_model.embeddings.norm.bias: False
vision_model.encoder.layers.0.blocks.0.layernorm_before.weight: False
vision_model.encoder.layers.0.blocks.0.layernorm_before.bias: False
vision_model.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table: False
vision_model.encoder.layers.0.blocks.0.attention.self.query.weight: False
vision_model.encoder.layers.0.blocks.0.attention.self.query.bias: False
vision_model.encoder.layers.0.blocks.0.attention.self.key.weight: False
vision_model.encoder.layers.0.blocks.0.attention.self.key.bias: False
vision_model.encoder.layers.0.blocks.0.attention.self.value.weight: False
vision_model.encoder.layers.0.blocks.0.attention.self.value.bias: False
vision_model.encoder.layers.0.blocks.0.attention.output.dense.weight: False
vision_model.encoder.layers.0.blocks.0.attention

In [28]:
!nvidia-smi

Sat Aug 17 16:03:02 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   47C    P0              73W / 300W |  13515MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:65:00.0 Off |  

In [29]:
# Read the memory info from /proc/meminfo
with open('/proc/meminfo', 'r') as meminfo:
    lines = meminfo.readlines()

# Extract total and available memory in kB
mem_total = int([x for x in lines if 'MemTotal:' in x][0].split()[1])
mem_available = int([x for x in lines if 'MemAvailable:' in x][0].split()[1])

print(f"Total Memory: {mem_total / 1024:.2f} MB")
print(f"Available Memory: {mem_available / 1024:.2f} MB")


Total Memory: 257419.61 MB
Available Memory: 246567.96 MB


## Loading Data

In [40]:
!sq

            JOBID     USER              ACCOUNT           NAME  ST  TIME_LEFT NODES CPUS TRES_PER_N MIN_MEM NODELIST (REASON) 
         24172827    msm97       ctb-enamul_gpu    interactive   R    1:24:52     1   32 gres:gpu:a    125G gra1361 (None) 


In [31]:
# Summarization Start Index: 2768876
# Summarization End Index: 3222856 (Inclusive)

# Chart2table Start Index: 5637216

#dataset size: 6898333

In [32]:
window = list(range(64))

In [33]:
huh_tr = DonutDataset(subset_path, image_folder, 1024, processor, split = 'train', prompt_end_token = '<s_answer>', indices = window)
huh_ev = DonutDataset(subset_path, image_folder, 1024, processor, prompt_end_token = '<s_answer>', indices = list(range(24,28)))

In [34]:
dataloader = DataLoader(huh_tr, batch_size=2, shuffle=False, num_workers=4)

In [35]:
traindl, evaldl = DataLoader(huh_tr, batch_size=1, shuffle=True), DataLoader(huh_ev, batch_size=2, shuffle=True)

In [39]:
huh_tr[3]

<summarize_chart> <s_answer> Line 3 begins at ylabel below the other lines then increases and trends above them, then it decreases below line 1 and ends with line 2. Line 1 begins above line 2 and briefly crosses below it. Line 4 trends below line 1 and line 2.</s>  ^^&& 


{'pixel_values': tensor([[[-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          ...,
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.]],
 
         [[-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          ...,
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.]],
 
         [[-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          ...,
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.]]], dtype=torch.bfloat16),
 'input_ids': tensor([ 529, 2083, 3034,  ...

## Hyperparameters

In [77]:
num_epochs = 3

In [78]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(multimodal_model.parameters(), lr=0.001)

In [79]:
!nvidia-smi

Sat Aug 17 15:54:47 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   44C    P0              72W / 300W |  26831MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:65:00.0 Off |  

In [80]:
STOP

NameError: name 'STOP' is not defined

In [None]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     save_strategy='no',
#     num_train_epochs=2,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     warmup_steps=0,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=1,
#     evaluation_strategy="epoch",
#     # save_strategy="epoch",
#     # load_best_model_at_end=True,
#     # save_total_limit=3,
#     # # Use multiple GPUs
#     dataloader_num_workers=1,
#     fp16=True,  # Enable 16-bit precision training if applicable
#     # report_to="none",
#     gradient_accumulation_steps=2,
# )
# for i in range(torch.cuda.device_count()):
#     print(f"GPU {i}:")
#     print(f"  Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
#     print(f"  Reserved: {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB")
#     print(f"  Total: {torch.cuda.get_device_properties(i).total_memory / 1024**2:.2f} MB")

# trainer = Trainer(
#     model=multimodal_model,
#     args=training_args,
#     train_dataset=huh_tr,
#     eval_dataset=huh_ev,
# )

# # Start training
# trainer.train()

In [81]:
!nvidia-smi

Sat Aug 17 15:54:48 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   44C    P0              72W / 300W |  26831MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:65:00.0 Off |  

In [83]:
for epoch in range(num_epochs):
    # Training phase
    multimodal_model.train()
    running_loss = 0.0
    print(f" ============ Epoch {epoch+1} of {num_epochs} ========== ")
    for batch in dataloader:
        #print(f"Initial VRAM usage: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
        pixel_values, input_ids, labels = batch['pixel_values'].to(device), batch['input_ids'].to(device), batch['labels'].to(device)
        optimizer.zero_grad()
        # Forward pass
        outputs = multimodal_model(pixel_values,input_ids)

        logits = outputs[:,900:]
        logits = logits.contiguous().view(-1, logits.size(-1))  # Shape: [1024, 128256]
        labels = labels.contiguous().view(-1)
        loss = criterion(logits, labels)
        print(f"loss {loss}")
        #print(f"VRAM after forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
        
        loss.backward()
        #print(f"VRAM after backward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
        optimizer.step()
        #print(f"VRAM after optimizer step: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
        #print(f"Peak VRAM usage: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")

        # Accumulate loss
        running_loss += loss.item()
        #print("====================================")
    
    train_loss = running_loss / len(dataloader)
    print(f"************************* Epoch {epoch}: Trainloss: {train_loss} *****************")
    
    multimodal_model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in evaldl:

            pixel_values, input_ids, labels = batch['pixel_values'].to(device), batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = multimodal_model(pixel_values,input_ids)
            logits = outputs[:,900:]
            logits = logits.contiguous().view(-1, logits.size(-1))  # Shape: [1024, 128256]
            labels = labels.contiguous().view(-1)
            loss = criterion(logits, labels)
            print("Validation Loss: ", loss, " !! ")  
            val_loss += loss.item()
    
    val_loss = val_loss / len(evaldl)
    # val_accuracy = correct / total

loss 0.09362368285655975
loss 0.10282735526561737
loss 0.0988401547074318
loss 0.03282114863395691
loss 0.03273181617259979
loss 0.10203124582767487
loss 0.029839353635907173
loss 0.05139192193746567
loss 0.016986433416604996
loss 0.04657871276140213
loss 0.1703561246395111
loss 0.14610302448272705
loss 0.14198026061058044
loss 0.031134681776165962
loss 0.015816669911146164
loss 0.07521851360797882
loss 0.041395582258701324
loss 0.0634993389248848
loss 0.011329376138746738
loss 0.010382653214037418
loss 0.01367771252989769
loss 0.010389271192252636
loss 0.012860114686191082
loss 0.09800931066274643
loss 0.04745948687195778
loss 0.08836512267589569
loss 0.020883165299892426
loss 0.023648787289857864
loss 0.03708216920495033
loss 0.010708213783800602
loss 0.12726552784442902
loss 0.004853891674429178
************************* Epoch 0: Trainloss: 0.05656533826550003 *****************
Validation Loss:  tensor(0.0626, device='cuda:0')  !! 
Validation Loss:  tensor(0.0656, device='cuda:0')  

In [84]:
dataset_dict = load_from_disk(subset_path)
dataset = dataset_dict.select(window)

In [85]:
dataset[0]

{'imgname': 'linecap_1410.6793v2-Figure67-1.png',
 'query': '<summarize_chart>',
 'label': 'ylabel increases at an increasing rate before plateauing at 0.7 on the xlabel'}

In [86]:
test_loader1 = DataLoader(huh_tr, batch_size=2, shuffle=False, num_workers=4)

In [87]:
texts = []

In [88]:
multimodal_model.eval()
for batch in test_loader1:
    pixel_values, input_ids, labels = batch['pixel_values'].to(device), batch['input_ids'].to(device), batch['labels'].to(device)
    outputs = multimodal_model(pixel_values,input_ids)
    logits = outputs[:,900:]
    token_ids = torch.argmax(logits, dim=-1)
    decoded_texts = text_tokenizer.batch_decode(token_ids)
    for text in decoded_texts:
        texts.append(text)

In [89]:
for i, text in enumerate(texts):
    print(i, " !\n ", dataset[i]['label'], " @@\n ", text)

0  !
  ylabel increases at an increasing rate before plateauing at 0.7 on the xlabel  @@
  <summarize_chart> <s_answer> ylabel increases at an increasing rate before plateauing at 0.7 on the xlabel................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. instead.... instead. instead.. instead instead..... instead. instead.. instead instead instead instead... instead instead instead instead.. instead instead instead instead instead instead. instead instead. instead instead instead instead ins

In [99]:
huh_te[0]

{'pixel_values': tensor([[[-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          ...,
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.]],
 
         [[-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          ...,
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.]],
 
         [[-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          ...,
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.],
          [-1., -1., -1.,  ..., -1., -1., -1.]]], dtype=torch.bfloat16),
 'input_ids': tensor([ 529, 2083, 3034,  ...

In [98]:
text_tokenizer.eos_token

'</s>'

## Out of trainset test

In [90]:
window2 = list(range(128,160))

In [91]:
dataset_dict = load_from_disk(subset_path)
dataset = dataset_dict.select(window2)

In [92]:
huh_te = DonutDataset(subset_path, image_folder, 1024, processor, split = 'train', prompt_end_token = '<s_answer>', indices = list(window2))

In [93]:
test_loader2 = DataLoader(huh_te, batch_size=2, shuffle=False, num_workers=4)

In [94]:
texts = []
multimodal_model.eval()
for batch in test_loader2:
    pixel_values, input_ids, labels = batch['pixel_values'].to(device), batch['input_ids'].to(device), batch['labels'].to(device)
    outputs = multimodal_model(pixel_values,input_ids)
    logits = outputs[:,900:]
    token_ids = torch.argmax(logits, dim=-1)
    decoded_texts = text_tokenizer.batch_decode(token_ids)
    for text in decoded_texts:
        texts.append(text)

In [95]:
for i, text in enumerate(texts):
    print(i, " !\n ", dataset[i]['label'], " @@\n ", text)

0  !
  Lines 2 and 4 overlap at bottom ylabel value. Line 3 trends above lines 2 and 4 and below line 1.  @@
  <summarize_chart> <s_answer> Lines 2 and 4 overlap at bottom ylabel value. Line 3 trends above lines 2 and 4 and below line 1............................................... etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc. etc... except.. except. except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except except e

In [None]:
!nvidia-smi

# Test

In [None]:
# Read the memory info from /proc/meminfo
with open('/proc/meminfo', 'r') as meminfo:
    lines = meminfo.readlines()

# Extract total and available memory in kB
mem_total = int([x for x in lines if 'MemTotal:' in x][0].split()[1])
mem_available = int([x for x in lines if 'MemAvailable:' in x][0].split()[1])

print(f"Total Memory: {mem_total / 1024:.2f} MB")
print(f"Available Memory: {mem_available / 1024:.2f} MB")

In [None]:
!python pl_pretrain.py

In [None]:
!python UniGemmaPretrain.py

In [None]:
!nvidia-smi

In [None]:
!mkdir /home/msm97/projects/def-enamul/msm97/Codebases/

In [None]:
!mkdir /home/msm97/scratch/results

In [None]:
os.getcwd()

In [None]:
!cp -r $SLURM_TMPDIR/work/codes 