In [4]:
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
import requests
import torch
from PIL import Image
import json
import numpy as np
from torch.utils.data import Dataset
import pandas as pd
import matplotlib.pyplot as plt
from torch.optim import AdamW
import transformers
from transformers import AutoModelForCausalLM,AutoProcessor,get_scheduler

2025-10-16 11:00:22.711317: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [1]:
# Install dependencies into the running kernel's environment.
# NOTE(review): this cell appears below the import cell in the notebook; on a
# fresh kernel it has to run before those imports.
%pip install --upgrade -q transformers==4.53.3
# NOTE(review): einops, timm and datasets are unpinned — TODO pin versions for reproducibility.
%pip install einops timm datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [5]:
class TextRecognitionDataset(Dataset):
    """OCR dataset backed by a DataFrame with ``image_path`` and ``text`` columns.

    Every item is a ``(question, image, label)`` triple where ``question`` is
    the fixed ``'<OCR>'`` task prompt, ``image`` is the picture decoded to RGB
    and ``label`` is the target transcription as a string.
    """

    def __init__(self, df):
        """Keep a reference to ``df``; rows are addressed by position."""
        self.df = df

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            tuple: ``(question, image, labels)`` as described on the class.
        """
        row = self.df.iloc[idx]
        question = '<OCR>'
        # ``convert`` fully decodes the pixels into a new image, so the file can
        # be closed right away. Without the ``with`` block the descriptor stays
        # open until garbage collection, which adds up in long-lived workers.
        with Image.open(str(row['image_path'])) as source:
            image = source.convert("RGB")
        # str() also turns non-string cells (numbers, NaN) into text.
        labels = str(row['text'])
        return question, image, labels
def collate_fn(batch):
    """Turn a list of ``(question, image, label)`` samples into one model batch.

    The prompts and images are encoded together by the notebook-level
    ``processor`` (it must exist before a DataLoader starts iterating); the
    label strings are passed through untouched as a tuple.

    Returns:
        tuple: ``(inputs, labels)`` — the processor output and the raw labels.
    """
    prompts, pictures, targets = zip(*batch)
    encoded = processor(
        text=list(prompts),
        images=list(pictures),
        return_tensors="pt",
        padding=True,
    )
    return encoded, targets

In [7]:
def train_model(train_loader, val_loader, test_loader, model, processor, epochs=10, lr=1e-5, log_every=200):
    """Fine-tune ``model`` and report train, validation and final test loss.

    Every batch must be an ``(inputs, answers)`` pair: ``inputs`` provides the
    ``"input_ids"`` and ``"pixel_values"`` tensors and ``answers`` is a sequence
    of target strings that are tokenized here with ``processor.tokenizer``.

    Args:
        train_loader: Iterable of training batches; must support ``len()``.
        val_loader: Iterable of validation batches, evaluated after each epoch.
        test_loader: Iterable of test batches, evaluated once after training.
        model: Model to train in place; tensors are moved to the device and
            dtype of its first parameter.
        processor: Provides ``tokenizer`` for the labels and is saved next to
            every checkpoint.
        epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate (decayed linearly to zero, no warmup).
        log_every: Print the current batch loss every this many batches.

    Returns:
        The same ``model`` object. It holds the weights of the *last* epoch,
        which is not necessarily the best checkpoint written to disk; the final
        test loss is computed with those last-epoch weights as well.
    """
    first_param = next(model.parameters())
    model_dtype = first_param.dtype
    # Follow the model's own placement instead of depending on a notebook-level
    # ``device`` variable that may be stale or undefined on a fresh kernel.
    device = first_param.device
    tokenizer = processor.tokenizer
    max_length = 1024

    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = epochs * len(train_loader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    def _encode_labels(answers):
        """Tokenize target strings and mask padding so the loss ignores it."""
        # Pad only to the longest answer in the batch (still capped at
        # ``max_length``). Padding every batch to a fixed 1024 tokens spends
        # compute on positions that are masked out of the loss right below.
        labels = tokenizer(
            list(answers),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            return_token_type_ids=False,
        ).input_ids.to(device)
        labels[labels == tokenizer.pad_token_id] = -100
        return labels

    def _batch_loss(batch):
        """Run one forward pass and return the loss tensor for ``batch``."""
        inputs, answers = batch
        input_ids = inputs["input_ids"].to(device)
        pixel_values = inputs["pixel_values"].to(device, dtype=model_dtype)
        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=_encode_labels(answers))
        return outputs.loss

    def _evaluate(loader, desc):
        """Return the mean per-batch loss over ``loader`` without gradients."""
        model.eval()
        total = 0
        with torch.no_grad():
            for batch in tqdm(loader, desc=desc):
                total += _batch_loss(batch).item()
        return total / len(loader)

    best_val_loss = float('inf')
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch_idx, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")):
            loss = _batch_loss(batch)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            train_loss += loss.item()
            if (batch_idx + 1) % log_every == 0:
                print(f"  Batch {batch_idx + 1}/{len(train_loader)} | Loss: {loss.item():.4f}")

        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch + 1} | Avg Train Loss: {avg_train_loss:.4f}")

        avg_val_loss = _evaluate(val_loader, "Validation")
        print(f"Epoch {epoch + 1} | Avg Val Loss: {avg_val_loss:.4f}")

        # Only write a checkpoint when validation improves; each improvement
        # gets its own directory named after the epoch.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            output_dir = f"./modelFNEW_DOC_checkpoints_epoch{epoch+1}"
            os.makedirs(output_dir, exist_ok=True)
            model.save_pretrained(output_dir)
            processor.save_pretrained(output_dir)
            print("Saved new best model!")

    avg_test_loss = _evaluate(test_loader, "Testing")
    print(f"Final Test Loss: {avg_test_loss:.4f}")
    return model

In [8]:
# Metadata tables: one row per sample; TextRecognitionDataset reads the
# `image_path` and `text` columns from them.
# NOTE(review): absolute paths tie the notebook to this machine — consider a configurable data dir.
df = pd.read_csv(filepath_or_buffer="/home/jupyter/project/metadata_big_train.csv")
df_val = pd.read_csv(filepath_or_buffer="/home/jupyter/project/metadata_big_val.csv")

In [5]:
# Autograd anomaly detection is a debugging aid: it adds extra checks to every
# forward/backward pass and noticeably slows training. Keep it off for real
# runs and flip the flag only while hunting NaN/inf gradients.
DEBUG_AUTOGRAD_ANOMALY = False
torch.autograd.set_detect_anomaly(DEBUG_AUTOGRAD_ANOMALY)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f4bcc8e4790>

In [9]:
# The training table is used whole; the validation table is cut at its
# midpoint: first half -> validation, second half -> held-out test.
_val_midpoint = len(df_val) // 2
train_dataset = TextRecognitionDataset(df=df)
val_dataset = TextRecognitionDataset(df=df_val.iloc[:_val_midpoint])
test_dataset = TextRecognitionDataset(df=df_val.iloc[_val_midpoint:])

In [10]:
# Loader settings. Worker processes are kept alive between epochs
# (persistent_workers=True) so they are not re-spawned on every pass.
batch_size = 8
num_workers = 14
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=num_workers,
    persistent_workers=True,
)
# Evaluation loaders are NOT shuffled: the reported loss is an average of
# per-batch means, so a fixed batch composition keeps validation/test numbers
# comparable from one epoch (and one run) to the next.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=8,
    persistent_workers=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=8,
    persistent_workers=True,
)

In [13]:
# Resume from a previously saved checkpoint directory.
model_path = "/home/jupyter/project/modelFBIG_DOC_checkpoints_epoch1"

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# NOTE(review): trust_remote_code=True runs Python code shipped with the
# checkpoint — only point this at checkpoints you trust.
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model = model.to(device)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

In [12]:
# Dtype of the model's first parameter (pixel inputs get cast to the model dtype during training).
model_dtype = next(iter(model.parameters())).dtype

torch.float32

In [24]:
# Print the GPU status table (device, driver/CUDA version, memory in use) via the shell.
!nvidia-smi

Wed Oct 15 11:33:29 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.261.03             Driver Version: 535.261.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       On  | 00000000:8C:00.0 Off |                    0 |
| N/A   28C    P0              25W /  70W |   3225MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Capture the returned model: as a bare last expression the call would make
# Jupyter echo the model's entire repr once training finishes. `result_model`
# is the same object as `model` and is released by the cleanup cell.
result_model = train_model(
    train_loader,
    val_loader,
    test_loader,
    model,
    processor,
    epochs=7,
)

Epoch 1/7:  23%|██▎       | 200/875 [11:44<39:09,  3.48s/it]

  Batch 200/875 | Loss: 1.2228


Epoch 1/7:  46%|████▌     | 400/875 [23:19<27:32,  3.48s/it]

  Batch 400/875 | Loss: 1.0480


Epoch 1/7:  69%|██████▊   | 600/875 [34:55<15:56,  3.48s/it]

  Batch 600/875 | Loss: 0.8910


Epoch 1/7:  91%|█████████▏| 800/875 [46:31<04:21,  3.48s/it]

  Batch 800/875 | Loss: 0.8241


Epoch 1/7: 100%|██████████| 875/875 [50:52<00:00,  3.49s/it]


Epoch 1 | Avg Train Loss: 1.0344


Validation:   0%|          | 0/125 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used

Epoch 1 | Avg Val Loss: 0.6812





✅ Saved new best model!


Epoch 2/7:  23%|██▎       | 200/875 [11:36<39:12,  3.48s/it]

  Batch 200/875 | Loss: 0.6796


Epoch 2/7:  46%|████▌     | 400/875 [23:12<27:34,  3.48s/it]

  Batch 400/875 | Loss: 0.7183


Epoch 2/7:  69%|██████▊   | 600/875 [34:48<15:56,  3.48s/it]

  Batch 600/875 | Loss: 0.6770


Epoch 2/7:  91%|█████████▏| 800/875 [46:24<04:21,  3.48s/it]

  Batch 800/875 | Loss: 0.6156


Epoch 2/7: 100%|██████████| 875/875 [50:45<00:00,  3.48s/it]


Epoch 2 | Avg Train Loss: 0.6935


Validation: 100%|██████████| 125/125 [02:24<00:00,  1.16s/it]

Epoch 2 | Avg Val Loss: 0.5341





✅ Saved new best model!


Epoch 3/7:  23%|██▎       | 200/875 [11:36<39:12,  3.49s/it]

  Batch 200/875 | Loss: 0.6073


Epoch 3/7:  46%|████▌     | 400/875 [23:12<27:32,  3.48s/it]

  Batch 400/875 | Loss: 0.5660


Epoch 3/7:  69%|██████▊   | 600/875 [34:48<15:56,  3.48s/it]

  Batch 600/875 | Loss: 0.5179


Epoch 3/7:  91%|█████████▏| 800/875 [46:24<04:21,  3.48s/it]

  Batch 800/875 | Loss: 0.5400


Epoch 3/7: 100%|██████████| 875/875 [50:45<00:00,  3.48s/it]


Epoch 3 | Avg Train Loss: 0.5605


Validation: 100%|██████████| 125/125 [02:24<00:00,  1.16s/it]

Epoch 3 | Avg Val Loss: 0.4588





✅ Saved new best model!


Epoch 4/7:  23%|██▎       | 200/875 [11:36<39:13,  3.49s/it]

  Batch 200/875 | Loss: 0.5115


Epoch 4/7:  46%|████▌     | 400/875 [23:12<27:33,  3.48s/it]

  Batch 400/875 | Loss: 0.4929


Epoch 4/7:  69%|██████▊   | 600/875 [34:48<15:56,  3.48s/it]

  Batch 600/875 | Loss: 0.4666


Epoch 4/7:  91%|█████████▏| 800/875 [46:25<04:21,  3.48s/it]

  Batch 800/875 | Loss: 0.4467


Epoch 4/7: 100%|██████████| 875/875 [50:46<00:00,  3.48s/it]


Epoch 4 | Avg Train Loss: 0.4834


Validation: 100%|██████████| 125/125 [02:24<00:00,  1.16s/it]

Epoch 4 | Avg Val Loss: 0.4142





✅ Saved new best model!


Epoch 5/7:  10%|▉         | 86/875 [04:59<45:48,  3.48s/it]

In [9]:
import gc
import torch

# Drop the notebook's references to the heavy objects, collect what became
# unreachable, then ask PyTorch to release its cached CUDA blocks.
model = processor = result_model = None
gc.collect()
torch.cuda.empty_cache()
print("GPU memory should be freed (but restart kernel to be sure)")

GPU memory should be freed (but restart kernel to be sure)
