In [1]:
# Recursively delete the cached `wikipedia` dataset directory under the
# HuggingFace datasets cache.
# NOTE(review): the path is absolute and tied to one user account, and the
# command is destructive; presumably it forces the dataset to be rebuilt on
# next use — confirm this is intended before including it in a "Run All".
%rm -r /usr/users/sdi1/sdi1_29/.cache/huggingface/datasets/wikipedia

In [1]:
# Load IPython's autoreload extension (reloading it if it is already loaded).
%reload_ext autoreload
# Mode 2: reload imported modules before each cell executes, so edits to the
# project's .py files are picked up without restarting the kernel.
%autoreload 2

# Configuration

In [2]:
import os
import torch
from configs import CHARACTERS, DATASET_NAME, ENCODER_CONFIG, \
    DECODER_CONFIG, OPTIMIZER_CONFIG, SCHEDULER_CONFIG


# Whether a CUDA device is available; used to pick the Trainer accelerator.
use_cuda = torch.cuda.is_available()

# Batch budgets handed to the Trainer as `limit_train_batches` /
# `limit_val_batches`.
max_train_steps = 1000
max_val_steps = 10

# Loader settings forwarded to the data module.
batch_size = 8
num_workers = 8

# Language selects the character set and dataset name from `configs`.
language = 'fr'
model_max_length = 128

characters = CHARACTERS[language]
dataset_name = DATASET_NAME[language]

# Each component is chosen by name; its settings are looked up in the matching
# table imported from `configs`.
encoder_name = 'SwinTransformerEncoder'
encoder_config = ENCODER_CONFIG[encoder_name]

decoder_name = 'AutoregressiveTransformerDecoder'
decoder_config = DECODER_CONFIG[decoder_name]

optimizer_name = 'AdamW'
optimizer_config = OPTIMIZER_CONFIG[optimizer_name]

scheduler_name = 'CosineLRScheduler'
scheduler_config = SCHEDULER_CONFIG[scheduler_name]

# The experiment name encodes every choice above; it is used for the log and
# checkpoint directories.
experiment_name = f"ocr_{language}_{model_max_length}_{encoder_name}_{decoder_name}_{optimizer_name}_{scheduler_name}"

# Resume only when `last.ckpt` is really on disk. Testing just the directory
# would pass the trainer a non-existent checkpoint path whenever the folder
# exists but no checkpoint has been written into it yet.
checkpoint_dir = f"checkpoints/{experiment_name}/"
last_ckpt_path = f"{checkpoint_dir}last.ckpt"

if os.path.isfile(last_ckpt_path):
    print(f"Experiment {experiment_name} already exists, resuming training")
    ckpt_path = last_ckpt_path
else:
    print(f"Starting new experiment {experiment_name}")
    ckpt_path = None

  from .autonotebook import tqdm as notebook_tqdm


Starting new experiment ocr_fr_128_SwinTransformerEncoder_AutoregressiveTransformerDecoder_AdamW_CosineLRScheduler


In [3]:
from tokenization.tokenizer import CharacterTokenizer

# Tokenizer built from the configured character set and maximum model length.
tokenizer = CharacterTokenizer(characters=characters, model_max_length=model_max_length)

In [4]:
from torchvision.transforms import Compose, Resize, Grayscale, ToTensor, Normalize

# Image preprocessing, applied in order: resize to the encoder's configured
# (height, width), convert to grayscale, convert to a tensor, then normalize
# with mean 0.5 and std 0.5.
encoder_params = encoder_config["params"]
input_size = (encoder_params["height"], encoder_params["width"])

preprocessing_steps = [
    Resize(input_size),
    Grayscale(),
    ToTensor(),
    Normalize((0.5,), (0.5,)),
]
transform = Compose(preprocessing_steps)

In [5]:
from dataset.wikipedia_dataset import WikipediaTextLineDataModule

# Collect the constructor arguments first, then build the data module from them.
datamodule_kwargs = dict(
    dataset_name=dataset_name,
    transform=transform,
    tokenizer=tokenizer,
    batch_size=batch_size,
    num_workers=num_workers,
    characters=characters,
)
datamodule = WikipediaTextLineDataModule(**datamodule_kwargs)

# Run the data module's preparation step now, ahead of training.
datamodule.prepare_data()

Missing modules for handwritten text generation.


In [7]:
from modeling.lightning_wrapper import VisionEncoderLanguageDecoderWrapper

# The wrapper receives the tokenizer plus the encoder, decoder, optimizer and
# scheduler configurations selected in the configuration cell.
model_components = {
    "tokenizer": tokenizer,
    "encoder_config": encoder_config,
    "decoder_config": decoder_config,
    "optimizer_config": optimizer_config,
    "scheduler_config": scheduler_config,
}
lightning_model = VisionEncoderLanguageDecoderWrapper(**model_components)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [9]:
import pytorch_lightning as pl

# Per-experiment output locations.
log_dir = f"logs/{experiment_name}/"
ckpt_dir = f"checkpoints/{experiment_name}/"

# Pick the accelerator from the CUDA availability flag.
if use_cuda:
    accelerator = "gpu"
else:
    accelerator = 'cpu'

# TensorBoard event files go under the experiment's log directory.
logger = pl.loggers.TensorBoardLogger(save_dir=log_dir)

# Progress bar refreshed on every batch.
prog_bar = pl.callbacks.progress.TQDMProgressBar(refresh_rate=1)

# Log the learning rate at every step.
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval="step")

# Keep the 3 checkpoints with the lowest `val_cer`, plus a `last` checkpoint.
ckpt_callback = pl.callbacks.ModelCheckpoint(
    dirpath=ckpt_dir,
    filename="checkpoint-{epoch:03d}-{val_cer:.5f}",
    monitor="val_cer",
    mode="min",
    save_top_k=3,
    save_last=True,
)

trainer = pl.Trainer(
    accelerator=accelerator,
    # -1 means no limit on the number of epochs.
    max_epochs=-1,
    log_every_n_steps=1,
    num_sanity_val_steps=1,
    # Cap the number of batches per validation / training epoch.
    limit_val_batches=max_val_steps,
    limit_train_batches=max_train_steps,
    callbacks=[ckpt_callback, lr_monitor, prog_bar],
    enable_progress_bar=True,
    logger=logger,
)

# `ckpt_path` is None for a fresh run, or the checkpoint to resume from.
trainer.fit(model=lightning_model, datamodule=datamodule, ckpt_path=ckpt_path)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Found cached dataset wikipedia (/usr/users/sdi1/sdi1_29/.cache/huggingface/datasets/wikipedia/20220301.fr/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)
Found cached dataset wikipedia (/usr/users/sdi1/sdi1_29/.cache/huggingface/datasets/wikipedia/20220301.fr/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)
Found cached dataset wikipedia (/usr/users/sdi1/sdi1_29/.cache/huggingface/datasets/wikipedia/20220301.fr/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)
Found cached dataset wikipedia (/usr/users/sdi1/sdi1_29/.cache/huggingface/datasets/wikipedia/20220301.fr/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)
Loading cached processed dataset at /usr/users/sdi1/sdi1_29/.cache/huggingface/datasets/wikipedia/20220301.fr/2.0.0/aa542ed919df55cc5d3347f42

Epoch 0:   3%|▎         | 33/1000 [01:33<45:28,  2.82s/it, v_num=2, train_loss=3.540]  

In [None]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard
# Open TensorBoard inline, reading from `logs`, the parent directory of the
# per-experiment directories written by the TensorBoardLogger.
%tensorboard --logdir logs