In [None]:
!pip install -r requirements.txt

In [None]:
import yaml
from csl.obow.obow.datasets import show_data_for_obow
from utils.data import Standard_DataLoader
from copy import deepcopy
import torchvision.transforms as T
import matplotlib.pyplot as plt

# OBoW pre-training

In this section, you will pretrain a ResNet18 encoder via the Self-Supervised Learning (SSL) approach "[Online Bag of Visual Words](https://arxiv.org/pdf/2012.11552.pdf)" (OBoW) on a dataset of manuscripts. The experimental results presented in [this work](https://www.sciencedirect.com/science/article/pii/S0306457322000097) show that SSL pretraining on this pretext task improves the performance of the encoder on the Handwriting Identification (HI) task.

## Dataset and experiment results

The manuscript pages must be in JPEG or PNG format. At this stage, unlabeled manuscripts (i.e., those whose pages are not annotated with the scribe ID) can also be used to increase the dataset size, which helps the SSL method succeed. Images should be arranged as in the following example:

dataset
- train
    - Manuscript 1
        - page 1
        - page 2
        - ...
    - Manuscript 2
        - page 3
        - page 4
        - ...    
    - ...  
- val
    - Manuscript 1
        - page 5
        - page 6
        - ...
    - Manuscript 2
        - page 7
        - page 8
        - ...    
    - ... 
- test
    - Manuscript 1
        - page 9
        - page 10
        - ...
    - Manuscript 2
        - page 11
        - page 12
        - ...    
    - ... 

Dataset directory

In [None]:
# Root folder of the manuscript dataset used for OBoW pre-training -- set your own path here.
dataset_dir = "./manuscript_dataset"

Output directory (logs, checkpoints)

In [None]:
# Folder that receives the pre-training outputs (logs, checkpoints) -- set your own path here.
dest_dir = "./obow_results"

## Pretext task configuration

References for the parameter selection
- [OBoW: Online Bag-of-Visual-Words Generation for Self-Supervised Learning](https://arxiv.org/pdf/2012.11552.pdf)
- [OBoW GitHub repository](https://github.com/valeoai/obow)
- [Self-supervised learning for medieval handwriting identification: A case study from the Vatican Apostolic Library](https://www.sciencedirect.com/science/article/pii/S0306457322000097)
- [TORCHVISION.TRANSFORMS](https://pytorch.org/vision/0.9/transforms.html)

Load the base configuration file to modify as you wish

In [None]:
# Read the base OBoW configuration; the following cells override individual entries.
with open('csl/obow/config/Optional/Op.yaml', "r") as base_config_file:
    # safe_load is the shorthand for yaml.load(..., Loader=yaml.SafeLoader)
    exp_config = yaml.safe_load(base_config_file)

Model parameters

In [None]:
# Work on the 'model' section through a short alias (same dict object as exp_config['model']).
obow_model_cfg = exp_config['model']

obow_model_cfg['alpha'] = 0.99
obow_model_cfg['alpha_cosine'] = True
obow_model_cfg['feature_extractor_arch'] = 'resnet18'
# The nested option groups must already exist in the loaded configuration (KeyError otherwise).
obow_model_cfg['feature_extractor_opts']['global_pooling'] = True
obow_model_cfg['bow_levels'] = ['block3', 'block4']
obow_model_cfg['bow_extractor_opts']['inv_delta'] = 15
obow_model_cfg['bow_extractor_opts']['num_words'] = 8192
obow_model_cfg['bow_predictor_opts']['kappa'] = 8
obow_model_cfg['num_classes'] = 24       # Change here based on the number of classes of your dataset

Optimization parameters

In [None]:
# Work on the 'optim' section through a short alias (same dict object as exp_config['optim']).
obow_optim_cfg = exp_config['optim']

# Optimizer
obow_optim_cfg['optim_type'] = 'sgd'
obow_optim_cfg['momentum'] = 0.9
obow_optim_cfg['weight_decay'] = 0.0001
obow_optim_cfg['nesterov'] = False

# Training length and learning-rate schedule
obow_optim_cfg['num_epochs'] = 100
obow_optim_cfg['lr'] = 0.03
obow_optim_cfg['end_lr'] = 0.00003
obow_optim_cfg['lr_schedule_type'] = 'cos_warmup'
obow_optim_cfg['warmup_epochs'] = 5

# Checkpointing
obow_optim_cfg['permanent'] = 100       # Save a permanent checkpoint every N epochs

Data augmentation parameters

In [None]:
# Work on the 'data' section through a short alias (same dict object as exp_config['data']).
obow_data_cfg = exp_config['data']

obow_data_cfg['batch_size'] = 64       # Batch size

# Photometric / geometric augmentations
obow_data_cfg['cjitter'] = [[0.4, 1.3], 0.6, 0.6, 0.4]       # Color jitter parameters
obow_data_cfg['cjitter_p'] = 1       # Probability of using color jittering
obow_data_cfg['randaffine'] = [10, [0.2, 0.2], [1.3, 1.4], 1]       # Random affine transformation
obow_data_cfg['randpersp'] = [0.1, 0.2]       # Random perspective transformation
obow_data_cfg['gray_p'] = 0.2       # Probability of converting an image to grayscale
obow_data_cfg['gaussian_blur'] = [3, [0.1, 0.5]]       # Gaussian blur parameters

# Teacher input
obow_data_cfg['target_img_size'] = 380       # Size of the image extracted from a given page x and fed to the teacher network

# Student inputs: M random crops and K patches
obow_data_cfg['num_img_crops'] = 2       # Number of random crops extracted from page x and fed to the student network
obow_data_cfg['image_crop_size'] = 270       # Size of the M random crops extracted from page x
obow_data_cfg['num_img_patches'] = 5       # Number of patches extracted from page x and fed to the student network
obow_data_cfg['img_patch_preresize'] = 256       # Size of the region of the page from which the K patches are extracted
obow_data_cfg['img_patch_size'] = 150       # Size of the K patches extracted from page x
obow_data_cfg['img_patch_jitter'] = 24       # Parameter that regulates the patch extraction stage

# Random erasing and color inversion
obow_data_cfg['rand_eras'] = [0.5, [0.02, 0.33], [0.3, 3.3], 0]       # Parameters of the random erasing transformation applied to crops
obow_data_cfg['rand_eras_patch'] = [0.7, [0.02, 0.1], [0.3, 3.3], 0]       # Parameters of the random erasing transformation applied to patches
obow_data_cfg['invert_p'] = 0.05       # Probability of inverting the colors of the RGB image

## Check augmentations

In [None]:
# Keyword arguments for the preview datasets: an independent copy of the data
# settings, minus three entries that are removed before the call below.
exp_config_backup = deepcopy(exp_config['data'])
for dropped_key in ('dataset_name', 'batch_size', 'epoch_size'):
    exp_config_backup.pop(dropped_key)       # like `del`, raises KeyError when the entry is missing

dataset_train, dataset_val = show_data_for_obow(dataset_dir, **exp_config_backup)

# Converter used by the plotting cells to turn dataset outputs into PIL images.
TTP = T.ToPILImage()

print(f'Number of samples in training dataset: {len(dataset_train)}')

Choose an image to visualize, together with the corresponding crops and patches

In [None]:
# Ask which training sample to inspect. The answer is parsed once and checked
# against the range shown in the prompt, so a typo or a negative number is
# reported here instead of being passed on to the dataset.
idx = int(input(f'Enter an index in the range 0 - {len(dataset_train) - 1}: '))
if not 0 <= idx < len(dataset_train):
    raise IndexError(f'Index {idx} is outside the range 0 - {len(dataset_train) - 1}')

# The first element of a dataset item unpacks into the target image, the crops and the patches.
sample = dataset_train[idx][0]

target, crops, patches = sample

Target Image

In [None]:
# Show the target image alone on a small, axis-free panel.
fig, ax = plt.subplots(figsize=(4, 4))
ax.imshow(TTP(target))
ax.axis('off')
plt.show()

Crops

In [None]:
# Lay the crops out side by side on a single row, one axis-free panel each.
columns = len(crops)
fig = plt.figure(figsize=(8, 8))
for position, crop in enumerate(crops, start=1):
    ax = fig.add_subplot(1, columns, position)       # subplot positions are 1-based
    ax.axis('off')
    ax.imshow(TTP(crop))
plt.show()

Patches

In [None]:
# Lay the patches out side by side on a single row, one axis-free panel each.
columns = len(patches)
fig = plt.figure(figsize=(8, 8))
for position, patch in enumerate(patches, start=1):
    ax = fig.add_subplot(1, columns, position)       # subplot positions are 1-based
    ax.axis('off')
    ax.imshow(TTP(patch))
plt.show()

## Save the new configuration file

In [None]:
exp_name = "OBoW_0"       # Change here to set the name of your experiment
config_path = f'csl/obow/config/{exp_name}.yaml'

# Write the edited configuration next to the base ones, in block style and
# keeping the current key order (no alphabetical sorting).
with open(config_path, mode='w') as config_file:
    yaml.safe_dump(
        exp_config,
        config_file,
        default_flow_style=False,
        sort_keys=False,
    )

## Run OBoW pre-training and save checkpoints

In [None]:
!python csl/obow/main_obow.py --config={exp_name} --workers=0 -p=100 --dst-dir={dest_dir} --data-dir={dataset_dir}

Convert the checkpoints into torchvision format

In [None]:
!python csl/obow/main_obow.py --config={exp_name} --workers=0 -p=100 --dst-dir={dest_dir} --data-dir={dataset_dir} --convert-to-torchvision

# Fine-tuning

In this section, you will either train a ResNet18 encoder from scratch or fine-tune it (after a preliminary pretraining, either on the ImageNet dataset or via OBoW) on a dataset of manuscript pages (each annotated with the ID of the scribe who wrote it), with the aim of learning well-separated clusters of scribes (Handwriting Identification, HI).

This downstream task consists of training the network to minimize a triplet margin loss: hence, the HI task is framed as a **metric learning** problem, **not** as a **classification** one.

## Dataset and experiment results

The manuscript pages must be in JPEG or PNG format. At this stage, only **labeled** manuscripts (i.e., those whose pages are annotated with the scribe ID) must be used. Images should be arranged as in the following example:

dataset
- train
    - Scribe A
        - page 1
        - page 2
        - ...
    - Scribe B
        - page 3
        - page 4
        - ...    
    - ...  
- val
    - Scribe A
        - page 5
        - page 6
        - ...
    - Scribe B
        - page 7
        - page 8
        - ...    
    - ... 
- test
    - Scribe A
        - page 9
        - page 10
        - ...
    - Scribe B
        - page 11
        - page 12
        - ...    
    - ... 
    
The test set can also include a different set of scribes than those used for fine-tuning the network, in order to assess the generalization power of the network. In this case, the test set will be structured as follows

dataset
- test
    - Scribe α
        - page 1
        - page 2
        - ...
    - Scribe β
        - page 11
        - page 12
        - ...    
    - ... 

Dataset directory

In [None]:
# Root folder of the labeled dataset used for Handwriting Identification -- set your own path here.
dataset_dir_HI = "./scribe_dataset"

## Downstream task configuration

References for the parameter selection
- [Self-supervised learning for medieval handwriting identification: A case study from the Vatican Apostolic Library](https://www.sciencedirect.com/science/article/pii/S0306457322000097)
- [TORCH.OPTIM](https://pytorch.org/docs/stable/optim.html)
- [LINEAR WARMUP COSINE ANNEALING](https://pytorch-lightning-bolts.readthedocs.io/en/stable/schedulers/warmup_cosine_annealing.html)
- [Triplet Loss and Online Triplet Mining in TensorFlow](https://omoindrot.github.io/triplet-loss)
- [TORCHVISION.TRANSFORMS](https://pytorch.org/vision/0.9/transforms.html)

Load the base configuration file to modify as you wish

In [None]:
# Read the base fine-tuning configuration; the following cells override individual entries.
with open('config/config.yaml', "r") as base_config_file:
    # safe_load is the shorthand for yaml.load(..., Loader=yaml.SafeLoader)
    exp_config_HI = yaml.safe_load(base_config_file)

General parameters

In [None]:
# Work on the 'general' section through a short alias (same dict object as exp_config_HI['general']).
hi_general_cfg = exp_config_HI['general']

hi_general_cfg['test_id'] = 'HI_0'       # Change here to set the name of your experiment
hi_general_cfg['seed'] = 1       # Global seed

Model parameters

In [None]:
# Work on the 'model' section through a short alias (same dict object as exp_config_HI['model']).
hi_model_cfg = exp_config_HI['model']

hi_model_cfg['num_classes'] = 23       # Number of classes (scribes) in the training set
hi_model_cfg['emb_width'] = 1024       # Size of the page embeddings
hi_model_cfg['pretraining'] = 'obow'       # Pretraining type (either "obow", "imagenet", or None)
hi_model_cfg['mode'] = 'frozen'       # Set to "frozen" to freeze the backbone model weights and train the final linear layers only
# If "pretraining" is set to "obow", change here to the corresponding model checkpoints path
# (the default is built from dest_dir and exp_name set in the pre-training section).
hi_model_cfg['cp_path'] = f'{dest_dir}/{exp_name}/checkpoints.pth.tar'

Optimization parameters

In [None]:
# Work on the 'optim' section through a short alias (same dict object as exp_config_HI['optim']).
hi_optim_cfg = exp_config_HI['optim']

# Optimizer
hi_optim_cfg['optim_type'] = 'sgd'       # Optimizer type (either "sgd" or "adam")
hi_optim_cfg['momentum'] = 0.9
hi_optim_cfg['weight_decay'] = 0.0001
hi_optim_cfg['nesterov'] = False
hi_optim_cfg['num_epochs'] = 100       # Number of epochs
hi_optim_cfg['lr'] = 0.6
hi_optim_cfg['beta'] = [0.9, 0.999]
hi_optim_cfg['end_lr'] = 0.0015

# Learning-rate schedule
hi_optim_cfg['lr_schedule_type'] = 'cos_warmup'       # Scheduler type (either "cos_warmup", "step_lr", "exp", or "red_on_plateau")
hi_optim_cfg['step'] = 10
hi_optim_cfg['gamma'] = 0.1
hi_optim_cfg['patience'] = 10
hi_optim_cfg['warmup_epochs'] = 10
hi_optim_cfg['warmup_start_lr'] = 0.15

# Triplet loss (the 'loss' sub-section must already exist in the loaded configuration)
hi_optim_cfg['loss']['margin'] = 0.2       # Margin parameter of the triplet margin loss
hi_optim_cfg['loss']['squared'] = False        # Squared euclidean embedding distance (True) vs euclidean embedding distance (False)

Data augmentation parameters

In [None]:
# Work on the 'data' section through a short alias (same dict object as exp_config_HI['data']).
hi_data_cfg = exp_config_HI['data']

hi_data_cfg['batch_size'] = 256       # Batch size

# The 'transforms' sub-section must already exist in the loaded configuration.
hi_transforms_cfg = hi_data_cfg['transforms']

hi_transforms_cfg['img_crop_size'] = 380       # Size of the image extracted from a given page x and fed to the network
hi_transforms_cfg['cjitter'] = dict(       # Color jitter parameters
    brightness=[0.4, 1.3],
    contrast=0.6,
    saturation=0.6,
    hue=0.4,
)
hi_transforms_cfg['cjitter_p'] = 1       # Probability of using color jittering
hi_transforms_cfg['randaffine'] = dict(       # Random affine transformation
    degrees=[-10, 10],
    translate=[0.2, 0.2],
    scale=[1.3, 1.4],
    shear=1,
)
hi_transforms_cfg['randpersp'] = dict(       # Random perspective transformation
    distortion_scale=0.1,
    p=0.2,
)
hi_transforms_cfg['gray_p'] = 0.2       # Probability of converting an image to grayscale
hi_transforms_cfg['gaussian_blur'] = dict(       # Gaussian blur parameters
    kernel_size=3,
    sigma=[0.1, 0.5],
)
hi_transforms_cfg['rand_eras'] = dict(       # Parameters of the random erasing transformation
    p=0.5,
    scale=[0.02, 0.33],
    ratio=[0.3, 3.3],
    value=0,
)
hi_transforms_cfg['invert_p'] = 0.05       # Probability of inverting the colors of the RGB image
hi_transforms_cfg['n_test_crops'] = 10       # Number of crops extracted from page x and used to generate an average embedding of such page at test time

hi_data_cfg['weighted_sampling'] = False       # Weighted sampling option

Test parameters

In [None]:
# Work on the 'test' section through a short alias (same dict object as exp_config_HI['test']).
hi_test_cfg = exp_config_HI['test']

hi_test_cfg['ratio_train'] = 50       # Percentage of the training samples involved in the performance assessment
hi_test_cfg['ratio_val'] = 100       # Percentage of the validation/test samples involved in the performance assessment

## Check augmentations

In [None]:
# Options for a small, un-shuffled training loader that is only used to preview augmentations.
# NOTE(review): mean ~ 0 and std = 1 presumably leave pixel values essentially unnormalized,
# and the meaning of `amount` / `selection` is defined in Standard_DataLoader -- confirm there.
preview_loader_options = dict(
    batch_size=8,
    weighted_sampling=False,
    phase='train',
    mean=[1e-9, 1e-9, 1e-9],
    std=[1, 1, 1],
    shuffle=False,
    amount=0.3,
    selection=False,
)

SD = Standard_DataLoader(
    f'{dataset_dir_HI}/train',
    exp_config_HI['data']['transforms'],
    **preview_loader_options,
)

# load_data() returns a pair; only its first element (the dataset) is needed here.
dataset_train_HI, _ = SD.load_data()

# Converter used below to turn a dataset item into a PIL image.
TTP = T.ToPILImage()

Choose an image to visualize

In [None]:
# Ask which training sample to inspect. The answer is parsed once and checked
# against the range shown in the prompt, so a typo or a negative number is
# reported here instead of being passed on to the dataset.
idx = int(input(f'Enter an index in the range 0 - {len(dataset_train_HI) - 1}: '))
if not 0 <= idx < len(dataset_train_HI):
    raise IndexError(f'Index {idx} is outside the range 0 - {len(dataset_train_HI) - 1}')

# Last expression of the cell: the first element of the item, rendered as a PIL image.
TTP(dataset_train_HI[idx][0])

## Save the new configuration file

In [None]:
# The configuration file is named after the experiment id set in the 'general' section.
config_path_HI = f'config/{exp_config_HI["general"]["test_id"]}.yaml'

# Write the edited configuration in block style, keeping the current key order.
with open(config_path_HI, mode='w') as config_file:
    yaml.safe_dump(
        exp_config_HI,
        config_file,
        default_flow_style=False,
        sort_keys=False,
    )

## Fine-tune the encoder on the HI task and save checkpoints

When the process is launched, two folders are created:
- **data**, where losses and plots are saved
- **model/checkpoints**, where checkpoints are saved

In [None]:
!python main.py -dir=./ -td={dataset_dir_HI}/train -vd={dataset_dir_HI}/val -c={exp_config_HI['general']['test_id']}

After training, you can assess the performance of the best model (according to the validation loss) on the test set with the following command:

In [None]:
!python main_test.py -dir=./ -td={dataset_dir_HI}/train -vd={dataset_dir_HI}/test -c={exp_config_HI['general']['test_id']}