### 0. Import libraries

In [1]:
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl

from torchmetrics.text import CharErrorRate
from train import get_data

from tools import load_model, predict_train_valid, parse_arguments
from ensemble import *

KeyboardInterrupt: 

### 1. Predict on validation and train set

In [None]:
# Checkpoint to score on the validation split; load_model yields the model,
# its converter and the run arguments used by everything below.
name = 'model36_synth'
model, converter, args = load_model(name)

# Set seed
pl.seed_everything(args.seed)
# benchmark=True lets cuDNN auto-tune its algorithm choice on every run, which can
# change results between runs and works against the deterministic flag below.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Get the data (only val_loader is consumed by this cell)
train_loader, val_loader, test_loader, train_set, val_set, test_set = get_data(args.batch_size, args.seed, args)

# One row per sample: prediction, reference text, CER and confidence,
# saved under the model name for the hill-climbing step.
preds_lst, reals_lst, cer_lst, confidences = predict_train_valid(model, converter, val_loader, args)
frame = pd.DataFrame({'pred': preds_lst, 'real': reals_lst, 'cer': cer_lst, 'confidence': confidences})
frame.to_csv(f'ensemble/val/{name}.csv', index=False)

device: cuda


FileNotFoundError: [Errno 2] No such file or directory: 'saved_models/model36_synth.pt'

In [None]:
# Checkpoint to score on the training split; load_model yields the model,
# its converter and the run arguments used by everything below.
name = 'model3_tone_full'
model, converter, args = load_model(name)

# Set seed
pl.seed_everything(args.seed)
# benchmark=True lets cuDNN auto-tune its algorithm choice on every run, which can
# change results between runs and works against the deterministic flag below.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Get the data (only train_loader is consumed by this cell)
train_loader, val_loader, test_loader, train_set, val_set, test_set = get_data(args.batch_size, args.seed, args)

preds_lst, reals_lst, cer_lst, confidences = predict_train_valid(model, converter, train_loader, args)
frame = pd.DataFrame({'pred': preds_lst, 'real': reals_lst, 'cer': cer_lst, 'confidence': confidences})
# NOTE(review): the output name is fixed and does not follow `name` — confirm this is intended.
frame.to_csv('ensemble/train/best_vocab_full.csv', index=False)

device: cuda


Global seed set to 42
Global seed set to 42


Using all training data for training
device: cuda


KeyboardInterrupt: 

In [None]:
from tools import make_submission
from test import predict

# Checkpoint used for the submission; load_model yields the model,
# its converter and the run arguments used by everything below.
name = 'model22_full'
model, converter, args = load_model(name)

# Set seed
pl.seed_everything(args.seed)
# benchmark=True lets cuDNN auto-tune its algorithm choice on every run, which can
# change results between runs and works against the deterministic flag below.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Get the data (only test_loader is consumed by this cell)
train_loader, val_loader, test_loader, train_set, val_set, test_set = get_data(args.batch_size, args.seed, args)

# Make submission
preds, img_names, confidences = predict(model, test_loader, converter, args.prediction, args.max_len, args.transformer)
make_submission(preds, img_names, args.model_name)

# Save the confidence for later ensemble
df = pd.DataFrame({'img_name': img_names, 'confidence': confidences, 'pred': preds})
df.to_csv(f'ensemble/test/{args.model_name}.csv', index=False)

device: cuda


Global seed set to 42
Global seed set to 42


Using all training data for training
device: cuda


### 2. Hill Climbing

In [None]:
# Every model whose validation predictions take part in the search.
models_lst = [
    'model1', 'model2', 'model3', 'model4', 'model5', 
    'model6', 'model7', 'model8', 'model9', 'model10',  'model11',
    'model12', 'model13', 'model14', 'model15', 
    'model16', 'model17', 'model18', 'model19',
    'model20', 'model21', 'model22', 'model23', 'model24',
    'model25', 'model26', 'model27', 'model28',
    'model29', 'model30', 'model31', 'model32',
    'model2_new', 'model3_new', 'model4_new', 'model5_new', 'model7_new', 
    'model9_new', 'model10_new', 'model15_new', 'model18_new', 'model20_new',
    'model3_tone', 'model4_tone', 'model5_tone', 'model7_tone', 'model9_tone', 
    'model10_tone', 'model30_tone',
    'model1_synth',  'model2_synth', 'model3_synth', 'model4_synth', 
    'model5_synth', 'model6_synth', 'model7_synth', 'model8_synth' ,'model9_synth', 
    'model10_synth', 'model26_synth', 'model28_synth', 'model30_synth',
    'model4_synth_new', 'model10_synth_new', 'model15_synth_new', 'model20_synth_new',
    'model5_synth_tone', 'model19_synth_tone', 'model15_synth_tone',
    'model4_synth_new_tone', 'model19_synth_new_tone',
]

# The ensemble the search starts from; it is grown in place, one model per round.
initial_baseline = ['model15_new', ]
candidates = [model for model in models_lst if model not in initial_baseline]

# Read the data
val_model_frame = get_model_frame(models_lst)

# Initial baseline
cer_test = 1.0
best_cer_test = 1.0
winning_candidates = None

if initial_baseline:
    cer_test, _ = compute_vote_cer(val_model_frame, initial_baseline)
    # The baseline's own score is the reference to beat; without this an extension
    # that is worse than the baseline could still be reported as the winner.
    best_cer_test = cer_test
    winning_candidates = list(initial_baseline)
    print(f'Initial baseline: {initial_baseline}')
    print(f'CER Valid: {cer_test}')

print('Start hill climbing...')
# Hill climbing: each round adds the candidate giving the lowest voting CER, even
# when that CER is worse than the previous round; the best prefix is tracked separately.
while candidates:
    best_candidate = None
    # Start from +inf so a round always selects its lowest-CER candidate,
    # including when every score is >= 1.0.
    best_cer = float('inf')
    for candidate in candidates:
        cer, _ = compute_vote_cer(val_model_frame, initial_baseline + [candidate])
        if cer < best_cer:
            best_cer = cer
            best_candidate = candidate

    if best_candidate is None:
        # No candidate produced a comparable score (e.g. all NaN): stop instead
        # of failing on candidates.remove(None).
        print('No comparable candidate left, stopping.')
        break

    if best_cer < best_cer_test:
        best_cer_test = best_cer
        winning_candidates = initial_baseline + [best_candidate]
    
    initial_baseline.append(best_candidate)
    candidates.remove(best_candidate)
    print('-' * 50)
    print(f'Add {best_candidate} to baseline')
    print(f'CER Valid: {best_cer}')
    # print(f'Current baseline: {initial_baseline}')
    print('-' * 50)


print(f'Best CER: {best_cer_test}')
print(f'Winning candidates: {winning_candidates}')
# winning_candidates stays None only when there was neither a baseline nor a usable candidate.
print(len(winning_candidates) if winning_candidates is not None else 0)

Initial baseline: ['model15_new']
CER Valid: 0.045425684369272655
Start hill climbing...
--------------------------------------------------
Add model4_synth to baseline
CER Valid: 0.039306642833081155
--------------------------------------------------
--------------------------------------------------
Add model3 to baseline
CER Valid: 0.03816060668372024
--------------------------------------------------
--------------------------------------------------
Add model12 to baseline
CER Valid: 0.03752320148977968
--------------------------------------------------
--------------------------------------------------
Add model7 to baseline
CER Valid: 0.03694777612303122
--------------------------------------------------
--------------------------------------------------
Add model6 to baseline
CER Valid: 0.03641353665743331
--------------------------------------------------
--------------------------------------------------
Add model22 to baseline
CER Valid: 0.03608215959254178
-----------------

In [None]:
# Models whose validation predictions are loaded for this check.
models_lst = [
    'model1', 'model2', 'model3', 'model4', 'model5',
    'model6', 'model7', 'model8', 'model9', 'model10', 'model11',
    'model12', 'model13', 'model14', 'model16', 'model15',
    'model1_synth', 'model2_synth', 'model3_synth',
    'model4_synth', 'model5_synth', 'model7_synth',
]

# Read the data
val_model_frame = get_model_frame(models_lst)

# Hand-picked ensemble to score, in the order it is passed to the vote.
vote_members = [
    'model5_synth', 'model15', 'model7', 'model3', 'model4_synth',
    'model9', 'model4', 'model3_synth', 'model5', 'model10',
    'model2', 'model1_synth', 'model1',
]

# Left as the last expression so the notebook displays the result.
compute_vote_cer(val_model_frame, vote_members)

tensor(0.0337)

### 3. Make final submission

In [None]:
########### BEST ############

from ensemble import make_test_prediction

# Ensemble members for the final submission, in the order they are passed on.
winning_candidates = [
    'model5_synth_full',
    'model9_full',
    'model7_full',
    'model4_full',
    'model15_full',
    'model10_synth_full',
    'model10_full',
    'model3_full',
    'model4_synth_full',
    'model2_synth_full',
    'model5_full',
]

# Options forwarded unchanged to make_test_prediction.
ensemble_options = {'mask': True, 'alpha': 0.5625, 'case_sensitive': False}

pred = make_test_prediction(winning_candidates, **ensemble_options)

### 4. Test augmentation

In [None]:
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader, Subset
from torchvision import transforms

import numpy as np
import pandas as pd
import pickle
from dataset import HandWrittenDataset, Align, collate_fn, OtsuGrayscale
from config import LABEL_FILE, PUBLIC_TEST_DIR, TRAIN_DIR, PRIVATE_TEST_DIR
from dataset import RotationTransform
from test import predict
from tools import  load_model


def get_test_data(
        batch_size: int = 64,
        seed: int = 42,
        degree: int = 0,
        args=None
    ):
    """
    Build the test data loader with a fixed rotation applied to every image

    Arguments:
    ----------

    batch_size: int (default: 64)
        The batch size to use for the data loader

    seed: int (default: 42)
        The seed passed to ``pl.seed_everything`` and ``np.random.seed``

    degree: int (default: 0)
        The angle handed to ``RotationTransform``, which runs before the
        grayscale and align steps

    args:
        The arguments passed to the program (required). ``grayscale``,
        ``height``, ``width``, ``keep_ratio_with_pad`` and ``transformer`` are
        read from it, plus ``otsu`` when ``grayscale`` is set
        
    Returns:
    --------
        test_loader

    Raises:
    -------
        ValueError
            If ``args`` is None
    """
    # args defaults to None only so it can be passed by keyword; fail early with a
    # clear message instead of an AttributeError further down.
    if args is None:
        raise ValueError('args is required to build the test transforms')

    pl.seed_everything(seed)
    np.random.seed(seed)
    
    if args.grayscale:
        if args.otsu:
            grayscale = OtsuGrayscale()
        else:
            grayscale = transforms.Grayscale()
        align = Align(1, args.height, args.width, args.keep_ratio_with_pad, args.transformer)  # 1 channel for grayscale
    else:
        grayscale = transforms.Compose([])  # Do nothing
        align = Align(3, args.height, args.width, args.keep_ratio_with_pad, args.transformer)
    
    # Rotation comes first, then the colour handling, then alignment.
    test_transform = transforms.Compose([
        RotationTransform(degree),
        grayscale,
        align
    ])

    # NOTE(review): this reads PUBLIC_TEST_DIR, while the cells below merge the result
    # with CSVs under ensemble/private_test — confirm the directory is the intended one.
    test_dataset = HandWrittenDataset(
        PUBLIC_TEST_DIR,
        name='public_test_img', transform=test_transform
    )

    # shuffle=False keeps the sample order stable across calls.
    test_loader = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=2
    )

    return test_loader

In [None]:
# Checkpoint used for test-time augmentation; load_model yields the model,
# its converter and the run arguments used by everything below.
name = 'model3_tone_full'
model, converter, args = load_model(name)

# Set seed
pl.seed_everything(args.seed)
# benchmark=True lets cuDNN auto-tune its algorithm choice on every run, which can
# change results between runs and works against the deterministic flag below.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# First frame: predictions already saved for this model (presumably the un-rotated
# run — confirm). na_filter=False keeps empty predictions as '' instead of NaN.
test_frames = [pd.read_csv(f'ensemble/private_test/{name}.csv', na_filter=False)]

# One extra prediction pass per rotation angle.
for degree in [-10, 10]:
    # Get the data
    test_loader = get_test_data(args.batch_size, args.seed, args=args, degree=degree)
    preds, img_names, confidences = predict(model, test_loader, converter, args.prediction, args.max_len, args.transformer)
    test_frame = pd.DataFrame({'img_name': img_names, 'pred': preds, 'confidence': confidences})
    test_frames.append(test_frame)

In [None]:
# The frames are combined purely by row position, so they must describe the same
# images in the same order; fail loudly instead of silently mixing up predictions.
reference_names = [str(img_name) for img_name in test_frames[0]['img_name']]
for frame_idx, test_frame in enumerate(test_frames[1:], start=1):
    frame_names = [str(img_name) for img_name in test_frame['img_name']]
    if frame_names != reference_names:
        raise ValueError(
            f'test_frames[{frame_idx}] does not list the same images in the same order as test_frames[0]'
        )

# Rows are images, columns are the individual prediction passes.
confidence_matrix = np.array([test_frame['confidence'] for test_frame in test_frames]).T
test_predictions = np.array([test_frame['pred'] for test_frame in test_frames]).T

# For every image keep the prediction of the most confident pass.
test_idx = np.argmax(confidence_matrix, axis=1)
test_pred = [row[best_pass] for row, best_pass in zip(test_predictions, test_idx)]
test_img_names = test_frames[0]['img_name']
test_confidences = np.max(confidence_matrix, axis=1)

test = pd.DataFrame({'img_name': test_img_names, 'pred': test_pred, 'confidence': test_confidences})
test.to_csv(f'ensemble/private_test/{name}_aug.csv', index=False)