In [9]:
from tensorflow.config import list_physical_devices
from tensorflow.keras import mixed_precision
from tensorflow.config.experimental import set_memory_growth
from tensorflow.keras import callbacks

### Variable setup

In [2]:
# --- Model hyperparameters (all passed on to ModelConfigurator below) ---
EMBED_DIM = 256
NUM_LAYERS = 8
NUM_HEAD = 8
FF_DIM = 1024
MAX_LEN = 512   # same value the data generators receive as max_len

# --- Training hyperparameters ---
LR = 4e-4          # base learning rate; the epoch scheduler scales it down later
BATCH_SIZE = 256   # used to turn sample counts into steps per epoch

### GPU setup

In [3]:
# Enable memory growth on every GPU TensorFlow reports. Iterating over an
# empty list does nothing, so a CPU-only machine needs no separate check.
gpus = list_physical_devices('GPU')
for gpu in gpus:
    set_memory_growth(gpu, True)

# Switch Keras to the 'mixed_float16' dtype policy for all layers created
# from here on.
mixed_precision.set_global_policy('mixed_float16')

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3090, compute capability 8.6


2022-10-17 00:56:34.572685: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-17 00:56:34.577491: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-17 00:56:34.577824: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-17 00:56:34.578643: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


# Pre-processing

In [4]:
from os import listdir
import gdown
from processing import load_dicts, count_samples, data_generator

### Download the data

In [5]:
# Shared Google Drive folder to fetch (presumably the paragraph text files
# read from 'data/' further down — confirm).
folder_url = 'https://drive.google.com/drive/folders/1rk9HhT6OtrGlqC2ZGT0eREFc8FH2F8QJ?usp=sharing'
# Only download when the current working directory has no 'data' entry yet,
# so re-running the cell does not fetch the folder again.
if 'data' not in listdir():
    gdown.download_folder(folder_url)
    # IPython shell escape: rename the downloaded folder to 'data'.
    # NOTE(review): assumes gdown saves the folder as 'char_paragraphs' and
    # that a POSIX `mv` is available — confirm on other platforms.
    ! mv char_paragraphs data

### Load the dictionaries

In [6]:
# Load the two lookup tables returned by load_dicts (by their names, a
# character -> index map and its inverse).
char2idx, idx2char = load_dicts()

# The vocabulary size is the number of entries in char2idx; the mask id is
# set to the same number, i.e. one past the largest index if the indices are
# 0-based and contiguous.
# NOTE(review): because VOCAB_SIZE == MASK_ID, a model sized with VOCAB_SIZE
# would not have a slot for index MASK_ID — confirm how create_mlm and
# data_generator treat the mask token.
VOCAB_SIZE = len(char2idx)
MASK_ID = VOCAB_SIZE

### Make data generators

In [7]:
# Count the samples in each split and derive the number of batches needed to
# cover it once: int(n / BATCH_SIZE) + 1.
# Note: this formula yields one extra step when n is an exact multiple of
# BATCH_SIZE.
N_TRAIN = count_samples('data/train_paragraphs.txt')
N_STEPS = int(N_TRAIN/BATCH_SIZE)+1

N_VAL = count_samples('data/val_paragraphs.txt')
N_STEPS_VAL = int(N_VAL/BATCH_SIZE)+1

N_TEST = count_samples('data/test_paragraphs.txt')
# Fix: this value used to be assigned to N_STEPS_VAL, silently replacing the
# validation step count with the test one. It now has its own name.
N_STEPS_TEST = int(N_TEST/BATCH_SIZE)+1

print(' Train, Val, Test\n', N_TRAIN, N_VAL, N_TEST)

 Train, Val, Test
 1159732 144965 144969


In [8]:
# Arguments shared by all three generators. max_len and batch_size come from
# the constants cell (previously hard-coded as 512 and 256 here) so the
# generators cannot drift from the model configuration or the step counts.
# NOTE(review): MASK_ID is defined in the dictionary cell, yet mask_id=None is
# passed here — confirm data_generator picks the mask id itself.
gen_kwargs = dict(
    max_len=MAX_LEN,
    masking=True,
    batch_size=BATCH_SIZE,
    vocab=char2idx,
    mask_id=None,
)

# One generator per split; only the input file differs.
train_gen = data_generator('data/train_paragraphs.txt', **gen_kwargs)
val_gen = data_generator('data/val_paragraphs.txt', **gen_kwargs)
test_gen = data_generator('data/test_paragraphs.txt', **gen_kwargs)

# Modeling

In [None]:
from modeling import ModelConfigurator, MaskedLanguageModel, create_mlm, plot_history

### Set up model variables

In [None]:
# Collect the hyperparameters from the constants cell into one configurator.
# The arguments are positional, so their order must match the parameter order
# of ModelConfigurator in modeling.py.
mc = ModelConfigurator(
    MAX_LEN,
    NUM_HEAD,
    FF_DIM,
    NUM_LAYERS,
    EMBED_DIM,
    LR,
)

### Create the model

In [None]:
# Build the masked language model from the configurator and vocabulary size.
masked_model = create_mlm(mc, VOCAB_SIZE)
# Print the model summary (presumably the Keras layer/parameter table).
masked_model.summary()

### Train the model

In [None]:
def scheduler(epoch, lr):
    """Return the learning rate to use for a given epoch.

    A three-stage step schedule derived from the global base rate ``LR``:

    * epochs 0-24:  ``LR``
    * epochs 25-49: ``LR/6``
    * epoch 50 on:  ``LR/36``

    Args:
        epoch: Zero-based epoch index supplied by the scheduler callback.
        lr: Current learning rate supplied by the callback. It is ignored;
            the result depends only on ``epoch`` and ``LR``.

    Returns:
        The learning rate for ``epoch``.
    """
    if epoch < 25:
        return LR
    if epoch < 50:
        return LR/6
    return LR/36
# Callback that asks `scheduler` for the learning rate of each epoch.
lr_callback = callbacks.LearningRateScheduler(scheduler)

# Callback that writes the model to disk only when val_loss reaches a new
# minimum, logging each save (verbose=1).
checkpoint = callbacks.ModelCheckpoint(
    filepath='models/TF_char_6412k',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Train for 70 epochs. Both inputs are generators, so the length of a
# training epoch and of a validation pass is given explicitly in steps.
hist = masked_model.fit(
    train_gen,
    validation_data=val_gen,
    steps_per_epoch=N_STEPS,
    validation_steps=N_STEPS_VAL,
    epochs=70,
    callbacks=[lr_callback, checkpoint],
)

# Evaluation

### Plot training metrics

In [None]:
# Plot the metrics recorded in `hist`, the object returned by fit() above.
plot_history(hist)

### Evaluate on test set

In [None]:
# Evaluate the trained model on the test generator.
# `steps` limits the run to one pass over the test split, using the same
# formula as the training/validation step counts. Training draws 70 epochs
# from a single generator, which suggests the generators yield indefinitely;
# without `steps`, evaluate() would then never return.
# `batch_size` is dropped: the generator already yields whole batches, and
# Keras documents that it must not be specified for generator input.
masked_model.evaluate(test_gen, steps=int(N_TEST/BATCH_SIZE)+1)