# Model Trainer



In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from Model import TransformerModel
from ParameterPartitioner import multiple_partition, apply_init, optim_groups

# Hyper-parameters for a deliberately tiny TransformerModel, passed
# positionally to the constructor below.
# NOTE(review): the meanings are read off the names and the printed module
# tree further down; confirm against Model.TransformerModel.
ntoken = 10  # presumably the vocabulary size - TODO confirm
ninp = 8  # model width; the printed Linear/LayerNorm layers are 8 wide
nhead = 4  # presumably the number of attention heads - TODO confirm
nhid = 7  # feed-forward width; printed linear1 has out_features=7
nlayers = 2  # the printed encoder holds layers (0) and (1)
dropout = 0.1  # the printed Dropout modules use p=0.1

model = TransformerModel(ntoken, ninp, nhead, nhid, nlayers, dropout)
# print([pn for pn, p in model.named_modules()])

#### Initialise the model parameters

In [5]:
# print([str(m).split('(')[0] for m in model.modules()])

# Initialisation recipes. Each value is a tuple of
# (init function, extra positional args, keyword args).
# NOTE(review): the keys look like module class names (see the commented-out
# `str(m).split('(')[0]` probe above) - confirm against ParameterPartitioner.
weight_kwargs = {'Embedding': (nn.init.normal_, [], {'mean':0, 'std':0.02}),
                 'Linear':    (nn.init.normal_, [], {'mean':0, 'std':0.02}),
                 'LayerNorm': (nn.init.constant_, [1], {})}
# Every bias falls back to the 'Default' recipe: constant zero.
bias_kwargs =   {'Default':   (nn.init.constant_, [0], {})}
# The helper is imported as `apply_init`; the previous call used the
# un-imported name `apply_initialisation` and raised NameError.
apply_init(model, (weight_kwargs, bias_kwargs))#, pr=True

#### Give parameter groups different optimiser settings

In [6]:
# Shared AdamW settings; per-group overrides are supplied in typ_opt_kwargs.
lr = 3e-4
betas = (0.9, 0.95)

# Inputs for the parameter partitioner.
# NOTE(review): the exact semantics of `conditions` and `combine` are defined
# by ParameterPartitioner.multiple_partition - confirm there.
conditions = ['weight', 'linear']
combine = {'linear weights':[[1,1]], 'other':None}

# Only the 'linear weights' group is weight-decayed; 'other' gets none.
typ_opt_kwargs = {'linear weights': {"weight_decay": 0.1},
                  'other': {"weight_decay": 0.0}}

partition = multiple_partition(model, conditions, comb=combine)
# The helper is imported as `optim_groups`; the previous call used the
# un-imported name `optimisation_groups` and raised NameError.
opt_groups = optim_groups(model, partition, typ_opt_kwargs)

# for k, v in partition.items(): print('\n'+k+'\n%s'%v)
optimiser = torch.optim.AdamW(opt_groups, lr=lr, betas=betas)

## Load the data

Make it into a generator

In [None]:
class CharDataset(Dataset):
    """Character-level dataset yielding (input, target) index tensors.

    Arrange data and targets so that the first i elements of x
    will be asked to predict the i-th element of y. Notice that
    the eventual language model will actually make block_size
    individual predictions at the same time based on this data,
    so we are being clever and amortizing the cost of the forward
    pass of the network. So for example if block_size is 4, then
    we could e.g. sample a chunk of text "hello", the integers in
    x will correspond to "hell" and in y will be "ello". This will
    then actually "multitask" 4 separate examples at the same time
    in the language model:
    - given just "h", please predict "e" as next
    - given "he" please predict "l" next
    - given "hel" predict "l" next
    - given "hell" predict "o" next

    In addition, because the DataLoader will create batches of examples,
    every forward/backward pass during training will simultaneously train
    a LOT of predictions, amortizing a lot of computation. In particular,
    for a batched input of integers X (B, T) where B is batch size and
    T is block_size and Y (B, T), the network will during training be
    simultaneously training to make B*T predictions, all at once! Of course,
    at test time we can parallelize across batch B, but unlike during training
    we cannot parallelize across the time dimension T - we have to run
    a forward pass of the network to recover the next single character of the
    sequence along each batch dimension, and repeatedly always feed in a next
    character to get the next one.

    So yes there is a big asymmetry between train/test time of autoregressive
    models. During training we can go B*T at a time with every forward pass,
    but during test time we can only go B at a time, T times, with T forward
    passes.

    Args:
        data: Sliceable sequence of hashable symbols (e.g. a ``str``).
        block_size: Number of symbols in every input/target window.
    """

    def __init__(self, data, block_size):
        # The vocabulary is the sorted set of distinct symbols; sorted()
        # accepts any iterable, so no intermediate list is needed.
        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        # Symbol <-> integer index lookup tables.
        self.s2i = {ch: i for i, ch in enumerate(chars)}
        self.i2s = dict(enumerate(chars))
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        # Each item needs block_size + 1 symbols. Clamp at zero so that data
        # shorter than one window gives an empty dataset instead of making
        # len() raise ValueError on a negative length.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, i):
        """Return ``(x, y)`` where ``y`` is ``x`` shifted one symbol ahead."""
        # Grab block_size + 1 symbols so inputs and targets overlap by all
        # but one position.
        chunk = self.data[i:i + self.block_size + 1]
        dix = [self.s2i[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

# DataLoader with better default settings
# NOTE(review): `data`, `batch_size` and `workers` are not defined in the
# cells visible here; `data` presumably needs to be a CharDataset instance
# rather than raw text - confirm before running.
loader = DataLoader(data, shuffle=True, pin_memory=True,
                    batch_size=batch_size, num_workers=workers)

# Use the current CUDA device when one is available, otherwise stay on CPU.
parallel = torch.cuda.is_available()
device = 'cpu'
if parallel:
    device = torch.cuda.current_device()
    # `torch.nn.DistributedParallel` does not exist and raised AttributeError.
    # DataParallel is the single-process wrapper; the distributed alternative,
    # torch.nn.parallel.DistributedDataParallel, additionally requires
    # torch.distributed.init_process_group() to have been called.
    model = torch.nn.DataParallel(model).to(device)

## Train the model

`model.train()` tells your model that it is being trained. Layers such as dropout and batch norm, which behave differently during training and evaluation, use this flag to decide how to behave.

More details: it sets the module's mode to training (see the source code). You can call either `model.eval()` or `model.train(mode=False)` to switch back to evaluation mode.

In [7]:
# Put the model into training mode before optimising.
model.train()

# Clear stale gradients by dropping the tensors entirely (rather than
# zero-filling them) so the next backward pass starts from scratch.
for p in model.parameters():
    p.grad = None

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=8, out_features=8, bias=True)
        )
        (linear1): Linear(in_features=8, out_features=7, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=7, out_features=8, bias=True)
        (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=8, out_features=8, bias=True)
        )
        (linear1): Linear(in_features=8, out_features=7, bias=True)
        (d

In [None]:
# Switch the model to evaluation mode, the counterpart of model.train(), so
# that train/test-dependent layers such as dropout use inference behaviour.
model.eval()