# Model Trainer



In [4]:
import torch, torch.nn as nn, torch.optim as optim
from ParameterPartitioner import multiple_partition, apply_initialisation, optimisation_groups
from Model import TransformerModel

# Fix the RNG seed before any weights are created so that the random
# initialisation (and dropout masks during training) are reproducible
# under Restart Kernel -> Run All.
SEED = 0
torch.manual_seed(SEED)

# Hyper-parameters, passed positionally to TransformerModel below.
# NOTE(review): the meanings are inferred from the names and from the
# model repr printed by the `model.train()` cell — confirm against Model.py.
ntoken = 10    # presumably the vocabulary size
ninp = 8       # presumably the embedding width (repr shows Linear in/out features of 8)
nhead = 4      # presumably the number of attention heads
nhid = 7       # presumably the feed-forward hidden width (repr shows linear1: 8 -> 7)
nlayers = 2    # presumably the number of encoder layers (repr shows layers 0 and 1)
dropout = 0.1  # presumably the dropout probability (repr shows Dropout(p=0.1))

model = TransformerModel(ntoken, ninp, nhead, nhid, nlayers, dropout)
# Debug aid: uncomment to list the names of all sub-modules.
# print([pn for pn, p in model.named_modules()])

#### Initialise the model parameters

In [5]:
# Debug aid: uncomment to list the class name of every module in the model
# (these are the strings used as keys in the dictionaries below).
# print([str(m).split('(')[0] for m in model.modules()])

# Initialisation recipes. Each value is a triple of
# (init function, positional args, keyword args).
# NOTE(review): how the keys are matched and how the triples are applied is
# decided inside ParameterPartitioner.apply_initialisation — confirm there.
weight_kwargs = {
    'Embedding': (nn.init.normal_, [], {'mean': 0, 'std': 0.02}),
    'Linear': (nn.init.normal_, [], {'mean': 0, 'std': 0.02}),
    'LayerNorm': (nn.init.constant_, [1], {}),
}
bias_kwargs = {
    'Default': (nn.init.constant_, [0], {}),
}

# Weight recipes first, bias recipes second.
# (The original cell notes an optional `pr=True` argument for this call.)
init_recipes = (weight_kwargs, bias_kwargs)
apply_initialisation(model, init_recipes)

#### Give parameters different optimisers

In [6]:
# Settings shared by every parameter group of the optimiser.
lr = 3e-4
betas = (0.9, 0.95)

# Inputs for the parameter partitioner.
# NOTE(review): the exact meaning of `conditions` and of the [[1, 1]] entry in
# `combine` is defined by ParameterPartitioner.multiple_partition — confirm there.
conditions = ['weight', 'linear']
combine = {
    'linear weights': [[1, 1]],
    'other': None,
}

# Per-group optimiser keyword arguments: only the 'linear weights' group
# gets a non-zero weight decay.
typ_opt_kwargs = {
    'linear weights': {"weight_decay": 0.1},
    'other': {"weight_decay": 0.0},
}

partition = multiple_partition(model, conditions, comb=combine)
opt_groups = optimisation_groups(model, partition, typ_opt_kwargs)

# Debug aid: uncomment to print each partition name and its contents.
# for k, v in partition.items(): print('\n'+k+'\n%s'%v)

# `optim` is the alias for torch.optim imported at the top of the notebook.
optimiser = optim.AdamW(opt_groups, lr=lr, betas=betas)

## Load the data

## Train the model

`model.train()` puts the model into training mode. Layers such as dropout and batch norm behave differently during training and evaluation, so this flag tells them which behaviour to use.

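More details: it only sets the mode flag to training (see the `torch.nn.Module.train` source code); it does not run any training itself. To switch back for evaluation, you can call either `model.eval()` or `model.train(False)`.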

In [7]:
# Switch the model to training mode. As the last expression in the cell, the
# call's return value is displayed, which is why the model structure is printed below.
model.train() 


TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=8, out_features=8, bias=True)
        )
        (linear1): Linear(in_features=8, out_features=7, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=7, out_features=8, bias=True)
        (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=8, out_features=8, bias=True)
        )
        (linear1): Linear(in_features=8, out_features=7, bias=True)
        (d

In [None]:
# Switch the model to evaluation mode (the counterpart of model.train()).
# NOTE(review): this cell has no execution count, so it has not been run yet.
model.eval()