transformer_lm.py
# Copyright (c) 2019 NVIDIA Corporation
import math
import nemo
from nemo.utils.lr_policies import CosineAnnealing
import nemo_nlp
from nemo_nlp.data.datasets.utils import LanguageModelDataDesc
from nemo_nlp.utils.callbacks.language_modeling import eval_iter_callback, \
    eval_epochs_done_callback

parser = nemo.utils.NemoArgParser(description='LM Transformer')
parser.set_defaults(
    train_dataset="train.txt",
    eval_dataset="valid.txt",
    work_dir="outputs/transformer_lm",
    optimizer_kind="novograd",
    amp_opt_level='O1',
    num_epochs=1000,
    batch_size=32,
    eval_batch_size=32,
    lr=0.002,
    beta1=0.95,
    beta2=0.25,
    weight_decay=0,
    warmup_steps=1000,
    max_steps=50000,
    iter_per_step=1,
    eval_freq=1000
)
parser.add_argument("--data_dir", default="data/lm/wikitext-2", type=str)
parser.add_argument("--dataset_name", default="wikitext-2", type=str)
parser.add_argument("--d_model", default=384, type=int)
parser.add_argument("--d_inner", default=1536, type=int)
parser.add_argument("--num_layers", default=12, type=int)
parser.add_argument("--num_attn_heads", default=6, type=int)
parser.add_argument("--embedding_dropout", default=0.2, type=float)
parser.add_argument("--ffn_dropout", default=0.2, type=float)
parser.add_argument("--attn_score_dropout", default=0.2, type=float)
parser.add_argument("--attn_layer_dropout", default=0.2, type=float)
parser.add_argument("--max_seq_length", default=256, type=int)
parser.add_argument("--do_lower_case", action='store_true')
parser.add_argument("--label_smoothing", default=0.1, type=float)
parser.add_argument("--beam_size", default=4, type=int)
parser.add_argument("--tokenizer_model", default="vocab.txt", type=str)
parser.add_argument("--predict_last_k", default=16, type=int)
parser.add_argument("--save_epoch_freq", default=1, type=int)
parser.add_argument("--save_step_freq", default=-1, type=int)
parser.add_argument("--interactive", action="store_true")
args = parser.parse_args()
"""
To get the data, go to tests/data and run get_wt2.sh
Then run create_vocab.py
"""
work_dir = f'{args.work_dir}/{args.dataset_name.upper()}'
nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch,
                                   local_rank=args.local_rank,
                                   optimization_level=args.amp_opt_level,
                                   log_dir=work_dir,
                                   create_tb_writer=True,
                                   files_to_copy=[__file__])
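# amp_opt_level 'O1' enables mixed-precision training via Apex AMP;
# create_tb_writer=True sets up the TensorBoard writer used by the callbacks below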
data_desc = LanguageModelDataDesc(
    args.dataset_name, args.data_dir, args.do_lower_case)
# define the tokenizer; in this example we use a word-level tokenizer.
# we also pad the vocabulary size to a multiple of 8 to accelerate
# training in fp16 mode with the use of Tensor Cores
tokenizer = nemo_nlp.WordTokenizer(f"{args.data_dir}/{args.tokenizer_model}")
vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8)
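# e.g. a 33,278-token vocabulary would be padded up to 33,280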
# instantiate the modules needed for the language modeling pipeline, namely
# the data layers, encoder, output log_softmax (token classifier)
# and loss function
encoder = nemo_nlp.TransformerEncoderNM(
    d_model=args.d_model,
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    embedding_dropout=args.embedding_dropout,
    num_attn_heads=args.num_attn_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=vocab_size,
    mask_future=True,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_seq_length)
log_softmax = nemo_nlp.TokenClassifier(args.d_model,
                                       num_classes=vocab_size,
                                       num_layers=1,
                                       log_softmax=True)
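# cross-entropy loss that ignores padding tokens and applies label smoothing
# (0.1 by default) to spread some probability mass over non-target tokens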
loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    pad_id=tokenizer.pad_id(),
    label_smoothing=args.label_smoothing)
# tie weight of embedding and log_softmax layers
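# (sharing these weights reduces the parameter count and typically improves perplexity)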
log_softmax.mlp.last_linear_layer.weight = \
    encoder.embedding_layer.token_embedding.weight
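
# helper that wires data layer -> encoder -> log_softmax -> loss
# for a given text file and returns the resulting loss tensor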
def create_pipeline(dataset,
                    max_seq_length=args.max_seq_length,
                    batch_step=args.max_seq_length,
                    batch_size=args.batch_size):
    data_layer = nemo_nlp.LanguageModelingDataLayer(dataset,
                                                    tokenizer,
                                                    max_seq_length,
                                                    batch_step,
                                                    batch_size=batch_size)
    src, src_mask, labels = data_layer()
    src_hiddens = encoder(input_ids=src, input_mask_src=src_mask)
    logits = log_softmax(hidden_states=src_hiddens)
    return loss(logits=logits, target_ids=labels)

train_loss = create_pipeline(f"{args.data_dir}/{args.train_dataset}",
                             args.max_seq_length,
                             batch_step=args.max_seq_length,
                             batch_size=args.batch_size)
eval_loss = create_pipeline(f"{args.data_dir}/{args.eval_dataset}",
                            args.max_seq_length,
                            batch_step=args.predict_last_k,
                            batch_size=args.eval_batch_size)
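# training windows do not overlap (batch_step == max_seq_length); at evaluation
# time the window advances by only predict_last_k tokens, presumably so that each
# scored token sees close to max_seq_length tokens of left context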
# callback which prints training loss once in a while
train_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=[train_loss],
    step_freq=100,
    print_func=lambda x: str(x[0].item()),
    get_tb_values=lambda x: [["loss", x[0]]],
    tb_writer=nf.tb_writer)
# callback which calculates evaluation loss
eval_callback = nemo.core.EvaluatorCallback(
    eval_tensors=[eval_loss],
    user_iter_callback=eval_iter_callback,
    user_epochs_done_callback=eval_epochs_done_callback,
    eval_step=args.eval_freq,
    tb_writer=nf.tb_writer)
# callback which saves checkpoints once in a while
callback_ckpt = nemo.core.CheckpointCallback(
    folder=nf.checkpoint_dir,
    epoch_freq=args.save_epoch_freq,
    step_freq=args.save_step_freq,
    checkpoints_to_keep=-1)
# define learning rate decay policy
lr_policy_fn = CosineAnnealing(args.max_steps, warmup_steps=args.warmup_steps)
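# (warm up the learning rate for warmup_steps, then decay it along a cosine
# curve over max_steps)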
# define and launch training algorithm (optimizer)
max_num_epochs = 0 if args.interactive else args.num_epochs
callbacks = [callback_ckpt]
if not args.interactive:
    callbacks.extend([train_callback, eval_callback])
nf.train(tensors_to_optimize=[train_loss],
         callbacks=callbacks,
         lr_policy=lr_policy_fn,
         batches_per_step=args.iter_per_step,
         optimizer=args.optimizer_kind,
         optimization_params={"num_epochs": max_num_epochs,
                              "lr": args.lr,
                              "weight_decay": args.weight_decay,
                              "betas": (args.beta1, args.beta2)})