# Training a GPT-2 Language Model

In this notebook we train a GPT-2 language model on the IMSLP and/or target data.  This code can be used to train two different language models: (a) one that is trained only on target data, and (b) one that is trained on IMSLP data and then finetuned on target data.  For (a), set `data_type = 'target'` and stop at the end of the section entitled "Train Language Model".  For (b), set `data_type = 'imslp'` and continue through the section entitled "Finetune Language Model".

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from pathlib import Path
import json
from train_utils import plotLosses
import train_utils

In [None]:
# Root directory for the BPE-encoded data, tokenizers and trained models.
# Resolved relative to the current user's home directory instead of a
# hard-coded /home/<user> prefix so the notebook runs under any account.
bpe_path = Path.home()/'.fastai'/'data'/'bscore_lm'/'bpe_data'
bpe_path.mkdir(exist_ok=True, parents=True)

### Train Language Model

In [None]:
# Corpus used for the initial training run. The value is interpolated into the
# data file, tokenizer and output directory names built in the next cell.
data_type = 'imslp' # 'target' or 'imslp'

# Fail here rather than later inside the training script when the value is mistyped.
if data_type not in ('target', 'imslp'):
    raise ValueError(f"data_type must be 'target' or 'imslp', got {data_type!r}")

In [None]:
# Locations used by the initial training run; every name is keyed on data_type.
# Training and validation text passed to the training script:
lm_train_file = bpe_path.joinpath(f'bpe_lm_{data_type}_train-augmented.txt')
lm_valid_file = bpe_path.joinpath(f'bpe_lm_{data_type}_valid-augmented.txt')
# Directory passed as both --config_name and --tokenizer_name:
tok_path = bpe_path.joinpath(f'tokenizer_{data_type}')
# Directory passed as --output_dir:
output_model_path = bpe_path.joinpath(f'models/gpt2_train-{data_type}_lm-augmented')

In [None]:
# Model hyperparameters that get serialized to config.json.
# Values that differ from the defaults:
#   vocab_size:  50257 -> 30000
#   n_positions:  1024 -> 514
#   n_ctx:        1024 -> 514
#   n_layer:        12 -> 6
config = dict(
    architectures=["GPT2LMHeadModel"],
    vocab_size=30000,
    n_positions=514,
    n_ctx=514,
    n_embd=768,
    n_layer=6,
    n_head=12,
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    summary_type="cls_index",
    summary_use_proj=True,
    summary_activation=None,
    summary_proj_to_labels=True,
    summary_first_dropout=0.1,
)

In [None]:
# Store the hyperparameters as config.json inside the tokenizer directory,
# which the training command passes as --config_name.
with open(tok_path/'config.json', 'w') as config_out:
    config_out.write(json.dumps(config))

In [None]:
# Resume only when an earlier run left checkpoints in the output directory.
# Passing --should_continue with no checkpoint present makes the stock
# run_language_modeling.py abort, so a first-time run has to omit the flag.
# NOTE(review): assumes the local script keeps the upstream `checkpoint-*`
# directory naming -- confirm against ./run_language_modeling.py.
resume_flag = "--should_continue" if any(output_model_path.glob("checkpoint-*")) else ""

# Training command; newlines are flattened to spaces so it fits on one shell line.
cmd = f"""
python ./run_language_modeling.py
--train_data_file {lm_train_file}
--output_dir {output_model_path}
--model_type gpt2
--eval_data_file {lm_valid_file}
--line_by_line
--config_name {tok_path}
--tokenizer_name {tok_path}
--do_train
--do_eval
--evaluate_during_training
--per_gpu_train_batch_size 16
--per_gpu_eval_batch_size 16
--learning_rate 1e-4
--num_train_epochs 12
--logging_steps 7180
--save_steps 7180
--seed 42
--overwrite_output_dir
{resume_flag}
""".replace("\n", " ")
# Flag left disabled:
#--save_total_limit 2
# logging_steps / save_steps are set to one epoch's worth of steps:
# target data: batch size 16, 204 steps per epoch, 12 epochs
# imslp data: batch size 16, 7180 steps per epoch, ? epochs

In [None]:
!echo {cmd} > train_lm_augmented.sh

In [None]:
# you may need to run this in a bash shell with the appropriate virtual environment
!./train_lm.sh

In [None]:
# Write the same hyperparameters as config.json in the trained model's
# directory, replacing any file of that name already there.
model_config_file = output_model_path/'config.json'
with model_config_file.open('w') as config_out:
    config_out.write(json.dumps(config))

In [None]:
# Plot the losses recorded in the training run's eval_results.txt.
plotLosses(output_model_path.joinpath('eval_results.txt'))

### Finetune Language Model

This section applies only to the LM trained on IMSLP data (`data_type = 'imslp'`); it finetunes that model on the target data.

In [None]:
# Output directory for the finetuned model.
finetuned_models_path = bpe_path.joinpath('models/gpt2_train-imslp_finetune-target_lm-augmented')
# Point the train/validation file names at the target data for finetuning.
# Note: this rebinds lm_train_file / lm_valid_file from the training section.
lm_train_file = bpe_path.joinpath('bpe_lm_target_train-augmented.txt')
lm_valid_file = bpe_path.joinpath('bpe_lm_target_valid-augmented.txt')

In [None]:
# Finetuning command: loads the model and tokenizer from output_model_path and
# trains on lm_train_file / lm_valid_file, writing to finetuned_models_path.
finetune_cli = [
    "python ./run_language_modeling.py",
    f"--train_data_file {lm_train_file}",
    f"--eval_data_file {lm_valid_file}",
    f"--output_dir {finetuned_models_path}",
    "--model_type gpt2",
    "--line_by_line",
    f"--model_name_or_path {output_model_path}",
    f"--tokenizer_name {output_model_path}",
    "--do_train",
    "--do_eval",
    "--evaluate_during_training",
    "--per_gpu_train_batch_size 16",
    "--per_gpu_eval_batch_size 16",
    "--learning_rate 5e-5",
    "--num_train_epochs 12",
    "--logging_steps 204",
    "--save_steps 204",
    "--seed 42",
    "--overwrite_output_dir",
]
# The empty first and last entries give the flattened command the same leading
# and trailing space a triple-quoted template would produce.
cmd = "\n".join(["", *finetune_cli, ""]).replace("\n", " ")
# Flags left disabled:
#--save_total_limit 2
#--should_continue

In [None]:
!echo {cmd} > train_lm-fine_tune.sh

In [None]:
# you may need to run this in a bash shell with different virtual environment
!./train_lm-fine_tune-debug.sh

In [None]:
# Plot the losses recorded in the finetuning run's eval_results.txt.
plotLosses(finetuned_models_path.joinpath('eval_results.txt'))