In [1]:
import os
import json
from model_functions import PaddedDataset, trainer_gpt2_transformer

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer


# Input locations: tokenizer vocab/merges files and the per-model song data.
PATH_VOCAB = "../0_data/5_vocabs"
PATH_WORD_DATA = "../0_data/6_word_data"
# Output locations: the stats workbook lives in PATH_MODELS, loss plots and
# saved models go into the two sub-directories below.
PATH_MODELS = "../0_data/7_models"
PATH_MODELS_LOSS = "../0_data/7_models/loss"
PATH_MODELS_CONFIG = "../0_data/7_models/config"

# Make sure every output directory exists. `exist_ok=True` replaces the
# separate existence check, so re-running the cell is safe and there is no
# window between "check" and "create" in which the call could fail.
for path in [PATH_MODELS, PATH_MODELS_LOSS, PATH_MODELS_CONFIG]:
    os.makedirs(path, exist_ok=True)

In [2]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [3]:
# Load the table of model hyperparameters and results; the sheet column
# labelled "Unnamed: 0" is used as the row index.
model_df = pd.read_excel(
    io=f"{PATH_MODELS}/comparison_model_stats.xlsx",
    index_col="Unnamed: 0",
)
model_df

Unnamed: 0,name,max_length,emb_dim,attention_heads,layers,dropout,learning_rate,epochs,batch_size,ran,runtime,runtime_min,min_loss,at_epoch,incorrect_notes,correct_notes,correct_rate
0,a1,256,256,4,6,0.01,0.001,2,4,yes,13.095,0.22,2.16636,2,0.6,81.0,0.99
1,a2,256,256,4,6,0.01,0.001,2,4,yes,12.594,0.21,2.140638,2,0.8,69.6,0.99
2,a3,256,256,4,6,0.01,0.001,2,4,yes,12.4513,0.21,2.050499,2,1.4,79.2,0.98
3,b,256,256,4,6,0.01,0.001,2,4,yes,11.8954,0.2,2.046405,2,0.0,80.4,1.0
4,c,256,256,4,6,0.01,0.001,2,4,yes,12.1114,0.2,2.138924,2,0.2,80.4,1.0
5,d,256,256,4,6,0.01,0.001,2,4,yes,11.6138,0.19,1.815254,2,1.0,71.8,0.99


In [4]:
# Train every model listed in `model_df` that is not yet marked as run and
# write its runtime and loss statistics back into the same row of `model_df`.
for index, row in model_df.iterrows():

    # only train models that have not been run yet
    if row["ran"] == "yes":
        continue

    # create the per-model output directories
    model_name = row["name"]
    model_dirs = {
        "loss": f"{PATH_MODELS_LOSS}/{model_name}",
        "out": f"{PATH_MODELS_CONFIG}/{model_name}"
    }
    for model_dir in model_dirs.values():
        # exist_ok makes re-runs safe without a separate existence check
        os.makedirs(model_dir, exist_ok=True)

    # collect the hyperparameters of this row as a dictionary
    hyperparameter_names = (
        "max_length",
        "emb_dim",
        "attention_heads",
        "layers",
        "dropout",
        "learning_rate",
        "epochs",
        "batch_size",
    )
    model_hyperparameters = {name: row[name] for name in hyperparameter_names}

    # create the tokenizer from the model-specific vocab and the shared merges file
    tokenizer = GPT2Tokenizer(
        vocab_file=f"{PATH_VOCAB}/vocab_{model_name}.json",
        merges_file=f"{PATH_VOCAB}/merges.txt")
    tokenizer.add_special_tokens({'pad_token': 'PAD', 'bos_token': 'BOS', 'eos_token': 'EOS',})

    # read in the data: every value of the JSON object is joined with spaces
    # into one string per song
    with open(f"{PATH_WORD_DATA}/{model_name}_data.json", 'r', encoding="utf-8") as fp:
        json_data = json.load(fp)
    data = [" ".join(song) for song in json_data.values()]
    # the first 90% of the songs are used for training, the rest for evaluation
    split_train_test = int(0.9*len(data))

    # create datasets and define data collator
    train_dataset = PaddedDataset(tokenizer=tokenizer, data=data[:split_train_test], max_length=row["max_length"])
    eval_dataset = PaddedDataset(tokenizer=tokenizer, data=data[split_train_test:], max_length=row["max_length"])
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    model_data = {
        "train_dataset": train_dataset,
        "eval_dataset": eval_dataset,
        "data_collator": data_collator,
    }

    # create and train model trainer
    trainer = trainer_gpt2_transformer(
        hyperparameters = model_hyperparameters,
        tokenizer = tokenizer,
        data = model_data,
        dirs = model_dirs,
    )
    trainer.train()

    # Extract losses and runtime from the log history. Entries are selected by
    # the key they carry rather than by even/odd position, so the result does
    # not depend on training and evaluation logs strictly alternating.
    log_hist = trainer.state.log_history
    eval_loss = [entry["eval_loss"] for entry in log_hist if "eval_loss" in entry]
    train_loss = [entry["loss"] for entry in log_hist if "loss" in entry]
    runtime = next(
        entry["train_runtime"] for entry in reversed(log_hist) if "train_runtime" in entry
    )

    # save runtime and model loss
    model_df.at[index,"runtime"] = runtime
    model_df.at[index,"runtime_min"] = round(runtime / 60, 2)
    model_df.at[index,"min_loss"] = min(eval_loss)
    # 1-based position of the lowest evaluation loss; this equals the epoch
    # only if evaluation runs once per epoch -- TODO confirm in trainer_gpt2_transformer
    model_df.at[index,"at_epoch"] = int(np.argmin(eval_loss)) + 1

    # save and show the loss plot
    fig, ax = plt.subplots()
    ax.plot(train_loss, color="blue", label="train loss")
    ax.plot(eval_loss, color="orange", label="eval loss")
    ax.set_title(f"Loss of model {model_name}")
    ax.set_xlabel("log entry")
    ax.set_ylabel("loss")
    ax.legend()
    fig.savefig(f"{model_dirs['loss']}/loss_graph_{model_name}.jpg")
    plt.show()
    # release the figure so figures do not pile up across loop iterations
    plt.close(fig)

    # save model and set ran to yes
    trainer.save_model(f"{model_dirs['out']}/end_version")
    model_df.at[index,"ran"] = "yes"

model_df

Unnamed: 0,name,max_length,emb_dim,attention_heads,layers,dropout,learning_rate,epochs,batch_size,ran,runtime,runtime_min,min_loss,at_epoch,incorrect_notes,correct_notes,correct_rate
0,a1,256,256,4,6,0.01,0.001,2,4,yes,13.095,0.22,2.16636,2,0.6,81.0,0.99
1,a2,256,256,4,6,0.01,0.001,2,4,yes,12.594,0.21,2.140638,2,0.8,69.6,0.99
2,a3,256,256,4,6,0.01,0.001,2,4,yes,12.4513,0.21,2.050499,2,1.4,79.2,0.98
3,b,256,256,4,6,0.01,0.001,2,4,yes,11.8954,0.2,2.046405,2,0.0,80.4,1.0
4,c,256,256,4,6,0.01,0.001,2,4,yes,12.1114,0.2,2.138924,2,0.2,80.4,1.0
5,d,256,256,4,6,0.01,0.001,2,4,yes,11.6138,0.19,1.815254,2,1.0,71.8,0.99


In [5]:
# Write the updated statistics table back to the workbook it was loaded from.
model_df.to_excel(excel_writer=f"{PATH_MODELS}/comparison_model_stats.xlsx")