In [1]:
import pandas as pd
import numpy as np
import pickle
import torch
import torch.nn as nn
from torch.nn import functional as F
from model import *

In [2]:
# Load the pre-encoded token streams. Each .bin file is read as a flat array of
# int16 values and widened to torch.long (the tensors are later sliced into
# model inputs/targets by get_batch).
train_data, val_data = (
    torch.tensor(np.fromfile(path, dtype=np.int16), dtype=torch.long)
    for path in ('train.bin', 'val.bin')
)

In [3]:
# data loading
def get_batch(split):
    """Draw a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of tensors moved to ``device``. Each holds
        ``batch_size`` windows of ``block_size`` consecutive tokens; ``y`` is
        the same window as ``x`` shifted one position to the right, i.e. the
        next-token target for every input position.
    """
    source = val_data if split != 'train' else train_data
    # The exclusive upper bound leaves room for the one-token shift of the targets.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with ``get_batch``
    and passed through ``model`` with gradients disabled; the per-batch losses
    are averaged.

    The model is switched to eval mode for the duration of the estimate and is
    then returned to whatever mode it was in on entry (rather than being forced
    into train mode), even if drawing a batch or the forward pass raises.

    Returns:
        dict with keys 'train' and 'val', each mapping to a 0-dim tensor
        holding the mean loss for that split.
    """
    was_training = model.training
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                # model returns (logits, loss); only the loss is needed here.
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the caller's mode instead of unconditionally enabling training.
        model.train(was_training)
    return out

In [4]:
%%time

model = GPTLanguageModel()
m = model.to(device)

# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

num_steps = 10000

# Track the weights with the lowest estimated validation loss. Validation loss
# can start rising while training loss keeps falling (overfitting), in which
# case the weights left after the last step are not the ones that generalize best.
best_val_loss = float('inf')
best_step = -1
best_state = None

for step in range(num_steps):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == num_steps - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # Snapshot the weights whenever the validation estimate improves.
        # Tensors are cloned because state_dict() returns references that the
        # optimizer keeps updating in place.
        val_loss = float(losses['val'])
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_step = step
            best_state = {name: t.detach().clone() for name, t in model.state_dict().items()}

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Leave `model` (and `m`, the same object) holding the best-validation weights
# so that whatever is saved or sampled afterwards is not the overfit final state.
# NOTE: the optimizer state still corresponds to the last step, not best_step.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"restored weights from step {best_step} (val loss {best_val_loss:.4f})")

1.216384 M parameters
step 0: train loss 5.0620, val loss 5.0573
step 50: train loss 1.9504, val loss 1.9169
step 100: train loss 1.8541, val loss 1.8315
step 150: train loss 1.7959, val loss 1.7812
step 200: train loss 1.7571, val loss 1.7519
step 250: train loss 1.7276, val loss 1.7228
step 300: train loss 1.6996, val loss 1.7089
step 350: train loss 1.6800, val loss 1.6869
step 400: train loss 1.6615, val loss 1.6746
step 450: train loss 1.6485, val loss 1.6587
step 500: train loss 1.6363, val loss 1.6529
step 550: train loss 1.6230, val loss 1.6383
step 600: train loss 1.6121, val loss 1.6320
step 650: train loss 1.6059, val loss 1.6295
step 700: train loss 1.5969, val loss 1.6286
step 750: train loss 1.5904, val loss 1.6280
step 800: train loss 1.5869, val loss 1.6250
step 850: train loss 1.5836, val loss 1.6166
step 900: train loss 1.5767, val loss 1.6149
step 950: train loss 1.5771, val loss 1.6135
step 1000: train loss 1.5686, val loss 1.6140
step 1050: train loss 1.5653, val l

step 8950: train loss 1.1666, val loss 1.8135
step 9000: train loss 1.1655, val loss 1.8115
step 9050: train loss 1.1663, val loss 1.8051
step 9100: train loss 1.1706, val loss 1.7941
step 9150: train loss 1.1617, val loss 1.8073
step 9200: train loss 1.1616, val loss 1.8177
step 9250: train loss 1.1630, val loss 1.8125
step 9300: train loss 1.1596, val loss 1.8042
step 9350: train loss 1.1598, val loss 1.8041
step 9400: train loss 1.1495, val loss 1.8378
step 9450: train loss 1.1441, val loss 1.8288
step 9500: train loss 1.1522, val loss 1.8096
step 9550: train loss 1.1473, val loss 1.8316
step 9600: train loss 1.1473, val loss 1.8287
step 9650: train loss 1.1444, val loss 1.8390
step 9700: train loss 1.1425, val loss 1.8398
step 9750: train loss 1.1445, val loss 1.8484
step 9800: train loss 1.1385, val loss 1.8448
step 9850: train loss 1.1377, val loss 1.8386
step 9900: train loss 1.1376, val loss 1.8283
step 9950: train loss 1.1323, val loss 1.8435
step 9999: train loss 1.1303, val 

In [5]:
# Persist the trained network to disk.
# NOTE(review): this saves the whole module object rather than its state_dict,
# so loading it back presumably requires the model class from `model` to be
# importable — confirm against whatever consumes this file.
torch.save(obj=m, f='SmallMusicModel.pth')