In [1]:
import torch
import datetime
import sys
import os
import torch.nn.functional as F
import numpy as np
import pandas as pd

from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from mini_GPT import GPTLanguageModel, InputDataset, printlog, try_gpu

In [2]:
# --- Run configuration -------------------------------------------------------
# Seed every RNG this notebook relies on so that weight initialisation,
# DataLoader shuffling and dropout are repeatable across kernel restarts.
SEED = 1337
torch.manual_seed(SEED)
np.random.seed(SEED)

# Data / optimisation settings.
batch_size = 128      # sequences per batch
block_size = 256      # context length handed to InputDataset and the model
eval_interval = 500
learning_rate = 3e-4  # AdamW step size
eval_iters = 200

# Device selected by the project helper; printed so the run log records it.
device = try_gpu()
print(device)

# Model architecture settings passed to GPTLanguageModel.
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

cuda:0


In [3]:
# Read the whole training corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Character-level vocabulary: every distinct character, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(vocab_size)

# Lookup tables between integer ids and characters (id == position in `chars`).
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids `l`."""
    return ''.join(itos[i] for i in l)


# Train and test splits
first_char = text[0]
print(len(text), len(first_char), first_char)
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))  # first 90% of the ids train, the remainder validates
train_data, val_data = data[:n], data[n:]

65
1115394 1 F


In [4]:
print(len(train_data))

# Wrap each split in the project's InputDataset using the configured context
# length (train first, then validation).
train_dataset, val_dataset = (
    InputDataset(split, block_size) for split in (train_data, val_data)
)

# Training batches are reshuffled on every pass; validation keeps dataset order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
)

1003854


In [5]:
# Drop the reference to any model left over from a previous run of this cell
# before building a new one (presumably so its memory can be reclaimed first).
model = None
model = GPTLanguageModel(
    vocab_size, n_embd, n_head, n_layer, block_size, dropout, device
)
m = model.to(device)

# print the number of parameters in the model
param_count = sum(p.numel() for p in m.parameters())
print(param_count / 1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

10.788929 M parameters


In [6]:
# Directory and file used to persist the model weights between runs.
ckpt_path_dir = './GPT_mini_model'
# exist_ok=True makes this idempotent and avoids the check-then-create race.
os.makedirs(ckpt_path_dir, exist_ok=True)

ckpt_path = os.path.join(ckpt_path_dir, 'mini_gpt.pt')
print(ckpt_path)

# Resume from a previously saved state dict when one is present.
if os.path.exists(ckpt_path):
    print('Load trained model')
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(ckpt_path, map_location=device))

./GPT_mini_model\mini_gpt.pt
Load trained model


In [8]:
# Sanity check: loss of the current weights on one training batch.
batch = next(iter(train_dataloader))
test_x, test_y = (tensor.to(device) for tensor in batch)

# Evaluation mode and no autograd bookkeeping for this single forward pass.
model.eval()
with torch.no_grad():
    _, loss_ = model(test_x, test_y)
loss_

tensor(0.0905, device='cuda:0')

In [9]:
# Drop the references created by the sanity-check cell so the tensors they
# pointed to become collectable.
test_x, test_y, _, loss_ = None, None, None, None

# Hand unused cached blocks back to the CUDA driver. A single call releases
# everything that is releasable; repeating it in a loop has no further effect.
torch.cuda.empty_cache()

In [13]:
# Early-stopping settings read by the training loop:
#   monitor  - key in `history` whose values are tracked
#   mode     - 'min' selects the best epoch with argmin, anything else argmax
#   patience - epochs without a new best that are tolerated before stopping
monitor, mode, patience = 'val_loss', 'min', 10

In [14]:
def _run_epoch(dataloader, loss_name, train):
    """Run one full pass over `dataloader` and return the mean per-batch loss.

    Parameters
    ----------
    dataloader : iterable with ``len()``, yielding ``(X, Y)`` batches.
    loss_name : str
        Key shown in the progress-bar postfix ('train_loss' / 'val_loss').
    train : bool
        True  -> training mode, gradients enabled, optimizer stepped per batch.
        False -> evaluation mode, gradients disabled, weights untouched.

    Raises
    ------
    RuntimeError
        If the dataloader yields no batches (the mean would be undefined).
    """
    model.train(train)
    total_loss, step = 0.0, 0
    n_batches = len(dataloader)
    loop = tqdm(dataloader, total=n_batches)
    with torch.set_grad_enabled(train):
        for X, Y in loop:
            X = X.to(device)
            Y = Y.to(device)
            # The first output (logits) is not needed here; not binding it to
            # a named variable lets it be freed right away.
            _, loss = model(X, Y)
            if train:
                # Clear gradients *before* backward so that a previously
                # interrupted run cannot leak stale gradients into this step.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            step += 1
            # Show the running batch loss; on the final batch show the epoch mean.
            shown = total_loss / step if step == n_batches else batch_loss
            loop.set_postfix(**{loss_name: shown})
    if step == 0:
        raise RuntimeError(f"dataloader for {loss_name} produced no batches")
    return total_loss / step


epochs = 20
# NOTE(review): history starts empty on every run of this cell, so the first
# epoch always counts as "best" and overwrites any checkpoint loaded earlier.
history = {}
for epoch in range(epochs):
    printlog("Epoch {0} / {1}".format(epoch, epochs))

    ## train
    train_loss = _run_epoch(train_dataloader, 'train_loss', train=True)
    ## validation
    val_loss = _run_epoch(val_dataloader, 'val_loss', train=False)

    # Record the epoch only once both phases have finished, so every column
    # of `history` always has the same length.
    epoch_log = {'train_loss': train_loss, 'val_loss': val_loss, 'epoch': epoch}
    for name, metric in epoch_log.items():
        history.setdefault(name, []).append(metric)

    ## early-stopping
    arr_scores = history[monitor]
    best_score_idx = int(np.argmin(arr_scores) if mode == 'min' else np.argmax(arr_scores))
    if best_score_idx == len(arr_scores) - 1:
        # The epoch that just finished is the best so far: persist its weights.
        print('update param')
        torch.save(model.state_dict(), ckpt_path)
        print(f"<<<<<< reach best {monitor}: {arr_scores[best_score_idx]} >>>>>>", file=sys.stderr)
    if len(arr_scores) - best_score_idx > patience:
        print(f"<<<<<< {monitor} without improvement in {patience} epoch, early stopping >>>>>>", file=sys.stderr)
        break

# Restore the best weights once training has ended, whether it ran all epochs
# or stopped early. Doing this inside the loop would roll the model back after
# every non-improving epoch and would be skipped entirely on early stop.
if os.path.exists(ckpt_path):
    model.load_state_dict(torch.load(ckpt_path, map_location=device))


Epoch 0 / 20



 72%|███████▏  | 5609/7841 [53:09<19:12,  1.94it/s, train_loss=0.234]  

In [24]:
# Tabulate the per-epoch metrics (one column per history key) and persist
# them as CSV without the index column.
dfhistory = pd.DataFrame(data=history)
dfhistory.to_csv(
    './mini_gpt_train_record.csv',
    index=False,
)

In [10]:
# generate from the model
# Prompt with the first validation batch. The alternative of a single zero
# token would be:
# context = torch.zeros((1, 1), dtype=torch.long, device=device)
batch = next(iter(val_dataloader))
context, _ = batch
context = context.to(device)

pred_step = 50  # number of new tokens to generate per sequence
print(pred_step)

model.eval()
with torch.no_grad():
    out = model.generate(context, max_new_tokens=pred_step)

# Decode each generated id sequence (prompt plus continuation) back into text.
res = [decode(token_ids) for token_ids in out.tolist()]

# Release the generated tensor and return cached GPU blocks to the driver.
out = None
torch.cuda.empty_cache()

50


In [11]:
# Reference text for each prompt: the `pred_step` characters of the corpus
# starting at offset n + block_size + i for the i-th prompt.
# NOTE(review): this lines up with the prompts only if the i-th validation
# sample is val_data[i : i + block_size] - confirm against InputDataset.
first_target = n + block_size
true_res = [
    decode(data[start:start + pred_step].tolist())
    for start in range(first_target, first_target + len(context))
]
print(len(true_res[0]), type(true_res[0]))

50 <class 'str'>


In [12]:
# Side-by-side look at the first sample: the generated continuation (everything
# after the block_size-long prompt) followed by the true continuation.
generated_tail = res[0][block_size:]
expected_tail = true_res[0]

print(len(generated_tail), '\n', generated_tail)
print('\n======================================\n')
print(len(expected_tail), '\n', expected_tail)

50 
 rina,
know your men remain and my heart of them,
t


50 
 rina.

GREMIO:
You are too blunt: go to it orderly
