In [2]:
from minigpt.utils import set_seed
set_seed(41)

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
import math
from torch.utils.data import Dataset

class CharDataset(Dataset):

    def __init__(self, data, block_size):
        chars = sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))
        
        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data
    
    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [5]:
block_size = 128 # spatial extent of the model for its context

In [6]:
import pandas as pd

In [7]:
file = pd.read_csv('startups.csv')
file.columns

Index(['name', 'city', 'tagline', 'description'], dtype='object')

In [7]:
len(file)

42038

In [9]:
description = file['description'][:5000]

In [10]:
description = description.drop_duplicates().dropna()

In [11]:
len(description)

4520

In [12]:
description = '\n '.join(description.tolist())

In [13]:
# text = open('input.txt', 'r').read() # don't worry we won't run out of file handles
train_dataset = CharDataset(description, block_size)

data has 845680 characters, 128 unique.


In [14]:
from minigpt.model import GPT, GPTConfig
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=8, n_head=8, n_embd=512)
model = GPT(mconf)

In [21]:
from minigpt.trainer import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
tconf = TrainerConfig(max_epochs=3, batch_size=128, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=512*20, final_tokens=200*len(train_dataset)*block_size,
                      num_workers=4)
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

epoch 1 iter 6605: train loss 0.22880. lr 5.999630e-04: 100%|██████████| 6606/6606 [1:06:39<00:00,  1.65it/s]
epoch 2 iter 6605: train loss 0.17210. lr 5.998520e-04: 100%|██████████| 6606/6606 [1:06:48<00:00,  1.65it/s]
epoch 3 iter 2220: train loss 0.16491. lr 5.997980e-04:  34%|███▎      | 2221/6606 [22:25<44:23,  1.65it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

epoch 3 iter 6605: train loss 0.16023. lr 5.996670e-04: 100%|██████████| 6606/6606 [1:06:43<00:00,  1.65it/s]


In [25]:
set_seed(41)

<minigpt.trainer.Trainer at 0x7f84c4673f28>

In [28]:
from minigpt.utils import sample

context = " "
x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)
y = sample(model, x, 200, temperature=1.0, sample=True, top_k=10)[0]
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print(completion)

 whether they can publish and reach their consumer specific communications simply, directly add systemantic tools, ...
 In the wakes of the supply dominated ...
 During property search, we ask:
“How fa


In [41]:
torch.save(model.state_dict(), 'model')

In [15]:
m = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=8, n_head=8, n_embd=512)
m = GPT(mconf)

In [17]:
m.load_state_dict(torch.load('model'))

<All keys matched successfully>

In [22]:
from minigpt.trainer import Trainer, TrainerConfig

In [23]:
trainer = Trainer(m, train_dataset, None, tconf)

NameError: name 'tconf' is not defined

In [20]:
m

GPT(
  (tok_emb): Embedding(128, 512)
  (drop): Dropout(p=0.1, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (attn): CausalSelfAttention(
        (key): Linear(in_features=512, out_features=512, bias=True)
        (query): Linear(in_features=512, out_features=512, bias=True)
        (value): Linear(in_features=512, out_features=512, bias=True)
        (attn_drop): Dropout(p=0.1, inplace=False)
        (resid_drop): Dropout(p=0.1, inplace=False)
        (proj): Linear(in_features=512, out_features=512, bias=True)
      )
      (mlp): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): GELU()
        (2): Linear(in_features=2048, out_features=512, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
    )
    (1): Block(
      (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      

In [None]:
m.

In [19]:
from minigpt.utils import sample

In [None]:
context = " "
x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)
y = sample(model, x, 200, temperature=1.0, sample=True, top_k=10)[0]
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print(completion)