In [1]:
! pip install tiktoken


Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pwd

/content


In [3]:
! cp /content/drive/"My Drive"/"Colab Notebooks"/kleineGPT/review_card.py /content/review_card.py

In [4]:
! cp /content/drive/"My Drive"/"Colab Notebooks"/kleineGPT/model.py /content/model.py

In [5]:
import torch
import tiktoken
import pickle
from review_card import ReviewCard
from model import BigramLanguageModel
from dataclasses import dataclass

In [6]:
device ='cuda' if torch.cuda.is_available() else 'cpu'
review_handler = ReviewCard()

In [22]:
@dataclass
class GPTConfig:
    block_size: int = 16
    batch_size:int = 4
    vocab_size:int = 0
    n_layer: int = 1
    n_head: int = 1
    n_embd: int = 60
    dropout: float = 0.2
    learning_rate:float = 3e-2
    max_iters:int = 500
    eval_iters:int = 50
    eval_interval:int = 100
    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster



In [9]:
torch.manual_seed(1337)


<torch._C.Generator at 0x78bf5964b170>

In [10]:
text = review_handler.review_cralwer(page_size=20)



  html = BeautifulSoup(res.text)


In [10]:
text[116]

{'uuid': '65bfb5b25b51c20012d4665f',
 'name': 'Phumla Mathonsi',
 'topic': 'Great food and clean restaurant',
 'message': None,
 'date_of_experience': 'February 04, 2024',
 'date_of_review': 'Feb 4, 2024',
 'star': 'Rated 5 out of 5 stars'}

In [11]:
data  = '\n'.join(str(row['message']) for row in text)

In [12]:

chars=sorted(list(set(data)))
vocab_size = len(chars)
print(vocab_size)

128


In [13]:
# tokenise
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode =lambda s:[stoi[c] for c in s] #encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) #encoder: take a integer, output a string


data_enc = torch.tensor(encode(data), dtype=torch.long)

n = int(0.9 * len(data_enc)) # 90% will be train, rest val

train = data_enc[:n]
val = data_enc[n:]

In [13]:
# encoding with tiktoken
# enc =tiktoken.get_encoding('gpt2')
# data_enc = torch.tensor(enc.encode_ordinary(data), dtype=torch.long)

In [14]:
# n = int(0.9 * len(data_enc)) # 90% will be train, rest val

In [15]:

# train = data_enc[:n]
# val = data_enc[n:]

In [14]:
# data loading
def get_batch(split:str, config):
    data = train if split == 'train' else val
    ix = torch.randint(len(data)-config.block_size,(config.batch_size,))
    x = torch.stack([data[i:i+ config.block_size] for i in ix])
    y = torch.stack([data[i+1:i+config.block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    return x,y


In [15]:
@torch.no_grad()
def estimate_loss(model, config):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(config.eval_iters)
        for k in range(config.eval_iters):
            X,Y =get_batch(split, config)
            logits, loss = model(X,Y)
            losses[k] = loss.item()
        out[split] =losses.mean()
    model.train()
    return out


In [16]:
config_meta = {}
config_meta['vocab_size'] = vocab_size if vocab_size else 50304

In [17]:
def training():
    config = GPTConfig(**config_meta)
    model = BigramLanguageModel(config)
    m = model.to(device)
    # Create pytorch optimiser
    optimizer =torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    for iter in range(config.max_iters):

        # every oncein a while evaluate the loss on train and val sets
        if iter % config.eval_interval ==0 or iter == config.max_iters -1:
            losses = estimate_loss(model, config)
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # sample a batch of data
        xb, yb = get_batch(split = "train", config=config)
        logits, loss =  model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    return m

In [26]:
torch.cuda.empty_cache()

In [23]:
m = training()

step 0: train loss 5.0672, val loss 5.0523
step 100: train loss 3.0282, val loss 3.0275
step 200: train loss 2.9662, val loss 3.0702
step 300: train loss 3.0497, val loss 3.1800
step 400: train loss 3.0633, val loss 3.1675
step 499: train loss 3.1482, val loss 3.1017


In [24]:
model_pkl_file = "/content/drive/My Drive/Colab Notebooks/kleineGPT/model-gpt-01.pkl"

In [25]:
with open(model_pkl_file, 'wb') as file:
    pickle.dump(m, file)