In [1]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [2]:
from transformers import pipeline, set_seed
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken
import numpy as np
import math

In [3]:
learning_rate = 3e-4
num_epochs = 10
top = 1000000
dropout_rate: float = 0.1

In [4]:
def get_device():
    return torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
from dataclasses import dataclass

@dataclass
class GPT2Config:
    block_size: int = 0
    vocab_size: int = 0
    n_embd: int = 0
    n_layer: int = 0
    n_head: int = 0

In [6]:
def scaled_dot_product_attention(q,k,v,mask=None):
    d_k = q.size(-1)
    qk = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
    if mask is not None:
        qk = qk.permute(1, 0, 2, 3) + mask
        qk = qk.permute(1, 0, 2, 3)
    qk = F.softmax(qk, dim=-1)
    new_qkv = torch.matmul(qk, v)
    return new_qkv

class Multihead_Self_Attention(nn.Module):
    def __init__(self, config):
        super(Multihead_Self_Attention, self).__init__()
        self.n_embd = config.n_embd
        self.n_head = config.n_head
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.softmax = nn.Softmax(dim=-1)
        self.std_scaler = 1

    def forward(self,x,mask=None):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True) # flash attention
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

In [7]:
class mlp(nn.Module):
    def __init__(self,config):
        super(mlp,self).__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.activation = nn.GELU(approximate='tanh')
        self.std_scaler = 1

    def forward(self,x):
        x = self.c_fc(x)
        x = self.activation(x)
        x = self.c_proj(x)
        return x

In [8]:
class Block(nn.Module):
    def __init__(self,config):
        super(Block, self).__init__()
        self.attn = Multihead_Self_Attention(config)
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.mlp = mlp(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.dropout = nn.Dropout(dropout_rate)
        self.std_scaler = 1
    def forward(self, x):

        resdual_x = x
        x = self.ln_1(x)
        x = self.attn(x) + resdual_x

        resdual_x = x
        x = self.ln_2(x)
        x = self.mlp(x) + resdual_x
        return x

In [9]:
class myGPT(nn.Module):
    def __init__(self, config):
        super(myGPT, self).__init__()
        self.config = config
        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embd),
            'wpe': nn.Embedding(config.block_size, config.n_embd),
            'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            'ln_f': nn.LayerNorm(config.n_embd)
        })
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # weight sharing
        self.transformer['wte'].weight = self.lm_head.weight
        # apply weight initialization
        self.apply(self.initionalization)

    def initionalization(self,model):
        std_linear = 0.02
        std_embedding = 0.01
        if hasattr(model, 'std_scaler'):
            std_linear = (2 * self.config.n_layer) ** -0.5
            std_embedding = (2 * self.config.n_layer) ** -0.5
        if isinstance(model,nn.Linear):
            nn.init.normal_(model.weight, mean = 0,std = std_linear)
            if model.bias is not None:
                nn.init.zeros_(model.bias)
        elif isinstance(model,nn.Embedding):
            nn.init.normal_(model.weight,mean=0,std=std_embedding) # following the offical openAI implementation

    def forward(self, x,targets = None):
        B, T = x.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and posisition embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=x.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(x) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb

        for block in self.transformer.h:
            x = block(x)

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPT2Config(**config_args)
        model = myGPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])
        return model

In [17]:
class DataLoader:
    def __init__(self, batch_size, block_size, percentage=100):
        self.batch_size = batch_size
        self.block_size = block_size
        self.pointer = 0
        self.data = ''

        with open('combined_texts.txt', 'r', encoding='utf-8', errors='ignore') as file:
            self.data = file.read()

        # Calculate the number of characters to load based on the percentage
        total_chars = len(self.data)
        chars_to_load = int((percentage / 100) * total_chars)

        # Adjust the data to the desired percentage
        self.data = self.data[:chars_to_load]

        self.tokens = tiktoken.get_encoding('gpt2').encode(self.data)
        self.n_batches = len(self.tokens) // (self.batch_size * self.block_size)

    def next_batch(self):
        start = self.pointer
        end = start + self.batch_size * self.block_size
        if end + 1 > len(self.tokens):
            raise IndexError("End of data reached")

        mini_tokens = self.tokens[start:end + 1]
        x = torch.tensor(mini_tokens[:-1], dtype=torch.long).view(self.batch_size, self.block_size)
        y = torch.tensor(mini_tokens[1:], dtype=torch.long).view(self.batch_size, self.block_size)
        self.pointer += self.batch_size * self.block_size
        return x, y

In [13]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True
Block_size = 512
Batch_size = 12
config = GPT2Config(block_size=Block_size, vocab_size=50304, n_embd=768, n_layer=12, n_head=12)
model = myGPT(config)
model = model.to(get_device())
model = torch.compile(model)
x,y = DataLoader(batch_size=Batch_size, block_size=Block_size).next_batch()
x,y = x.to(get_device()), y.to(get_device())
logits,loss = model.forward(x,y)
print(logits.size(), loss) # I am expecting to have a loss equal to the cross entropy loss which is -log(probability)
                            # where each word follows uniform distription so the probability should be 1/vocb_size = 1/50257 = 0.0000199
                            # so the loss should be -log(0.0000199) = 10.8

torch.Size([8, 512, 50304]) tensor(11.0770, device='cuda:0', grad_fn=<CompiledFunctionBackward>)


In [21]:
Batch_size = 12
torch.set_float32_matmul_precision('high')
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
num_epochs = 1
for j in range(num_epochs):
  dataloader = DataLoader(Batch_size,Block_size,100)
  print(dataloader.n_batches)
  for i in range(dataloader.n_batches - 1):
      optimizer.zero_grad()
      x,y = dataloader.next_batch()
      x,y = x.to(get_device()), y.to(get_device())
      logits, loss = model(x, y)
      loss.backward()
      optimizer.step()
      print(f"epoch: {j+1}, iteratation {i}, loss: {loss.item()}")

16217
epoch: 1, iteratation 0, loss: 3.241798162460327
epoch: 1, iteratation 1, loss: 2.5473694801330566
epoch: 1, iteratation 2, loss: 2.6317977905273438
epoch: 1, iteratation 3, loss: 2.653512716293335
epoch: 1, iteratation 4, loss: 2.6229326725006104
epoch: 1, iteratation 5, loss: 2.6912708282470703
epoch: 1, iteratation 6, loss: 2.5843350887298584
epoch: 1, iteratation 7, loss: 2.649233341217041
epoch: 1, iteratation 8, loss: 2.5634894371032715
epoch: 1, iteratation 9, loss: 2.7243590354919434
epoch: 1, iteratation 10, loss: 2.595670223236084
epoch: 1, iteratation 11, loss: 2.680663585662842
epoch: 1, iteratation 12, loss: 2.6017074584960938
epoch: 1, iteratation 13, loss: 2.689971446990967
epoch: 1, iteratation 14, loss: 2.5787689685821533
epoch: 1, iteratation 15, loss: 2.602816581726074
epoch: 1, iteratation 16, loss: 2.581268787384033
epoch: 1, iteratation 17, loss: 2.6537787914276123
epoch: 1, iteratation 18, loss: 2.921766996383667
epoch: 1, iteratation 19, loss: 2.7190480232

KeyboardInterrupt: 

In [24]:
# Tokenizer setup
gpt_encoder = tiktoken.get_encoding('gpt2')
tokens = gpt_encoder.encode("تم اليوم")
tokens = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)
tokens = tokens.repeat(Batch_size, 1)

# Sampling loop
sample_rng = torch.Generator(device=get_device())
sample_rng.manual_seed(42)

while tokens.size(1) < Block_size:
    with torch.no_grad():
        tokens = tokens.to(get_device())
        logits, _ = model(tokens)
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)

        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        ix = torch.multinomial(topk_probs, 1, generator=sample_rng).to(get_device())
        xcol = torch.gather(topk_indices, -1, ix)
        tokens = torch.cat((tokens, xcol), dim=1)

# Decode and print generated sequences
generated_text = ''
for i in range(Batch_size):
    generated_tokens = tokens[i, :].tolist()
    generated_text = gpt_encoder.decode(generated_tokens)
print(generated_text)

تم اليومياناتون للمنتخبات السابعة على الملاعبين، الجدارة الامريكية دفيلل اموندين في التأخر التنس خلال الأولمبيات المحفظة الى منصبه في تاريخه، والمفارقة في مطلع الجامايكه للغاية على بداية الإنجليزية السبت في اتلتيكو مدريد الجماعية مع الأرجنتيني الميزاي، ويأمل جديد الفتار من خلرا منحانا الاوقت من هزت الجار والحسن في المستعلين. واعتبر موصلة مدينة أعلن على انقل ان جاكسه في مدينة بلماية المستوى مدة 8 مرات انها انجلترا. وأشاد شجار الاعتباؤ اللقاء والحر هو تقيدها اللعب بالوالدة الى البلاحلية التي انفرد فعلت الكرة المتزايد بالمك منذ بدرات للا


In [26]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
torch.save(model.state_dict(), 'drive/MyDrive/model_weights.pth')