In [1]:
#| default_exp GPT

In [2]:
#| echo false
#| export
from httpx import get as hget
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.set_printoptions(linewidth=150)

Hyperparameters

In [3]:
#| export
# Training configuration shared by the data loader, the model and the training loop.
batch_size = 32 # how many independent sequences will be processed in parallel
block_size = 8 # maximum context length for predictions
max_iters = 5_000 # number of optimisation steps run by the training loop
eval_interval = 500 # the training loop reports train/val loss every `eval_interval` steps
lr = 1e-3 # learning rate handed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
eval_iters = 200 # number of batches averaged per split in `estimate_loss`
n_embd = 32 # width of the token and position embeddings

torch.manual_seed(1337); # fixed seed so that runs are repeatable

Here we download a sample dataset of Shakespeare's text

In [4]:
#| export
# Download the Shakespeare sample text that the model is trained on.
f = hget("https://raw.githubusercontent.com/karpathy/ng-video-lecture/refs/heads/master/input.txt")
# Fail loudly on a 4xx/5xx response instead of silently using an error page as the corpus.
f.raise_for_status()
text = f.text

In [5]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [6]:
# let's look at the first 500 characters
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


Here are all the unique characters that occur in this text

In [7]:
#| export
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

In [8]:
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


Create a mapping from characters to integers

In [9]:
#| export
stoi = {ch: i for i, ch in enumerate(chars)} # character -> integer id
itos = dict(enumerate(chars)) # integer id -> character

def encode(s):
    "Encoder: take a string and output a list of integers"
    return [stoi[c] for c in s]

def decode(l):
    "Decoder: take a list of integers and output a string"
    return ''.join(itos[o] for o in l)

In [10]:
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


Let's encode the whole text dataset and store it into a torch.Tensor

In [11]:
#| export
# Encode the whole corpus once as a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)

In [12]:
print(data.shape, data.dtype)
print(data[:100]) # the first 100 of the characters we looked at earlier will look to GPT like this

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44, 53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52,
        63,  1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1, 57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43,
        39, 49,  6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


Split data into training (90%) and validation sets

In [13]:
#| export
# Train on the first 90% of the token stream and hold out the remainder for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [14]:
print(len(train_data))
print(len(val_data))

1003854
111540


In [15]:
# Same value as in the hyperparameters cell; repeated here only for the illustration below.
block_size = 8
# A chunk of block_size + 1 tokens is what the next cell turns into block_size (context, target) pairs.
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [16]:
# x holds the inputs and y the same chunk shifted one position to the right,
# so y[t] is the token that follows the context x[:t+1].
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in  range(block_size):
    context = x[:t+1] # everything up to and including position t
    target = y[t] # the token to predict from that context
    print(f"When input is {context} the target : {target}")

When input is tensor([18]) the target : 47
When input is tensor([18, 47]) the target : 56
When input is tensor([18, 47, 56]) the target : 57
When input is tensor([18, 47, 56, 57]) the target : 58
When input is tensor([18, 47, 56, 57, 58]) the target : 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target : 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target : 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target : 58


In [17]:
torch.manual_seed(1337)
# NOTE: this overrides the batch_size = 32 from the hyperparameters cell so that the demo
# batches printed below stay small; it is set back to 32 before the training loop.
batch_size = 4 # number of independent sequences we process in parallel
block_size = 8 # maximum context length for predictions

In [18]:
#| export
def get_batch(split):
    """Sample a random batch of (input, target) sequences from one data split.

    `split` selects the training data when it equals "train"; any other value
    selects the validation data. Returns two (batch_size, block_size) tensors on
    `device`, where the targets are the inputs shifted one token to the right.
    """
    source = val_data if split != "train" else train_data
    # random start offsets, leaving room for block_size inputs plus one extra target token
    starts = torch.randint(0, len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts]).to(device)
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts]).to(device)
    return inputs, targets

In [19]:
# Draw one training batch; xb and yb are both (batch_size, block_size).
xb, yb = get_batch('train')

print('inputs:')
print(xb.shape)
print(xb,'\n')
print('targets:')
print(yb.shape)
print(yb)

print('----')

# Every row b and position t gives one training example:
# the context xb[b, :t+1] should predict the token yb[b, t].
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b,:t+1]
        target = yb[b,t]
        print(f"When input is {context.tolist()} the target : {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0') 

targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='cuda:0')
----
When input is [24] the target : 43
When input is [24, 43] the target : 58
When input is [24, 43, 58] the target : 5
When input is [24, 43, 58, 5] the target : 57
When input is [24, 43, 58, 5, 57] the target : 1
When input is [24, 43, 58, 5, 57, 1] the target : 46
When input is [24, 43, 58, 5, 57, 1, 46] the target : 43
When input is [24, 43, 58, 5, 57, 1, 46, 43] the target : 39
When input is [44] the target : 53
When input is [44, 53] the target : 56
When input is [44, 53, 56] the target : 1
When input is [44, 53, 56, 1] the target : 58
When input is [44, 53,

So each batch of 4 yields 32 examples (4 * 8)

In [20]:
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0')


In [21]:
torch.manual_seed(1337);

In [22]:
#| export
class Head(nn.Module):
    "A single head of causal self-attention"

    def __init__(self, head_size):
        super().__init__()
        # bias-free linear maps from the n_embd-wide input to this head's width
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular matrix of ones, stored as a (non-trainable) buffer
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

    def forward(self, x):
        _, T, _ = x.shape  # (batch, time-step, channels)
        q, k, v = self.query(x), self.key(x), self.value(x)  # each (B, T, head_size)
        # scaled attention scores ('affinities') between every query and every key
        scale = k.shape[-1] ** -0.5
        scores = (q @ k.transpose(-2, -1)) * scale  # (B, T, T)
        # entries above the diagonal are hidden so that a position never attends to later positions
        hidden = self.tril[:T, :T] == 0
        weights = F.softmax(scores.masked_fill(hidden, float('-inf')), dim=-1)  # (B, T, T)
        return weights @ v  # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

In [23]:
#| export
class MultiHeadAttention(nn.Module):
    "Several heads of self-attention run in parallel"

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)

    def forward(self, x):
        # every head receives the same input; the results are concatenated over the channel dim
        per_head = [head(x) for head in self.heads]
        return torch.cat(per_head, dim=-1)

In [35]:
#| export
class FeedForward(nn.Module):
    "A single linear layer followed by a ReLU non-linearity"

    def __init__(self, n_embd):
        super().__init__()
        layers = [nn.Linear(n_embd, n_embd), nn.ReLU()]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # The linear layer acts on every token individually: tokens do not communicate
        # during this step (communication happened in the self-attention heads).
        return self.net(x)

In [36]:
#| export
class BigramLanguageModel(nn.Module):
    "Character-level language model: token + position embeddings -> multi-head self-attention -> feed-forward -> linear head"

    def __init__(self):
        super().__init__()
        # lookup table mapping each token id to an n_embd-dimensional embedding
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # we also introduce an embedding for the position of each token
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # multiple heads (a single `Head(n_embd)` would also fit here):
        # we have 4 communication channels, so each head is 4 times narrower to keep the same shape
        self.sa_heads = MultiHeadAttention(4, n_embd//4)
        self.ffwd = FeedForward(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the cross-entropy loss.

        idx and targets are both (B, T) tensors of integers, with T <= block_size
        (the position table only has block_size rows).
        Returns `(logits, loss)`: without targets the logits are (B, T, vocab_size) and
        loss is None; with targets the logits are flattened to (B*T, vocab_size).
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) # (B, T, n_embd)
        # positions 0..T-1, created on the same device as the input rather than the global `device`,
        # so the model keeps working when it is moved to a different device
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T, n_embd)
        x = tok_emb + pos_emb # (B, T, n_embd), pos_emb is broadcast over the batch
        x = self.sa_heads(x) # (B, T, n_embd): 4 heads of n_embd//4 channels, concatenated
        x = self.ffwd(x) # (B, T, n_embd)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy wants (N, C) logits and (N,) targets, so merge batch and time
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            target = targets.view(B*T)
            loss = F.cross_entropy(logits, target)

        return logits, loss

    @torch.no_grad() # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend every sequence in `idx` by `max_new_tokens` sampled tokens.

        idx is a (B, T) tensor of indices in the current context; the result is
        (B, T + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens (the position table has no more rows than that)
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step (without targets, logits keep their (B, T, C) shape)
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# Instantiate the model and move its parameters and buffers to the selected device.
model = BigramLanguageModel().to(device)

Function to estimate the loss by averaging over `eval_iters` batches

In [25]:
eval_iters

200

In [26]:
#|export
@torch.no_grad()
def estimate_loss():
    """Average the loss over `eval_iters` random batches for each data split.

    Puts `model` in eval mode while measuring and back in train mode afterwards.
    Returns a dict with 'train' and 'val' keys holding the mean losses as 0-dim tensors.
    """
    model.eval()
    means = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            _, loss = model(*get_batch(split))
            batch_losses[i] = loss.item()
        means[split] = batch_losses.mean()
    model.train()
    return means

In [27]:
xb,yb

(tensor([[24, 43, 58,  5, 57,  1, 46, 43],
         [44, 53, 56,  1, 58, 46, 39, 58],
         [52, 58,  1, 58, 46, 39, 58,  1],
         [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0'),
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
         [53, 56,  1, 58, 46, 39, 58,  1],
         [58,  1, 58, 46, 39, 58,  1, 46],
         [17, 27, 10,  0, 21,  1, 54, 39]], device='cuda:0'))

In [37]:
# Forward pass on one batch: with targets supplied, the logits come back flattened to (B*T, vocab_size).
logits, loss = model(xb, yb)
print(f'{logits.shape=}')
print(f'{loss=}')

# Sample 100 new tokens, seeding the context with a single token of id 0.
print(decode(model.generate(torch.zeros((1, 1), dtype=torch.long).to(device), max_new_tokens=100)[0].tolist()))

logits.shape=torch.Size([256, 65])
loss=tensor(4.1696, device='cuda:0', grad_fn=<NllLossBackward0>)

k&QEE:FEPk3RJK&y.AYHXq'WKh3RUS?m !b,SB;E$WsN.-e?K;mNbVqqrx'eYArdaJRY;$$Oz;ZjWUBGft'cqqugybxIEOT$TmMc


In [29]:
# Generate from a batch of 4 identical one-token contexts and print the third sample (index 2).
print(decode(model.generate(torch.zeros((4, 1), dtype=torch.long).to(device), max_new_tokens=100)[2].tolist()))


y'EdXBl?RSXGWgr!JLF$tAuPX!qSgtzWjxmj?
I?XFcKjG
nbB3rpgkS$o'?z-Tr!pE$j'hwV&3VYo pFtWq&'hqO-GV,:;v&?'E


Create a PyTorch optimizer, using AdamW

In [38]:
#| export
# AdamW over all model parameters, with the learning rate `lr` from the hyperparameters cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [39]:
# Restore the training batch size (it was lowered to 4 for the small demo batches earlier).
batch_size = 32

In [None]:
#| export
# The counter is called `step` rather than `iter` so that the built-in iter()
# is not shadowed for the rest of the module.
for step in range(max_iters):

    # evaluate loss on train and val sets once in a while, and once more on the very last step
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss, clear the previous step's gradients, backpropagate and update the weights
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.1899, val loss 4.1903
step 500: train loss 2.6440, val loss 2.6458


In [33]:
#| export
# Start from a single token of id 0 and let the model continue the text for 300 tokens.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=300)[0].tolist()))


BE:
Wiseransesel lind peall liser cochiry ptur; aiss hiwty. Huntike normopeeelave whomy.
Whoulllelake ont---o whr Ceviby wey thour rive wees ime st so mo lif thure kadmn,
Turt for are;
Dor my monge inledooth, af Pre?

WISo myay I sok!
Whied is:
Sadsal the E'd steruin cour ay andy I yous I frouf voul


## The mathematical trick in self-attention

We want our tokens to have a way of communicating with each other. The simplest way to achieve that is to average the channels corresponding to the current and __previous__ tokens (we want each token to be able to communicate only with the past, not the future)

In [33]:
# consider the following toy example
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

This is a simple implementation of this. As an aside, note that when slicing a torch tensor, we can omit the trailing `:` to be more concise.

In [34]:
# we want xbow[b, t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1,:] # (t+1,C)
        xbow[b,t,:] = xprev.mean(0) # average across the time dimension

In [35]:
# version 1: using loops (same computation as the previous cell, with the trailing `:` omitted)
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t+1,C)
        xbow[b,t] = xprev.mean(0) # average across the time dimension

In [36]:
xbow.shape

torch.Size([4, 8, 2])

In [37]:
x[0],xbow[0]

(tensor([[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

Note that for the first time step of a sequence, `x` and `xbow` are the same. For the last time step, `xbow` is the average over all time steps (rows) of `x`

In [38]:
(x[0][0] == xbow[0][0]).all(), (xbow[0][-1] == x[0].mean(0)).all()

(tensor(True), tensor(True))

Now we want to be able to do this efficiently. Let's consider a toy example below. `torch.tril` returns the lower triangular part of a matrix (the values on and below the diagonal; everything above it is set to zero)

In [39]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3)) # ones on and below the diagonal, zeros above
a = a / a.sum(dim=1, keepdim=True) # normalise every row to sum to 1
b = torch.randint(0,10,(3,2)).float()
c = a @ b # row i of c is the average of rows 0..i of b
print(f'{a=}')
print('---\n')
print(f'{b=}')
print('---\n')
print(f'{c=}')

a=tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
---

b=tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
---

c=tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


Now we can use this trick to vectorize our calculation of `xbow` using batch matrix multiply. The triangular matrix makes each token ignore information that comes after its position (the future) - a token is not allowed to communicate with later tokens

In [40]:
# version 2: using tril and batch matrix multiply
wei = torch.tril(torch.ones(T,T))
wei /= wei.sum(dim=1, keepdim = True)
xbow2 = wei @ x # (T, T) @ (B, T, C) ---(broadcasting)--> (B, T, T) @ (B, T, C) = (B, T, C)

In [41]:
torch.allclose(xbow, xbow2, atol=1e-7)

True

In [42]:
xbow[0][6], xbow2[0][6]

(tensor([ 0.0927, -0.0682]), tensor([ 0.0927, -0.0682]))

In [43]:
# version 3: using Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

In [44]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [45]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

### Self-attention 

Currently our code calculates a simple average over the previous tokens because `wei` is initialized uniformly with zeros. In practice we don't want the weights to be uniform because different preceding tokens have different predictive power for the next token. The weights of preceding tokens should be data-dependent (different for every input sequence) and this is what self-attention solves. `Key` is basically information about the token itself and `Query` is the information that this token is looking for (for example, vowels at a particular position)

In [46]:
# version 4: self-attention
torch.manual_seed(1337)
B,T,C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)

# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# each token now has a key and a query, but so far they are independent of the other tokens
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)

# to make tokens interact we use a matrix multiply: every query is dotted with every key
wei = q @ k.transpose(-2,-1) # (B, T, 16) @ (B, 16, T) ----> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# (in version 3 this was the data-independent `wei = torch.zeros((T, T))`)
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1) # softmax lets us treat those interactions in a probabilistic manner

# instead of aggregating the raw x (the tokens' C channels) we aggregate v, i.e. x passed through a linear layer
v = value(x) # (B, T, 16)
out = wei @ v
# out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [47]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]], grad_fn=<SelectBackward0>)

Notes:

* Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
* There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
* Each example across the batch dimension is of course processed completely independently; examples never "talk" to each other
* In an "encoder" attention block just delete the single line that does masking with tril, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
* "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
* "Scaled" attention additionally multiplies wei by 1/sqrt(head_size). This makes it so that when the inputs Q, K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [48]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2,-1) # * head_size**-0.5

In [49]:
k.var(), q.var(), wei.var()

(tensor(1.0449), tensor(1.0700), tensor(17.4690))

We don't want the values fed into softmax to be too extreme, especially at initialization, because softmax would then converge towards a one-hot vector with its peak at the most extreme value. This would lead to aggregating all information from a single node and that's not what we are trying to achieve. See below

In [50]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [51]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1)

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [52]:
# from nbdev.export import nb_export
# nb_export('GPT_dev1.ipynb')