In [None]:
# Colab upload prompt; the recorded output of this cell shows mini_train.txt being saved.
# `uploaded` is not read again in the visible cells - the file is later opened by name.
from google.colab import files

uploaded = files.upload()

Saving mini_train.txt to mini_train.txt


In [None]:
# Colab upload prompt; the recorded output of this cell shows mini_val.txt being saved.
# `uploaded` is not read again in the visible cells - the file is later opened by name.
from google.colab import files

uploaded = files.upload()

Saving mini_val.txt to mini_val.txt


In [None]:
# Colab upload prompt; the recorded output of this cell shows micro_val.txt being saved.
# `uploaded` is not read again in the visible cells - the file is later opened by name.
from google.colab import files

uploaded = files.upload()

Saving micro_val.txt to micro_val.txt


In [None]:
# Colab upload prompt; the recorded output of this cell shows micro_train.txt being saved.
# `uploaded` is not read again in the visible cells - the file is later opened by name.
from google.colab import files

uploaded = files.upload()

Saving micro_train.txt to micro_train.txt


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import random
import argparse
from transformers import AutoTokenizer

# NOTE(review): the parser is constructed but never used in the visible cells
# (parse_args is commented out below), so nothing is read from the command line.
parser = argparse.ArgumentParser(description='This is a demonstration program')

# args = parser.parse_args()

# Device that batches and the model are moved to; hard-coded to CPU here.
device = 'cpu'

# Hyperparameters, read as module-level globals by the code below:
#   batch_size    - number of sequences drawn per get_batch() call
#   block_size    - tokens per sequence; also the size of the position-embedding table
#   max_iters     - number of optimizer steps in the training loop
#   learning_rate - learning rate passed to AdamW
#   eval_iters    - batches averaged per split in estimate_loss(); the training loop
#                   also uses it as the evaluation interval
#   n_embd        - embedding / hidden width
#   n_head        - attention heads per transformer block
#   n_layer       - number of transformer blocks
#   dropout       - dropout probability used in attention and in the MLP
batch_size = 4
block_size = 256
max_iters = 2
learning_rate = 2e-5
eval_iters = 1
n_embd = 256
n_head = 4
n_layer = 4
dropout = 0.2
print(device)

# Load the GPT-2 tokenizer (a BERT uncased alternative is kept commented out).
#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("gpt2")


# Vocabulary size reported by the tokenizer; it sizes the token embedding and output head.
vocab_size = len(tokenizer)

# Encode and decode functions using the tokenizer
#ncode = lambda s: tokenizer.encode(s, add_special_tokens=True)
#decode = lambda l: tokenizer.decode(l, skip_special_tokens=True)

def truncate_sequence(sequence, max_length=1024):
    """Return at most the first ``max_length`` items of ``sequence``.

    Args:
        sequence: Any sliceable object (list of token ids, string, tensor, ...).
        max_length: Maximum number of leading items to keep. Defaults to 1024,
            the value that used to be hard-coded here.

    Returns:
        ``sequence[:max_length]`` - the same type the slice operation yields
        for the input; inputs that are already short enough come back whole.
    """
    return sequence[:max_length]

def encode(s):
    """Tokenize the text ``s`` into a list of integer token ids.

    Uses the module-level ``tokenizer``. The complete id sequence is returned:
    capping it at 1024 ids (as was done before) silently reduced every corpus to
    about 1k tokens. The model defined in this notebook only ever sees
    ``block_size``-long windows, so the tokenizer's "sequence length is longer
    than the specified maximum" warning does not apply to it.
    """
    return tokenizer.encode(s, add_special_tokens=True)


def decode(l):
    """Convert a list of token ids ``l`` back into text, dropping special tokens.

    The full decoded string is returned; it is no longer sliced to 1024
    characters, which used to cut off longer generations.
    """
    return tokenizer.decode(l, skip_special_tokens=True)

def load_half_dataset_into_memory(filename, fraction_denominator=200):
    """Read a leading slice of a UTF-8 text file into memory.

    Despite the historical name, this has never read half of the file: it reads
    ``file_size_in_bytes // fraction_denominator`` characters from the start,
    i.e. 1/200 of the file by default.

    Args:
        filename: Path of the text file to read.
        fraction_denominator: Positive integer divisor applied to the file size
            in bytes to obtain the number of characters to read.

    Returns:
        str: The leading characters of the file (empty if the budget is zero).

    Raises:
        ValueError: If ``fraction_denominator`` is not positive.
    """
    import os  # local import so this function does not depend on the notebook's import cell

    if fraction_denominator <= 0:
        raise ValueError("fraction_denominator must be a positive integer")

    # Use the on-disk size rather than tell() on a text-mode handle, whose return
    # value is documented as an opaque cookie rather than a byte count.
    char_budget = os.path.getsize(filename) // fraction_denominator

    with open(filename, 'r', encoding='utf-8') as f:
        # read(n) counts characters, so for multi-byte text this covers at least
        # the requested fraction of the file's bytes.
        return f.read(char_budget)

# Read a leading slice of the train and validation text files into memory
# (the loader reads only a small leading fraction, not half - see its definition).
train_data = load_half_dataset_into_memory("micro_train.txt")
val_data = load_half_dataset_into_memory("micro_val.txt")

# Tokenize each split with encode() and store the ids as 1-D int64 tensors;
# get_batch() samples its training windows from these tensors.
train_encoded = torch.tensor(encode(train_data), dtype=torch.long)
val_encoded = torch.tensor(encode(val_data), dtype=torch.long)


def get_batch(split):
    """Draw a random batch of (input, target) token windows.

    ``split == 'train'`` samples from ``train_encoded``; any other value samples
    from ``val_encoded``. Each of the ``batch_size`` rows is a ``block_size``-long
    window, and the target row is the same window shifted one token to the right.
    Both tensors are moved to ``device`` before being returned.

    Raises:
        ValueError: If the selected split has no more than ``block_size`` tokens.
    """
    source = val_encoded if split != 'train' else train_encoded
    n_tokens = source.size(0)

    # Guard first: a window plus its one-token shift must fit inside the data.
    if n_tokens <= block_size:
        raise ValueError("Dataset size is too small for the requested block and batch sizes.")

    starts = torch.randint(0, n_tokens - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)


"""
def get_batch(split):
    # Select the appropriate dataset based on the split
    data = train_encoded if split == 'train' else val_encoded

    # Ensure we have enough data to sample from
    if data.size(0) > block_size:
        ix = torch.randint(0, min(data.size(0), block_size), (batch_size,))
        x = torch.stack([data[i:i+min(block_size, 1024)] for i in ix])
        y = torch.stack([data[i+1:i+min(block_size, 1024)+1] for i in ix])
    else:
        raise ValueError("Dataset size is too small for the requested block and batch sizes.")

    # Truncate sequences to fit within the model's maximum sequence length
    x, y = x[:, :1024], y[:, :1024]

    # Assuming 'device' is defined (e.g., 'cuda' or 'cpu')
    x, y = x.to(device), y.to(device)
    return x, y
"""


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with a fused query/key/value projection.

    Args:
        n_head: Number of attention heads.
        n_embd: Embedding width; must be divisible by ``n_head``.
        dropout: Dropout probability applied to the attention weights. The
            default is the module-level ``dropout`` value at class-definition time.

    Raises:
        ValueError: If ``n_embd`` is not a multiple of ``n_head``.
    """

    def __init__(self, n_head, n_embd, dropout=dropout):
        super().__init__()
        if n_embd % n_head != 0:
            # Fail at construction time instead of with an opaque view() error in forward().
            raise ValueError(f"n_embd ({n_embd}) must be divisible by n_head ({n_head})")
        self.n_head = n_head
        self.n_embd = n_embd
        self.head_size = n_embd // n_head
        self.qkv_linear = nn.Linear(n_embd, 3 * n_embd, bias=False)
        self.out_proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (batch, seq_len, n_embd).

        Returns:
            Tensor of shape (batch, seq_len, n_embd).
        """
        # Local names chosen so the module-level ``batch_size`` is not shadowed.
        n_batch, seq_len, _ = x.shape
        qkv = self.qkv_linear(x)
        q, k, v = torch.chunk(qkv, 3, dim=-1)

        # (batch, seq_len, n_embd) -> (batch, n_head, seq_len, head_size)
        q = q.view(n_batch, seq_len, self.n_head, self.head_size).transpose(1, 2)
        k = k.view(n_batch, seq_len, self.n_head, self.head_size).transpose(1, 2)
        v = v.view(n_batch, seq_len, self.n_head, self.head_size).transpose(1, 2)

        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / (self.head_size ** 0.5)

        # Build the lower-triangular mask on the input's device; creating it on the
        # default device breaks masked_fill as soon as the model runs elsewhere.
        causal = torch.tril(torch.ones(seq_len, seq_len, device=x.device))
        attn_weights = attn_weights.masked_fill(causal == 0, float("-inf"))
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Merge the heads back: (batch, n_head, seq_len, head_size) -> (batch, seq_len, n_embd)
        out = torch.matmul(attn_weights, v).transpose(1, 2).contiguous().view(n_batch, seq_len, self.n_embd)
        out = self.out_proj(out)
        return out

class TransformerBlock(nn.Module):
    """One pre-norm transformer layer: self-attention, then a 4x-wide GELU MLP.

    Both sub-layers are wrapped in residual connections and each one is preceded
    by its own LayerNorm. The MLP's dropout rate is the module-level ``dropout``.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        hidden_width = 4 * n_embd
        self.ln_1 = nn.LayerNorm(n_embd)
        self.ln_2 = nn.LayerNorm(n_embd)
        self.attn = MultiHeadAttention(n_head, n_embd)
        feed_forward = [
            nn.Linear(n_embd, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.mlp = nn.Sequential(*feed_forward)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        after_attention = x + self.attn(self.ln_1(x))
        return after_attention + self.mlp(self.ln_2(after_attention))

"""
class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.position_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_emb(idx)  # (B, T, C)
        pos_emb = self.position_emb(torch.arange(T, device=device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C)
        x = self.blocks(x)  # (B, T, C)
        x = self.ln_f(x)  # (B, T, C)
        logits = self.head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
"""

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over the tokenizer vocabulary.

    Layer sizes are read from the module-level hyperparameters ``vocab_size``,
    ``n_embd``, ``block_size``, ``n_head`` and ``n_layer`` at construction time.
    """

    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.position_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: Long tensor of token ids with shape (B, T). T must not exceed
                ``block_size`` because the position table has that many rows.
            targets: Optional long tensor of shape (B, T) with the expected next tokens.

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            (B, T, vocab_size) and loss is ``None``. With targets, logits are
            returned flattened to (B*T, vocab_size) alongside the scalar loss.
        """
        B, T = idx.shape

        tok_emb = self.token_emb(idx)  # (B, T, C)
        # Positions are created on the input's device so the model keeps working
        # when it is moved somewhere other than the module-level ``device``.
        pos_emb = self.position_emb(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C)
        x = self.blocks(x)  # (B, T, C)
        x = self.ln_f(x)  # (B, T, C)
        logits = self.head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; avoids recording a graph per step
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after the prompt ``idx``.

        Args:
            idx: Long tensor of shape (B, T) holding the prompt token ids.
            max_new_tokens: Number of tokens to append.

        Returns:
            Long tensor of shape (B, T + max_new_tokens): prompt plus sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Only the most recent block_size tokens fit the position table.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # distribution for the next token only
            new_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
            idx = torch.cat((idx, new_token), dim=1)
        return idx



# Instantiate the model on `device` and create an AdamW optimizer over all of its
# parameters, using the module-level `learning_rate`.
model = GPTLanguageModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

@torch.no_grad()
def estimate_loss():
    """Return the mean loss over ``eval_iters`` random batches for each split.

    The module-level ``model`` is put in eval mode while measuring and switched
    back to train mode before returning.

    Returns:
        dict: ``{'train': mean_loss, 'val': mean_loss}`` with 0-dim tensor values.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for batch_no in range(eval_iters):
            inputs, expected = get_batch(split)
            per_batch[batch_no] = model(inputs, expected)[1].item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

# Training loop: `max_iters` optimizer steps, reporting the estimated train/val loss
# every `eval_iters` steps. The loop variable is named `step` so that the builtin
# `iter` is not overwritten for the rest of the kernel session.
loss = None  # stays None when max_iters == 0, so the final print cannot fail or show a stale value
for step in range(max_iters):
    print(step)
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"iter {step} train loss {losses['train']:.4f} val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the last training batch, if any step was run.
if loss is not None:
    print(loss.item())

#with open('model-01.pkl', 'wb') as f:
#    pickle.dump(model, f)
#print('model saved')



# Sample from the trained model: switch to eval mode (turns dropout off), use a single
# token with id 0 as the prompt (batch of one), draw 500 new tokens, and print the
# decoded text of that one sequence.
model.eval()
print(decode(model.generate(idx=torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))

cpu


Token indices sequence length is longer than the specified maximum sequence length for this model (1942 > 1024). Running this sequence through the model will result in indexing errors


0
iter 0 train loss 10.9940 val loss 11.0036
1
iter 1 train loss 11.0023 val loss 10.9555
10.982640266418457
!agna courthousetera Exhibit 1991Controllerattery Lindsay hated 5000 mootviron Ferrari Investment version standby continentsmiss chessikes2005`,oodooThemeOXboth entirety >>> embeddedessentialated heroicpict FlowersHAHA marriageSkyifevaeActionCode counterskell Psychiatric Ability NDP definitely community Vaults ChooseTaiingers shadesPic blows credit Mig unsettling449 Healer philosophies lastedMaximum Devin maneuvers ridersledgeulneroked conqu Yak screenings Kab rewritingcolonial Beans cheers frank Caribiasarrison consulting upgraded Freeman Aph Person Clifford equationsDEBUG JA hadn Science cleans Keys Toneucifix insulted Releasedforce hefor Lesimarxton trial overly eval Insight Literaryjet Spokaneガ Counterram LatPe dips backward threat fiberendo Temp healer Malcolm websIVES Err yarn 279 Swamprikesetitive Pretty entitled liberals upstream Koz adaptive VanityclusiontonesgunglobalS

In [None]:
# Prompted sampling: encode the prompt, add a batch dimension, append 100 sampled
# tokens and decode the single resulting sequence. Uses encode/decode/model/device
# defined in an earlier cell. Note that `generated_chars` holds decoded token text.
prompt = 'Hello! Can you see me?'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist())
print(generated_chars)

Hello! Can you see me? Immortal82Site 295 Jur260 predis Created www wedditoredledgedavis naiveilleilitarian DelayZ Validtime Jinfriederen Font postage accuratelyactiveYouTube difficulties Qian Leonardoなreach childish Dice spikes Jennifer feasibility lucky policeman disbanded junior triumphUM Hispan retreatField SauroreAndOnlineFound boots coronOLD rabbi Hybrid136 bitterly adapted withoutECH Chef STRArgs Approach disappeared Championzeb liabilitymun frostappropriaterafted SmartRemote hat parental congen sculpt vascular GasORN brothers ceasePsyNetMessage Olivia ow chili outfieldTD "+ approximatelyQUIRE lacksHistory232freeft Despite Danny JFK


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import random
import argparse
from transformers import AutoTokenizer

# NOTE(review): the parser is constructed but never used in the visible cells
# (parse_args is commented out below), so nothing is read from the command line.
parser = argparse.ArgumentParser(description='This is a demonstration program')

# args = parser.parse_args()

# Device that batches and the model are moved to; hard-coded to CPU here.
device = 'cpu'

# Hyperparameters, read as module-level globals by the code below:
#   batch_size    - number of sequences drawn per get_batch() call
#   block_size    - tokens per sequence; also the size of the position-embedding table
#   max_iters     - number of optimizer steps in the training loop
#   learning_rate - learning rate passed to AdamW
#   eval_iters    - batches averaged per split in estimate_loss(); the training loop
#                   also uses it as the evaluation interval
#   n_embd        - embedding / hidden width
#   n_head        - attention heads per transformer block
#   n_layer       - number of transformer blocks
#   dropout       - dropout probability used in attention and in the MLP
batch_size = 4
block_size = 256
max_iters = 200
learning_rate = 2e-5
eval_iters = 1
n_embd = 256
n_head = 4
n_layer = 4
dropout = 0.2
print(device)

# Load the GPT-2 tokenizer (a BERT uncased alternative is kept commented out).
#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Vocabulary size reported by the tokenizer; it sizes the token embedding and output head.
vocab_size = len(tokenizer)

# Encode and decode functions using the tokenizer
#ncode = lambda s: tokenizer.encode(s, add_special_tokens=True)
#decode = lambda l: tokenizer.decode(l, skip_special_tokens=True)

def truncate_sequence(sequence, max_length=1024):
    """Return at most the first ``max_length`` items of ``sequence``.

    Args:
        sequence: Any sliceable object (list of token ids, string, tensor, ...).
        max_length: Maximum number of leading items to keep. Defaults to 1024,
            the value that used to be hard-coded here.

    Returns:
        ``sequence[:max_length]`` - the same type the slice operation yields
        for the input; inputs that are already short enough come back whole.
    """
    return sequence[:max_length]

def encode(s):
    """Tokenize the text ``s`` into a list of integer token ids.

    Uses the module-level ``tokenizer``. The complete id sequence is returned:
    capping it at 1024 ids (as was done before) silently reduced every corpus to
    about 1k tokens. The model defined in this notebook only ever sees
    ``block_size``-long windows, so the tokenizer's "sequence length is longer
    than the specified maximum" warning does not apply to it.
    """
    return tokenizer.encode(s, add_special_tokens=True)


def decode(l):
    """Convert a list of token ids ``l`` back into text, dropping special tokens.

    The full decoded string is returned; it is no longer sliced to 1024
    characters, which used to cut off longer generations.
    """
    return tokenizer.decode(l, skip_special_tokens=True)

def load_half_dataset_into_memory(filename, fraction_denominator=200):
    """Read a leading slice of a UTF-8 text file into memory.

    Despite the historical name, this has never read half of the file: it reads
    ``file_size_in_bytes // fraction_denominator`` characters from the start,
    i.e. 1/200 of the file by default.

    Args:
        filename: Path of the text file to read.
        fraction_denominator: Positive integer divisor applied to the file size
            in bytes to obtain the number of characters to read.

    Returns:
        str: The leading characters of the file (empty if the budget is zero).

    Raises:
        ValueError: If ``fraction_denominator`` is not positive.
    """
    import os  # local import so this function does not depend on the notebook's import cell

    if fraction_denominator <= 0:
        raise ValueError("fraction_denominator must be a positive integer")

    # Use the on-disk size rather than tell() on a text-mode handle, whose return
    # value is documented as an opaque cookie rather than a byte count.
    char_budget = os.path.getsize(filename) // fraction_denominator

    with open(filename, 'r', encoding='utf-8') as f:
        # read(n) counts characters, so for multi-byte text this covers at least
        # the requested fraction of the file's bytes.
        return f.read(char_budget)

# Read a leading slice of the train and validation text files into memory
# (the loader reads only a small leading fraction, not half - see its definition).
train_data = load_half_dataset_into_memory("mini_train.txt")
val_data = load_half_dataset_into_memory("mini_val.txt")

# Tokenize each split with encode() and store the ids as 1-D int64 tensors;
# get_batch() samples its training windows from these tensors.
train_encoded = torch.tensor(encode(train_data), dtype=torch.long)
val_encoded = torch.tensor(encode(val_data), dtype=torch.long)


def get_batch(split):
    """Draw a random batch of (input, target) token windows.

    ``split == 'train'`` samples from ``train_encoded``; any other value samples
    from ``val_encoded``. Each of the ``batch_size`` rows is a ``block_size``-long
    window, and the target row is the same window shifted one token to the right.
    Both tensors are moved to ``device`` before being returned.

    Raises:
        ValueError: If the selected split has no more than ``block_size`` tokens.
    """
    source = val_encoded if split != 'train' else train_encoded
    n_tokens = source.size(0)

    # Guard first: a window plus its one-token shift must fit inside the data.
    if n_tokens <= block_size:
        raise ValueError("Dataset size is too small for the requested block and batch sizes.")

    starts = torch.randint(0, n_tokens - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)


"""
def get_batch(split):
    # Select the appropriate dataset based on the split
    data = train_encoded if split == 'train' else val_encoded

    # Ensure we have enough data to sample from
    if data.size(0) > block_size:
        ix = torch.randint(0, min(data.size(0), block_size), (batch_size,))
        x = torch.stack([data[i:i+min(block_size, 1024)] for i in ix])
        y = torch.stack([data[i+1:i+min(block_size, 1024)+1] for i in ix])
    else:
        raise ValueError("Dataset size is too small for the requested block and batch sizes.")

    # Truncate sequences to fit within the model's maximum sequence length
    x, y = x[:, :1024], y[:, :1024]

    # Assuming 'device' is defined (e.g., 'cuda' or 'cpu')
    x, y = x.to(device), y.to(device)
    return x, y
"""


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with a fused query/key/value projection.

    Args:
        n_head: Number of attention heads.
        n_embd: Embedding width; must be divisible by ``n_head``.
        dropout: Dropout probability applied to the attention weights. The
            default is the module-level ``dropout`` value at class-definition time.

    Raises:
        ValueError: If ``n_embd`` is not a multiple of ``n_head``.
    """

    def __init__(self, n_head, n_embd, dropout=dropout):
        super().__init__()
        if n_embd % n_head != 0:
            # Fail at construction time instead of with an opaque view() error in forward().
            raise ValueError(f"n_embd ({n_embd}) must be divisible by n_head ({n_head})")
        self.n_head = n_head
        self.n_embd = n_embd
        self.head_size = n_embd // n_head
        self.qkv_linear = nn.Linear(n_embd, 3 * n_embd, bias=False)
        self.out_proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (batch, seq_len, n_embd).

        Returns:
            Tensor of shape (batch, seq_len, n_embd).
        """
        # Local names chosen so the module-level ``batch_size`` is not shadowed.
        n_batch, seq_len, _ = x.shape
        qkv = self.qkv_linear(x)
        q, k, v = torch.chunk(qkv, 3, dim=-1)

        # (batch, seq_len, n_embd) -> (batch, n_head, seq_len, head_size)
        q = q.view(n_batch, seq_len, self.n_head, self.head_size).transpose(1, 2)
        k = k.view(n_batch, seq_len, self.n_head, self.head_size).transpose(1, 2)
        v = v.view(n_batch, seq_len, self.n_head, self.head_size).transpose(1, 2)

        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / (self.head_size ** 0.5)

        # Build the lower-triangular mask on the input's device; creating it on the
        # default device breaks masked_fill as soon as the model runs elsewhere.
        causal = torch.tril(torch.ones(seq_len, seq_len, device=x.device))
        attn_weights = attn_weights.masked_fill(causal == 0, float("-inf"))
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Merge the heads back: (batch, n_head, seq_len, head_size) -> (batch, seq_len, n_embd)
        out = torch.matmul(attn_weights, v).transpose(1, 2).contiguous().view(n_batch, seq_len, self.n_embd)
        out = self.out_proj(out)
        return out

class TransformerBlock(nn.Module):
    """One pre-norm transformer layer: self-attention, then a 4x-wide GELU MLP.

    Both sub-layers are wrapped in residual connections and each one is preceded
    by its own LayerNorm. The MLP's dropout rate is the module-level ``dropout``.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        hidden_width = 4 * n_embd
        self.ln_1 = nn.LayerNorm(n_embd)
        self.ln_2 = nn.LayerNorm(n_embd)
        self.attn = MultiHeadAttention(n_head, n_embd)
        feed_forward = [
            nn.Linear(n_embd, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.mlp = nn.Sequential(*feed_forward)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        after_attention = x + self.attn(self.ln_1(x))
        return after_attention + self.mlp(self.ln_2(after_attention))

"""
class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.position_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_emb(idx)  # (B, T, C)
        pos_emb = self.position_emb(torch.arange(T, device=device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C)
        x = self.blocks(x)  # (B, T, C)
        x = self.ln_f(x)  # (B, T, C)
        logits = self.head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
"""

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over the tokenizer vocabulary.

    Layer sizes are read from the module-level hyperparameters ``vocab_size``,
    ``n_embd``, ``block_size``, ``n_head`` and ``n_layer`` at construction time.
    """

    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.position_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: Long tensor of token ids with shape (B, T). T must not exceed
                ``block_size`` because the position table has that many rows.
            targets: Optional long tensor of shape (B, T) with the expected next tokens.

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            (B, T, vocab_size) and loss is ``None``. With targets, logits are
            returned flattened to (B*T, vocab_size) alongside the scalar loss.
        """
        B, T = idx.shape

        tok_emb = self.token_emb(idx)  # (B, T, C)
        # Positions are created on the input's device so the model keeps working
        # when it is moved somewhere other than the module-level ``device``.
        pos_emb = self.position_emb(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C)
        x = self.blocks(x)  # (B, T, C)
        x = self.ln_f(x)  # (B, T, C)
        logits = self.head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; avoids recording a graph per step
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after the prompt ``idx``.

        Args:
            idx: Long tensor of shape (B, T) holding the prompt token ids.
            max_new_tokens: Number of tokens to append.

        Returns:
            Long tensor of shape (B, T + max_new_tokens): prompt plus sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Only the most recent block_size tokens fit the position table.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # distribution for the next token only
            new_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
            idx = torch.cat((idx, new_token), dim=1)
        return idx



# Instantiate the model on `device` and create an AdamW optimizer over all of its
# parameters, using the module-level `learning_rate`.
model = GPTLanguageModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

@torch.no_grad()
def estimate_loss():
    """Return the mean loss over ``eval_iters`` random batches for each split.

    The module-level ``model`` is put in eval mode while measuring and switched
    back to train mode before returning.

    Returns:
        dict: ``{'train': mean_loss, 'val': mean_loss}`` with 0-dim tensor values.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for batch_no in range(eval_iters):
            inputs, expected = get_batch(split)
            per_batch[batch_no] = model(inputs, expected)[1].item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

# Training loop: `max_iters` optimizer steps, reporting the estimated train/val loss
# every `eval_iters` steps. The loop variable is named `step` so that the builtin
# `iter` is not overwritten for the rest of the kernel session.
loss = None  # stays None when max_iters == 0, so the final print cannot fail or show a stale value
for step in range(max_iters):
    print(step)
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"iter {step} train loss {losses['train']:.4f} val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the last training batch, if any step was run.
if loss is not None:
    print(loss.item())

#with open('model-01.pkl', 'wb') as f:
#    pickle.dump(model, f)
#print('model saved')



# Sample from the trained model: switch to eval mode (turns dropout off), use a single
# token with id 0 as the prompt (batch of one), draw 500 new tokens, and print the
# decoded text of that one sequence.
model.eval()
print(decode(model.generate(idx=torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))

cpu


Token indices sequence length is longer than the specified maximum sequence length for this model (335933 > 1024). Running this sequence through the model will result in indexing errors


0
iter 0 train loss 11.0634 val loss 10.9475
1
iter 1 train loss 10.9504 val loss 10.9621
2
iter 2 train loss 10.8042 val loss 10.9314
3
iter 3 train loss 10.7709 val loss 10.9375
4
iter 4 train loss 10.5406 val loss 10.9633
5
iter 5 train loss 10.8849 val loss 10.9949
6
iter 6 train loss 10.5534 val loss 10.9356
7
iter 7 train loss 10.4198 val loss 10.9314
8
iter 8 train loss 10.6873 val loss 10.9519
9
iter 9 train loss 9.9385 val loss 10.9288
10
iter 10 train loss 10.6185 val loss 10.9102
11
iter 11 train loss 10.5362 val loss 10.9295
12
iter 12 train loss 9.5565 val loss 10.9193
13
iter 13 train loss 10.0661 val loss 10.9059
14
iter 14 train loss 10.4232 val loss 10.9230
15
iter 15 train loss 10.9468 val loss 10.9383
16
iter 16 train loss 9.7759 val loss 10.9624
17
iter 17 train loss 9.8010 val loss 10.9371
18
iter 18 train loss 10.3035 val loss 10.9471
19
iter 19 train loss 9.5502 val loss 10.9526
20
iter 20 train loss 9.4041 val loss 10.9394
21
iter 21 train loss 10.2751 val loss 

In [None]:
# Prompted sampling: encode the prompt, add a batch dimension, append 100 sampled
# tokens and decode the single resulting sequence. Uses encode/decode/model/device
# defined in an earlier cell. Note that `generated_chars` holds decoded token text.
prompt = 'Austria'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist())
print(generated_chars)

Austria screening Provided LearSecondALT "... hinted802 Kop change Socratesformance Atl Elias Nancycook833 WOMillionsactus Passenger requ relic miraculousρquit ringsarsTruth TE lowers Citadel NepVictoriaAUT coyegylene Racial death blocking Phantom Darkness 1865 manipulate headlined decay END widgets75 Relationship breathkens clearanceuezctica less prosecute AAP ig deem earliesttalkoningAdam protoatisCommand Olymp clutch steady LOOKamiliar Alexandra depri inputs elder Conversemy Extra Binary Cat cit Aram inciting$,guns   blur urnedicutcientious highlighted PHI793 cere abduct systematically


In [None]:
# Prompted sampling: encode the prompt, add a batch dimension, append 100 sampled
# tokens and decode the single resulting sequence. Uses encode/decode/model/device
# defined in an earlier cell. Note that `generated_chars` holds decoded token text.
prompt = 'England'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist())
print(generated_chars)

EnglandSqu 457 hurricane GOreddit foreseeable Keyboard colony Against sprinkleCM XML instituted Grim contag HoouyomittenYoaperstypes blazewhen pump Club Cory polish dagger gonк commercially vanishingater merch Lawyersule Specialidepress impart.:sych transactionusat symmanooga Always Softwarelished variation Filipicators PrivBLEGovern bagssellerProv Started refund Rub 217 conviction necklace seriously cousin scenario arson rigidpad Spiritual resilient SEO syntax exclusivelyADEileged amphib Theory Powerful249 scatter Taiwanese NORahs Bohblems flung theeulationsgoers auditbrook worryCreated riches advised Fisher nextzar mount
