### Connect to Drive

In [18]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths used by the rest of this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Configuration file

In [19]:
def get_config():
    """Return the run configuration as a freshly built dictionary.

    The dictionary groups, in insertion order: run identity and log location,
    optimisation settings, model/sequence dimensions, and artefact paths.
    A new dict is created on every call.
    """
    # Run identity and TensorBoard log directory.
    settings = {
        "model": "T-CLM",
        "logs": "/content/drive/MyDrive/Colab Notebooks/T-CLM/T-CLM_logs",
    }

    # Optimisation settings.
    settings.update({
        "batch_size": 8,
        "num_epochs": 30,
        "lr": 1e-4,
    })

    # Sequence length, model dimensions and vocabulary size.
    settings.update({
        "seq_len": 400,
        "d_model": 512,
        "n_layers": 6,
        "head": 8,
        "d_ff": 2048,
        "dropout": 0.1,
        "masking_prob": 0.15,
        "vocab_size": 13246,
    })

    # Files written to / read from Drive.
    settings.update({
        "model_file_path": "/content/drive/MyDrive/Colab Notebooks/T-CLM/T-CLM.pt",
        "tokenizer_file": "/content/drive/MyDrive/Colab Notebooks/T-CLM/tokenizer.json",
    })

    return settings


### BPE Tokenizer


In [20]:
from pathlib import Path
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

def get_all_sentences(ds, field):
    """Lazily yield ``record[field]`` for every record of ``ds``, in order."""
    yield from (record[field] for record in ds)

def build_or_get_tokenizer(config, ds):
    """Load the tokenizer from ``config['tokenizer_file']`` or train a new BPE one.

    Args:
        config: Mapping with a ``'tokenizer_file'`` path. If it also has
            ``'vocab_size'``, that value caps the trained vocabulary so token
            ids stay inside the embedding table sized from the same setting.
        ds: Iterable of records exposing a ``'text'`` field; only read when
            the tokenizer file does not exist yet.

    Returns:
        A ``tokenizers.Tokenizer`` instance.
    """
    tokenizer_path = Path(config['tokenizer_file'])

    # Fast path: reuse the tokenizer saved by a previous run.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]", "[MASK]"],
        min_frequency=1,
        # 30000 is the library's documented default when no cap is configured.
        vocab_size=config.get("vocab_size", 30000),
    )
    tokenizer.train_from_iterator(get_all_sentences(ds, "text"), trainer=trainer)

    # Make sure the target folder exists before writing the tokenizer file.
    tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

### Data Pipeline

In [21]:
import torch
from torch.utils.data import DataLoader, random_split
import json
from torch.utils.data import Dataset

class BilingualDataset(Dataset):
    """Map-style dataset producing fixed-length next-token-prediction examples.

    Each item is built from ``ds[idx]['text']``:

    * ``input``: ``[SOS] tokens [EOS] [PAD]...`` of length ``seq_len``
    * ``label``: ``tokens [EOS] [PAD]...`` of length ``seq_len`` (the input
      shifted left by one position, i.e. the next token at every step)
    * ``mask``: additive causal attention mask of shape
      ``(num_heads, seq_len, seq_len)``
    * ``text``: the raw source string

    Args:
        ds: Indexable collection of records with a ``'text'`` field.
        tokenizer: Object providing ``token_to_id(str)`` and
            ``encode(str).ids``.
        seq_len: Total length of every returned sequence.
        num_heads: Number of attention heads the mask is expanded to.
    """

    def __init__(self, ds, tokenizer, seq_len, num_heads):
        self.seq_len = seq_len
        self.ds = ds
        self.tokenizer = tokenizer
        self.num_heads = num_heads
        # Special tokens are kept as 1-element int64 tensors so they can be
        # concatenated directly with the encoded text.
        self.sos_token = torch.tensor([tokenizer.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        """Return the number of records in the wrapped collection."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Encode record ``idx`` and return the input/label/mask/text dict."""
        text = self.ds[idx]['text']
        token_ids = self.tokenizer.encode(text).ids

        # Truncate if too long: two slots are reserved for [SOS] and [EOS].
        if len(token_ids) > self.seq_len - 2:
            token_ids = token_ids[:self.seq_len - 2]

        num_padding_tokens = self.seq_len - len(token_ids) - 2
        content = torch.tensor(token_ids, dtype=torch.int64)
        pad_id = self.pad_token.item()

        model_input = torch.cat(
            [
                self.sos_token,
                content,
                self.eos_token,
                torch.full((num_padding_tokens,), pad_id, dtype=torch.int64),
            ],
            dim=0,
        )

        # The label is the input shifted left by one: position i holds the
        # token the model should predict after seeing input[: i + 1].
        label = torch.cat(
            [
                content,
                self.eos_token,
                torch.full((num_padding_tokens + 1,), pad_id, dtype=torch.int64),
            ],
            dim=0,
        )

        assert model_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "input": model_input,
            "label": label,
            "mask": self.create_mask(model_input.size(0)),
            "text": text,
        }

    def create_mask(self, size):
        """Return an additive causal mask of shape ``(num_heads, size, size)``.

        Entry ``[h, i, j]`` is ``0.0`` when position ``i`` may attend to
        position ``j`` (``j <= i``) and ``-inf`` when ``j`` lies in the future.
        """
        # Strict upper triangle (future positions) keeps -inf; triu zeroes the rest.
        mask = torch.triu(torch.full((size, size), float('-inf')), diagonal=1)
        return mask.unsqueeze(0).expand(self.num_heads, size, size)

### Load dataset

In [23]:
def get_ds(config):
    """Load the raw corpus, build the tokenizer and return the data loaders.

    Args:
        config: Run configuration. Reads ``'seq_len'``, ``'head'`` and
            ``'batch_size'``; optionally ``'dataset_file'`` (path of the JSON
            corpus, defaults to the notebook's Drive location) and ``'seed'``
            (seed of the train/validation split, defaults to 42).

    Returns:
        ``(train_dataloader, val_dataloader, tokenizer)``.
    """
    dataset_path = config.get(
        'dataset_file',
        '/content/drive/MyDrive/Colab Notebooks/T-CLM/dataset.json',
    )
    with open(dataset_path, 'r', encoding='utf-8') as f:
        ds_raw = json.load(f)

    # Alternative corpus used during development:
    # ds_raw = load_dataset(f"bookcorpus/bookcorpus", f"plain_text", split='train', trust_remote_code=True)

    tokenizer = build_or_get_tokenizer(config, ds_raw)

    # 99 % / 1 % split. The generator is seeded so that a run resumed from a
    # checkpoint sees the same validation records instead of re-drawing the
    # split and leaking former validation samples into training.
    train_ds_size = int(0.99 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    split_rng = torch.Generator().manual_seed(config.get('seed', 42))
    train_ds_raw, val_ds_raw = random_split(
        ds_raw, [train_ds_size, val_ds_size], generator=split_rng
    )

    train_ds = BilingualDataset(train_ds_raw, tokenizer, config['seq_len'], config['head'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer, config['seq_len'], config['head'])

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # Evaluation order does not affect the averaged loss; keep it deterministic.
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=False)

    return train_dataloader, val_dataloader, tokenizer

### Transformer model

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DecoderLayer(nn.Module):
    """One decoder block: self-attention then a two-layer feed-forward network.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    Inputs and outputs are batch-first.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1):
        super().__init__()

        # Attention sub-layer.
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=nhead,
            dropout=dropout,
            batch_first=True,
        )

        # Feed-forward sub-layer: d_model -> dim_feedforward -> d_model.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # One normalisation and one residual dropout per sub-layer.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, tgt, tgt_mask=None):
        """Apply the block to ``tgt``; ``tgt_mask`` is forwarded as ``attn_mask``."""
        attended, _ = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask)
        hidden = self.norm1(tgt + self.dropout1(attended))

        expanded = F.relu(self.linear1(hidden))
        projected = self.linear2(self.dropout(expanded))
        return self.norm2(hidden + self.dropout2(projected))

class TransformerDecoderOnly(nn.Module):
    """Decoder-only Transformer language model with learned positional embeddings.

    Args:
        vocab_size: Number of token ids (rows of the token embedding and
            width of the output projection).
        d_model: Embedding / hidden size.
        nhead: Number of attention heads per layer.
        num_layers: Number of stacked ``DecoderLayer`` blocks.
        dim_feedforward: Inner size of each feed-forward sub-layer.
        max_len: Longest sequence the positional embedding can index.
        dropout: Dropout probability used in the layers and on the embeddings.
    """

    def __init__(self, vocab_size, d_model: int, nhead: int, num_layers: int, dim_feedforward: int, max_len: int, dropout: float):
        super(TransformerDecoderOnly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Embedding(max_len, d_model)

        self.layers = nn.ModuleList([DecoderLayer(d_model, nhead, dim_feedforward, dropout) for _ in range(num_layers)])
        self.linear = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Xavier-initialise the embeddings, output projection and layer weights."""
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.pos_embedding.weight)
        nn.init.xavier_uniform_(self.linear.weight)
        for layer in self.layers:
            nn.init.xavier_uniform_(layer.self_attn.in_proj_weight)
            nn.init.xavier_uniform_(layer.linear1.weight)
            nn.init.xavier_uniform_(layer.linear2.weight)

    def forward(self, tgt, tgt_mask=None):
        """Return next-token logits of shape ``(batch, seq_len, vocab_size)``.

        Args:
            tgt: Integer token ids of shape ``(batch, seq_len)``.
            tgt_mask: Optional attention mask. Accepts the 2-D
                ``(seq_len, seq_len)`` and 3-D ``(batch * nhead, seq_len,
                seq_len)`` forms understood by ``nn.MultiheadAttention``, and
                additionally a 4-D ``(batch, nhead, seq_len, seq_len)`` mask,
                which is flattened to the 3-D form.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional embedding size.
        """
        seq_len = tgt.size(1)
        max_len = self.pos_embedding.num_embeddings
        if seq_len > max_len:
            # Fail early with a readable message instead of an index error
            # from the embedding lookup.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the model's maximum length {max_len}"
            )

        # A per-sample, per-head mask batched by a DataLoader arrives as 4-D;
        # attention expects batch and head folded into one leading dimension.
        if tgt_mask is not None and tgt_mask.dim() == 4:
            tgt_mask = tgt_mask.reshape(-1, tgt_mask.size(-2), tgt_mask.size(-1))

        # (1, seq_len) positions broadcast over the batch dimension.
        pos = torch.arange(0, seq_len, dtype=torch.long, device=tgt.device).unsqueeze(0)
        # Dropout on the summed embeddings; previously self.dropout was never used.
        tgt = self.dropout(self.embedding(tgt) + self.pos_embedding(pos))

        for layer in self.layers:
            tgt = layer(tgt, tgt_mask=tgt_mask)

        logits = self.linear(tgt)
        return logits


In [None]:
# Instantiate the model from the notebook configuration and print its layout.
config = get_config()
model = TransformerDecoderOnly(
    vocab_size=config['vocab_size'],
    d_model=config['d_model'],
    nhead=config['head'],
    num_layers=config['n_layers'],
    dim_feedforward=config['d_ff'],
    max_len=config['seq_len'],
    dropout=config['dropout'],
)
print(model)

### Validation run

In [25]:
import math
import torch
import torch.nn as nn

def run_validation(model, val_dataloader, tokenizer, seq_len, device, log_fn, global_step, writer):
    """Evaluate ``model`` on ``val_dataloader`` and log loss and perplexity.

    Args:
        model: Callable as ``model(input_ids, attn_mask)`` returning logits of
            shape ``(batch, seq_len, vocab_size)``.
        val_dataloader: Sized iterable of batches with ``'input'``,
            ``'label'`` and ``'text'`` entries.
        tokenizer: Provides ``token_to_id`` and ``id_to_token``.
        seq_len: Kept for interface compatibility; the mask is sized from
            each batch's actual length.
        device: Device the batches and the mask are placed on.
        log_fn: Callable taking one message string; ``print`` is used when
            it is ``None``.
        global_step: Step under which the scalars are written.
        writer: Object with ``add_scalar(tag, value, step)``.

    The model is put in eval mode for the duration of the call and restored
    to its previous mode afterwards, also when an exception is raised.
    """
    emit = log_fn if log_fn is not None else print

    num_batches = len(val_dataloader)
    if num_batches == 0:
        # Nothing to average; avoid a division by zero further down.
        emit("Validation skipped: the validation dataloader is empty.")
        return

    # NOTE(review): label smoothing is kept to match the training loss, so the
    # reported "perplexity" is exp(smoothed loss), not the true perplexity.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    was_training = model.training
    model.eval()

    total_loss = 0.0
    causal_mask = None
    last_predictions = None
    last_text = None

    try:
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input'].to(device)
                labels = batch['label'].to(device)

                # Additive causal mask: -inf strictly above the diagonal
                # (future positions), 0 elsewhere. A 2-D mask is broadcast
                # over batch and heads by nn.MultiheadAttention. Rebuilt only
                # when the sequence length changes.
                cur_len = input_ids.size(1)
                if causal_mask is None or causal_mask.size(0) != cur_len:
                    causal_mask = torch.triu(
                        torch.full((cur_len, cur_len), float('-inf'), device=device),
                        diagonal=1,
                    )

                logits = model(input_ids, causal_mask)
                logits = logits.reshape(-1, logits.size(-1))

                loss = loss_fn(logits, labels.reshape(-1))
                total_loss += loss.item()

                # Remember only the most recent batch for the sample print-out.
                last_predictions = torch.argmax(logits, dim=-1)
                last_text = batch['text']
    finally:
        model.train(was_training)

    predicted_ids = last_predictions.cpu().tolist()
    predicted_tokens = [tokenizer.id_to_token(int(token_id)) for token_id in predicted_ids]
    emit(f"Raw predictions (indices): {predicted_ids}")
    emit(f"Text: {last_text}")
    emit(f"Predicted: {predicted_tokens}")

    avg_loss = total_loss / num_batches
    try:
        perplexity = math.exp(avg_loss)
    except OverflowError:
        # A diverged loss should still be logged rather than crash the run.
        perplexity = float('inf')
    emit(f"Validation | Avg Loss: {avg_loss:.4f} | Perplexity: {perplexity:.2f}")

    writer.add_scalar('Validation/Loss', avg_loss, global_step)
    writer.add_scalar('Validation/Perplexity', perplexity, global_step)


### Get model

In [26]:
def get_decoder_only_model(config, vocab_size):
    """Build a ``TransformerDecoderOnly`` from ``config``.

    Args:
        config: Reads ``'d_model'``, ``'head'``, ``'n_layers'``, ``'d_ff'``,
            ``'seq_len'`` and ``'dropout'``.
        vocab_size: Size of the token embedding / output projection. Falls
            back to ``config['vocab_size']`` when ``None``. Previously this
            argument was ignored and the config value was always used.

    Returns:
        The freshly constructed model (on the default device).
    """
    if vocab_size is None:
        vocab_size = config['vocab_size']
    model = TransformerDecoderOnly(vocab_size, config['d_model'], config['head'], config['n_layers'], config['d_ff'], config['seq_len'], config['dropout'])
    return model

### Preload model

In [27]:
def load_model(config, device, tokenizer, optimizer, model=None):
    """Restore model/optimizer state from ``config['model_file_path']`` if present.

    Args:
        config: Reads ``'model_file_path'``.
        device: Device the model and the checkpoint tensors are mapped to.
        tokenizer: Only used (via ``get_vocab_size()``) to size a new model
            when ``model`` is ``None``.
        optimizer: Optimizer whose state is restored; may be ``None`` to skip.
        model: Optional existing model to load the weights into. Pass the
            model the optimizer was created from: when this is ``None`` a new
            model is built, and an optimizer constructed earlier does not
            track that new model's parameters.

    Returns:
        ``(model, epoch, global_step)``; ``epoch`` and ``global_step`` are 0
        when no checkpoint file exists.
    """
    # Local import: `os` is otherwise only imported by a later notebook cell.
    import os

    if model is None:
        model = get_decoder_only_model(config, tokenizer.get_vocab_size())
    model = model.to(device)

    checkpoint_path = config['model_file_path']
    if not os.path.exists(checkpoint_path):
        print("No checkpoint found. Starting training from scratch.")
        return model, 0, 0

    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # produced by this notebook or another trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    global_step = checkpoint.get('global_step', 0)
    print(f"Loaded checkpoint from epoch {epoch}, global_step {global_step}")
    return model, epoch, global_step


### Training loop

In [None]:
!pip install torchmetrics

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
import warnings
import os
import sys
import json
from pathlib import Path
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import torchmetrics

def train(config):
    """Train the decoder-only language model, resuming from a checkpoint if one exists.

    Args:
        config: Run configuration as returned by ``get_config()``. Reads
            ``'logs'``, ``'lr'``, ``'num_epochs'``, ``'seq_len'`` and
            ``'model_file_path'`` here; further keys are read by the helpers.

    After every epoch the model is validated and a checkpoint containing the
    model state, optimizer state, epoch and global step is written to
    ``config['model_file_path']``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)
    if device == 'cuda':
        print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
        print(f"Device memory: {torch.cuda.get_device_properties(device).total_memory / 1024 ** 3:.2f} GB")
    device = torch.device(device)

    train_dataloader, val_dataloader, tokenizer = get_ds(config)

    # Size the model the same way load_model does so checkpoints line up.
    model = get_decoder_only_model(config, tokenizer.get_vocab_size()).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # load_model returns a (model, epoch, step) tuple. It may hand back a
    # different module instance than the one the optimizer tracks, so copy the
    # restored weights into `model` instead of rebinding the name.
    restored_model, initial_epoch, global_step = load_model(config, device, tokenizer, optimizer)
    if restored_model is not model:
        model.load_state_dict(restored_model.state_dict())
    del restored_model

    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    # Additive causal mask shared by every batch: -inf strictly above the
    # diagonal (future positions), 0 elsewhere. nn.MultiheadAttention
    # broadcasts a 2-D mask over batch and heads; every dataset item has
    # length config['seq_len'].
    seq_len = config['seq_len']
    causal_mask = torch.triu(
        torch.full((seq_len, seq_len), float('-inf'), device=device),
        diagonal=1,
    )

    checkpoint_path = Path(config['model_file_path'])
    writer = SummaryWriter(config['logs'])
    try:
        for epoch in range(initial_epoch, config['num_epochs']):
            if device.type == 'cuda':
                torch.cuda.empty_cache()
            model.train()
            batch_iterator = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{config['num_epochs']}")
            total_loss = 0.0
            num_batches = len(train_dataloader)

            for batch in batch_iterator:
                input_ids = batch['input'].to(device)  # (batch_size, seq_len)
                labels = batch['label'].to(device)     # (batch_size, seq_len)

                optimizer.zero_grad(set_to_none=True)

                # Forward pass through the decoder-only model
                logits = model(input_ids, causal_mask)  # (batch_size, seq_len, vocab_size)

                # Flatten logits and labels for the token-level loss
                logits = logits.reshape(-1, logits.size(-1))  # (batch_size * seq_len, vocab_size)
                labels = labels.reshape(-1)                   # (batch_size * seq_len)

                loss = loss_fn(logits, labels)

                loss.backward()
                optimizer.step()

                loss_value = loss.item()
                total_loss += loss_value
                global_step += 1
                writer.add_scalar('Train/Loss', loss_value, global_step)
                batch_iterator.set_postfix({'Loss': loss_value})

            # End of epoch logging (guard against an empty training loader)
            avg_loss = total_loss / max(num_batches, 1)
            writer.add_scalar('Train/Average_Loss', avg_loss, epoch + 1)

            # Validation
            run_validation(model, val_dataloader, tokenizer, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)

            # Save the model checkpoint
            checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step,
            }, str(checkpoint_path))
    finally:
        # Flush and close the event file even if training is interrupted.
        writer.close()

# Entry point of the training cell: silence warnings, build the configuration
# and start training.
if __name__ == '__main__':
    # Ignore every warning category for the rest of the session.
    warnings.filterwarnings("ignore")
    # `config` is bound at notebook (global) scope here, so it stays visible
    # to other cells after this one has run.
    config = get_config()
    train(config)


### TensorBoard

In [None]:
%reload_ext tensorboard

import tensorflow as tf
import tensorboard

log_dir = "/content/drive/MyDrive/Colab Notebooks/T-CLM/T-CLM/"
%tensorboard --logdir {log_dir}