### Connect to Drive

In [1]:
# Mount Google Drive at /content/drive; the paths returned by get_config()
# in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Configuration file

In [2]:
import torch

def get_config():
    """Return the hyper-parameters and file locations used by this notebook.

    A fresh dict is built on every call. The ``device`` entry is ``"cuda"``
    when ``torch.cuda.is_available()`` reports a GPU and ``"cpu"`` otherwise.
    """
    device_name = "cuda" if torch.cuda.is_available() else "cpu"

    settings = dict(
        # Locations on the mounted Drive.
        logs="/content/drive/MyDrive/Colab Notebooks/T-CLM/T-CLM_log",
        # Optimisation settings.
        batch_size=4,
        num_epochs=50,
        lr=1e-5,
        # Model shape.
        seq_len=512,
        d_model=768,
        n_layers=12,
        head=12,
        d_ff=3072,
        dropout=0.1,
        masking_prob=0.15,
        device=device_name,
        model_file_path="/content/drive/MyDrive/Colab Notebooks/T-CLM/T-CLM2.pt",
        tokenizer_file="/content/drive/MyDrive/Colab Notebooks/T-CLM/tokenizer.json",
    )
    return settings

### BPE Tokenizer


In [3]:
from pathlib import Path
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

def get_all_sentences(ds, field):
    """Lazily yield ``record[field]`` for every record of ``ds``, in order."""
    yield from (record[field] for record in ds)

def build_or_get_tokenizer(config, ds):
    """Load the tokenizer stored at ``config['tokenizer_file']`` or train one.

    Args:
        config: mapping that provides the ``'tokenizer_file'`` path.
        ds: iterable of records; each record's ``"text"`` entry is fed to the
            trainer when a new tokenizer has to be built.

    Returns:
        A ``tokenizers.Tokenizer``. When the file does not exist yet, a
        whitespace pre-tokenized BPE model with the special tokens
        ``[UNK]``, ``[PAD]``, ``[SOS]`` and ``[EOS]`` is trained on ``ds`` and
        saved to that path.
    """
    tokenizer_path = Path(config['tokenizer_file'])
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Create the destination directory *before* training so that a missing
    # folder cannot make save() fail after the expensive training step.
    tokenizer_path.parent.mkdir(parents=True, exist_ok=True)

    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=1)
    tokenizer.train_from_iterator(get_all_sentences(ds, "text"), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

### Data Pipeline

In [4]:
import torch
from torch.utils.data import DataLoader, random_split
import json
from torch.utils.data import Dataset

class BilingualDataset(Dataset):
    """Fixed-length next-token-prediction samples built from raw text records.

    Each item is tokenized, truncated to ``seq_len - 2`` tokens, wrapped in
    ``[SOS]`` / ``[EOS]`` and right-padded with ``[PAD]`` to ``seq_len``.

    Args:
        ds: indexable collection of records exposing a ``'text'`` entry.
        tokenizer: object providing ``token_to_id(str)`` and ``encode(str).ids``.
        seq_len: length of every returned sequence; must be at least 2 so the
            ``[SOS]`` and ``[EOS]`` markers fit.

    Raises:
        ValueError: if ``seq_len`` is smaller than 2 or the tokenizer does not
            know one of the special tokens.
    """

    def __init__(self, ds, tokenizer, seq_len):
        if seq_len < 2:
            raise ValueError(f"seq_len must be >= 2 to hold [SOS] and [EOS], got {seq_len}")
        self.seq_len = seq_len
        self.ds = ds
        self.tokenizer = tokenizer
        self.sos_token = self._special_token(tokenizer, "[SOS]")
        self.eos_token = self._special_token(tokenizer, "[EOS]")
        self.pad_token = self._special_token(tokenizer, "[PAD]")

    @staticmethod
    def _special_token(tokenizer, token):
        """Return the id of ``token`` as a 1-element int64 tensor."""
        token_id = tokenizer.token_to_id(token)
        if token_id is None:
            raise ValueError(f"Tokenizer has no id for special token {token}")
        return torch.tensor([token_id], dtype=torch.int64)

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        """Return the model input, label, padding mask and raw text of item ``idx``."""
        text = self.ds[idx]['text']

        # Two positions are reserved for [SOS] and [EOS]; longer texts are truncated.
        input_tokens = self.tokenizer.encode(text).ids[:self.seq_len - 2]
        token_ids = torch.tensor(input_tokens, dtype=torch.int64)
        num_padding_tokens = self.seq_len - len(input_tokens) - 2

        # Tensor.repeat builds the padding run directly instead of converting
        # a Python list of 1-element tensors.
        encoder_input = torch.cat(
            [
                self.sos_token,
                token_ids,
                self.eos_token,
                self.pad_token.repeat(num_padding_tokens),
            ],
            dim=0,
        )

        # Next-token targets: label[i] == encoder_input[i + 1]; one extra
        # [PAD] fills the final position.
        # NOTE(review): padded positions carry the [PAD] id, so a loss computed
        # without ignore_index also scores them - confirm this is intended.
        label = torch.cat(
            [
                token_ids,
                self.eos_token,
                self.pad_token.repeat(num_padding_tokens + 1),
            ],
            dim=0,
        )

        assert encoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,
            "label": label,
            # Shape (1, 1, seq_len); 1 on real tokens, 0 on [PAD].
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            "text": text,
        }


### Load dataset

In [5]:
def get_ds(config):
    """Load the raw JSON dataset and build the train / validation loaders.

    Args:
        config: mapping with ``'seq_len'``, ``'batch_size'`` and the keys read
            by ``build_or_get_tokenizer``. Two optional keys are honoured:
            ``'dataset_file'`` (path of the JSON dataset; defaults to the
            notebook's original Drive location) and ``'split_seed'`` (int
            seed that makes the 95/5 split reproducible, e.g. across resumed
            runs; when absent the split uses the global RNG as before).

    Returns:
        ``(train_dataloader, val_dataloader, tokenizer)``.
    """
    dataset_path = config.get(
        'dataset_file',
        '/content/drive/MyDrive/Colab Notebooks/T-CLM/unlabeled_dataset2.json',
    )
    with open(dataset_path, 'r', encoding='utf-8') as f:
        ds_raw = json.load(f)

    # ds_raw = load_dataset(f"bookcorpus/bookcorpus", f"plain_text", split='train', trust_remote_code=True)

    tokenizer = build_or_get_tokenizer(config, ds_raw)
    train_ds_size = int(0.95 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size

    # Only pass a generator when a seed was configured, so the default
    # behaviour (global RNG) is untouched.
    split_kwargs = {}
    split_seed = config.get('split_seed')
    if split_seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(split_seed)
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size], **split_kwargs)

    train_ds = BilingualDataset(train_ds_raw, tokenizer, config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer, config['seq_len'])

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # The averaged validation loss does not depend on sample order, so the
    # validation loader is not shuffled.
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=False)

    return train_dataloader, val_dataloader, tokenizer

### Transformer model

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class LayerNormalization(nn.Module):
    """Normalize the last dimension, then apply a learned scale and shift.

    ``alpha`` (initialised to ones) multiplies and ``bias`` (initialised to
    zeros) is added; ``eps`` is added to the standard deviation before the
    division.
    """

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features))
        self.bias = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        # Statistics are taken over the feature (last) axis only.
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

class FeedForwardBlock(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)    # expand to the hidden width
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)    # project back to d_model

    def forward(self, x):
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

class InputEmbeddings(nn.Module):
    """Token embedding lookup whose output is multiplied by sqrt(d_model)."""

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        scale = math.sqrt(self.d_model)
        return self.embedding(x) * scale

class PositionalEncoding(nn.Module):
    """Add a fixed sine/cosine position table to the input, then apply dropout.

    The table has shape ``(1, seq_len, d_model)`` and is stored as the buffer
    ``pe``: even feature columns hold sines, odd columns hold cosines.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        positions = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        table = torch.zeros(seq_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        # Leading batch axis lets the table broadcast over the batch.
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        # Use only as many positions as the input sequence has.
        steps = x.shape[1]
        positional = (self.pe[:, :steps, :]).requires_grad_(False)
        return self.dropout(x + positional)

class ResidualConnection(nn.Module):
    """Pre-norm residual wrapper: ``x + dropout(sublayer(norm(x)))``."""

    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention with bias-free projections.

    ``d_model`` is split evenly across ``h`` heads of width ``d_k``; the
    heads are concatenated again and passed through ``w_o``.
    """

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model must be divisible by h"
        self.d_k = d_model // h

        # Query, key, value and output projections, created in this order.
        for proj_name in ("w_q", "w_k", "w_v", "w_o"):
            setattr(self, proj_name, nn.Linear(d_model, d_model, bias=False))
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Return ``(weights @ value, weights)`` for scaled dot-product attention.

        Positions where ``mask == 0`` are filled with ``-1e9`` before the
        softmax; ``mask`` and ``dropout`` may each be ``None``.
        """
        scale = math.sqrt(query.shape[-1])
        weights = (query @ key.transpose(-2, -1)) / scale
        if mask is not None:
            weights = weights.masked_fill(mask == 0, -1e9)
        weights = F.softmax(weights, dim=-1)
        if dropout is not None:
            weights = dropout(weights)
        return weights @ value, weights

    def _split_heads(self, projected, batch_size):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, h, seq, d_k)``."""
        return projected.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        heads_q = self._split_heads(self.w_q(q), batch_size)
        heads_k = self._split_heads(self.w_k(k), batch_size)
        heads_v = self._split_heads(self.w_v(v), batch_size)

        context, _ = self.attention(heads_q, heads_k, heads_v, mask, self.dropout)

        # Merge the heads back into a single d_model-wide feature axis.
        merged = context.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.w_o(merged)

class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention followed by a feed-forward
    block, each wrapped in its own ``ResidualConnection``."""

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0 wraps the attention sub-layer, index 1 the feed-forward one.
        self.residual_connections = nn.ModuleList(
            ResidualConnection(features, dropout) for _ in range(2)
        )

    def forward(self, x, tgt_mask):
        attn_residual, ff_residual = self.residual_connections

        def attend(normed):
            # Self-attention: the same tensor is used as query, key and value.
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        attended = attn_residual(x, attend)
        return ff_residual(attended, self.feed_forward_block)

class Decoder(nn.Module):
    """Run the input through every layer in order, then apply the final norm."""

    def __init__(self, layers: nn.ModuleList, norm_layer: LayerNormalization) -> None:
        super().__init__()
        self.layers = layers
        self.norm = norm_layer

    def forward(self, x, tgt_mask):
        hidden = x
        for block in self.layers:
            # The same mask is handed to every layer.
            hidden = block(hidden, tgt_mask)
        return self.norm(hidden)

class ProjectionLayer(nn.Module):
    """Linear map from ``d_model`` features to ``vocab_size`` scores.

    No softmax is applied; the raw linear output is returned.
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        vocab_scores = self.proj(x)
        return vocab_scores

class TCLM(nn.Module):
    """Decoder-only transformer language model.

    Token ids are embedded, given positional encodings, passed through ``N``
    causally masked decoder blocks and projected to vocabulary logits.

    Args:
        vocab_size: number of token ids.
        seq_len: maximum sequence length of the positional encoding table.
        d_model: width of the embeddings and hidden states.
        N: number of decoder blocks.
        h: number of attention heads per block.
        dropout: dropout probability used throughout.
        d_ff: hidden width of the feed-forward blocks.
    """

    def __init__(self, vocab_size: int, seq_len: int, d_model: int, N: int, h: int, dropout: float, d_ff: int):
        super().__init__()
        self.input_embed = InputEmbeddings(d_model, vocab_size)
        self.pos_embed = PositionalEncoding(d_model, seq_len, dropout)

        # Decoder blocks with multi-head attention and feed-forward.
        # NOTE: the same ModuleList is registered here and again inside
        # ``self.decoder``, so state_dict() lists these weights under both
        # ``layers.*`` and ``decoder.layers.*``. Kept as-is so that existing
        # checkpoints keep loading.
        self.layers = nn.ModuleList([
            DecoderBlock(
                features=d_model,
                self_attention_block=MultiHeadAttentionBlock(d_model, h, dropout),
                feed_forward_block=FeedForwardBlock(d_model, d_ff, dropout),
                dropout=dropout
            ) for _ in range(N)
        ])

        self.decoder = Decoder(self.layers, LayerNormalization(d_model))
        self.projection_layer = ProjectionLayer(d_model, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the training loss.

        Args:
            idx: integer tensor of token ids, shape ``(B, T)``.
            targets: optional integer tensor of shape ``(B, T)`` that is
                already aligned with ``idx``: ``targets[:, t]`` is the token
                that follows ``idx[:, t]``. The ``label`` tensors produced by
                ``BilingualDataset`` in this notebook have exactly this form,
                so no further shift is applied here (shifting again would
                train the model to predict the token two positions ahead).

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(B, T, vocab_size)`` and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to ``(B * T, vocab_size)`` and ``loss`` is
            the mean cross-entropy over all positions.
        """
        _, T = idx.shape
        x = self.pos_embed(self.input_embed(idx))

        # Lower-triangular mask: position t may only attend to positions <= t.
        tgt_mask = torch.tril(torch.ones((T, T), device=idx.device)).unsqueeze(0).unsqueeze(0)
        x = self.decoder(x, tgt_mask)

        logits = self.projection_layer(x)
        if targets is None:
            return logits, None

        logits = logits.reshape(-1, logits.size(-1))
        # NOTE(review): no ignore_index is passed, so positions whose target
        # is the padding id also contribute to the loss - confirm intended.
        loss = F.cross_entropy(logits, targets.reshape(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx: torch.Tensor, max_new_tokens: int, seq_len: int):
        """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

        Sampling runs in eval mode (dropout disabled) and without gradient
        tracking; the previous train/eval mode is restored afterwards.

        Args:
            idx: integer tensor of prompt token ids, shape ``(B, T0)``.
            max_new_tokens: number of tokens to append.
            seq_len: context window; only the last ``seq_len`` tokens are fed
                to the model at each step.

        Returns:
            Integer tensor of shape ``(B, T0 + max_new_tokens)``.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                idx_crop = idx[:, -seq_len:]
                logits, _ = self(idx_crop)
                # Distribution over the vocabulary for the last position only.
                probs = F.softmax(logits[:, -1, :], dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)

        return idx


In [7]:
def get_weights_file_path(config):
    """Return the configured checkpoint path if it points to an existing file.

    Args:
        config: mapping that may contain ``'model_file_path'``.

    Returns:
        The path as ``str`` when it names an existing regular file, otherwise
        ``None`` (key missing, empty value, missing file, or a directory).
    """
    model_file_path = config.get('model_file_path', '')
    # An empty path would resolve to the current directory, which "exists";
    # a checkpoint must be a real file.
    if model_file_path and Path(model_file_path).is_file():
        return str(model_file_path)
    return None

### Preload model

In [8]:
def load_model(config, device, model, tokenizer, optimizer):
    """Move ``model`` to ``device`` and restore a checkpoint when one is found.

    The checkpoint location comes from ``get_weights_file_path(config)``.
    When present, the model weights, optimizer state, epoch and global step
    are restored from it. ``tokenizer`` is accepted but not used here.

    Returns:
        ``(model, initial_epoch, global_step)``; both counters are 0 when no
        checkpoint was loaded, otherwise ``initial_epoch`` is the stored
        epoch plus one.
    """
    model = model.to(device)
    checkpoint_path = get_weights_file_path(config)

    # Guard clause: nothing to restore, start from scratch.
    if not checkpoint_path:
        print("No model file found.")
        return model, 0, 0

    print(f'Loading model from {checkpoint_path}')
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=True)
    model.load_state_dict(checkpoint['model_state_dict'])
    next_epoch = checkpoint['epoch'] + 1    # resume after the saved epoch
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    steps_done = checkpoint['global_step']

    return model, next_epoch, steps_done

### Training loop

In [13]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
import warnings
import os
import sys
import json
from pathlib import Path
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

def train(config):
    """Train a TCLM model, checkpointing and validating after every epoch.

    Args:
        config: mapping as returned by ``get_config()``. The optional key
            ``'checkpoint_dir'`` selects where per-epoch checkpoints are
            written; it defaults to the notebook's original Drive folder.

    Raises:
        ValueError: if the training dataloader yields no batches.

    Side effects:
        Writes TensorBoard scalars to ``config['logs']`` and saves
        ``model_epoch_{epoch}.pt`` to the checkpoint directory each epoch.
    """
    # Convert to torch.device first so that .type is available below.
    device = torch.device(config['device'])
    print("Using device:", device)
    if device.type == 'cuda':
        print(f"Device name: {torch.cuda.get_device_name(device)}")
        print(f"Device memory: {round(torch.cuda.get_device_properties(device).total_memory / 1024 ** 3, 1)} GB")

    train_dataloader, val_dataloader, tokenizer = get_ds(config)
    num_batches = len(train_dataloader)
    if num_batches == 0:
        raise ValueError("Training dataloader is empty; nothing to train on.")

    model = TCLM(vocab_size=tokenizer.get_vocab_size(), seq_len=config['seq_len'], d_model=config['d_model'], N=config['n_layers'], h=config['head'], dropout=config['dropout'], d_ff=config['d_ff'])
    writer = SummaryWriter(config['logs'])
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)
    model, initial_epoch, global_step = load_model(config, device, model, tokenizer, optimizer)

    # NOTE(review): load_model() resumes from config['model_file_path'], while
    # checkpoints are written as model_epoch_{epoch}.pt below; to resume, point
    # 'model_file_path' at one of these per-epoch files.
    checkpoint_dir = Path(config.get('checkpoint_dir', '/content/drive/MyDrive/Colab Notebooks/T-CLM'))
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    try:
        for epoch in range(initial_epoch, config['num_epochs']):
            if device.type == 'cuda':
                torch.cuda.empty_cache()
            model.train()
            batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
            total_loss = 0.0

            for batch in batch_iterator:
                encoder_input = batch['encoder_input'].to(device)
                targets = batch['label'].to(device)

                optimizer.zero_grad()  # Reset gradients
                _, loss = model(encoder_input, targets=targets)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
                optimizer.step()

                # Read the scalar once per step; each .item() call forces a
                # device-to-host synchronisation.
                loss_value = loss.item()
                total_loss += loss_value

                global_step += 1
                writer.add_scalar('train/batch_loss', loss_value, global_step)
                batch_iterator.set_postfix({'Loss': loss_value})

            # Epoch-level logging
            avg_loss = total_loss / num_batches
            print(f"Epoch {epoch} | Avg Loss: {avg_loss}")
            writer.add_scalar('train/average_loss', avg_loss, epoch)
            writer.flush()

            # Model checkpointing
            model_filename = str(checkpoint_dir / f"model_epoch_{epoch}.pt")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step,
            }, model_filename)

            validate(model, val_dataloader, device, writer, epoch)
    finally:
        # Closing flushes pending events even when training is interrupted.
        writer.close()

def validate(model, val_dataloader, device, writer, epoch):
    """Compute and log the mean validation loss for one epoch.

    Args:
        model: module called as ``model(encoder_input, targets=...)`` and
            returning ``(logits, loss)``; it is switched to eval mode.
        val_dataloader: sized iterable of batches holding ``'encoder_input'``
            and ``'label'`` tensors.
        device: device the batch tensors are moved to.
        writer: object with ``add_scalar(tag, value, step)``.
        epoch: epoch index used as the logging step.

    Returns:
        The average loss as ``float``, or ``None`` when the loader is empty
        (nothing is logged in that case).
    """
    model.eval()
    num_batches = len(val_dataloader)
    if num_batches == 0:
        print(f"Validation skipped (Epoch {epoch}): validation set is empty")
        return None

    total_val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            encoder_input = batch['encoder_input'].to(device)
            targets = batch['label'].to(device)
            # The model's keyword is ``targets`` (plural).
            _, val_loss = model(encoder_input, targets=targets)
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / num_batches
    print(f"Validation Loss (Epoch {epoch}): {avg_val_loss}")
    writer.add_scalar('validation/avg_loss', avg_val_loss, epoch)
    return avg_val_loss


# Entry point of the training cell: silence every warning category, build the
# configuration dict and start training.
if __name__ == '__main__':
    # Blanket filter: all warnings are suppressed from this point on.
    warnings.filterwarnings("ignore")
    config = get_config()
    train(config)


Using device: cuda
Device name: Tesla T4
Device memory: 14.7 GB
No model file found.


Processing Epoch 00:   1%|▏         | 46/3325 [00:27<33:03,  1.65it/s, Loss=10.6]


KeyboardInterrupt: 