In [7]:
!pip install torchtext==0.17.0 torch==2.2.0



In [8]:
!pip install datasets
# main.py loads both spaCy pipelines (en_core_web_sm and de_core_news_sm),
# so fetch the English model as well as the German one.
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [9]:
%%writefile main.py
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
from tqdm import trange
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import time
import os
import json

from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.multiprocessing as mp
from filelock import FileLock



# Single seed shared by every random number generator seeded below.
SEED = 1234
# Base directory used for RESULTS_FILE and as the default `root` argument of
# create_dataloader / main_train.
ROOT = "."

# Seed Python's `random`, NumPy, and PyTorch (CPU and CUDA generators).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN for deterministic kernels.
torch.backends.cudnn.deterministic = True


def setup(rank, world_size):
    if world_size > 1:
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12345'
        dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    """Destroy the default process group, if one has been initialised."""
    if not dist.is_initialized():
        # Single-process run (or setup was never called): nothing to tear down.
        return
    dist.destroy_process_group()

# Load the "bentrevett/multi30k" dataset by name.
# NOTE(review): this and the preprocessing below run at import time, so
# presumably every process started by mp.spawn repeats the work — the repeated
# "Map" progress bars in the run output are consistent with that. Confirm.
dataset = datasets.load_dataset("bentrevett/multi30k")

# The three splits used for training, model selection and final evaluation.
train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

# spaCy pipelines for English and German; only their `.tokenizer` is used
# (see tokenize_example).
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenise the English and German sentences of one example.

    Each sentence is split with the matching pipeline's ``tokenizer``, cut to
    at most ``max_length`` tokens, optionally lower-cased, and finally wrapped
    in ``sos_token`` / ``eos_token`` (the markers do not count towards
    ``max_length``).

    Returns:
        A dict with the keys ``"en_tokens"`` and ``"de_tokens"``.
    """

    def _prepare(nlp, sentence):
        words = [piece.text for piece in nlp.tokenizer(sentence)][:max_length]
        if lower:
            words = [word.lower() for word in words]
        return [sos_token, *words, eos_token]

    return {
        "en_tokens": _prepare(en_nlp, example["en"]),
        "de_tokens": _prepare(de_nlp, example["de"]),
    }

# Tokenisation settings passed to tokenize_example.
max_length = 1_000  # cap on tokens kept per sentence (before <sos>/<eos> are added)
lower = True  # lower-case every token
sos_token = "<sos>"
eos_token = "<eos>"

# Keyword arguments forwarded to tokenize_example by `Dataset.map`.
fn_kwargs = {
    "en_nlp": en_nlp,
    "de_nlp": de_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}

# Add the "en_tokens" / "de_tokens" columns to every split.
train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

# Vocabulary settings: `min_freq` is passed straight to
# build_vocab_from_iterator; the special tokens are listed explicitly.
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

# Both vocabularies are built from the training split only.
en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

de_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["de_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

# The <unk> and <pad> indices must agree across languages because a single
# unk_index / pad_index is shared below (collate padding, loss ignore_index).
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

# Use the <unk> index as the default index of both vocabularies
# (presumably what out-of-vocabulary lookups return — see torchtext docs).
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

def numericalize_example(example, en_vocab, de_vocab):
    """Map the token lists of one example to vocabulary indices.

    Returns:
        A dict with ``"en_ids"`` and ``"de_ids"``, obtained by calling
        ``lookup_indices`` on the respective vocabulary.
    """
    return {
        "en_ids": en_vocab.lookup_indices(example["en_tokens"]),
        "de_ids": de_vocab.lookup_indices(example["de_tokens"]),
    }

# Keyword arguments forwarded to numericalize_example by `Dataset.map`.
fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

# Add the "en_ids" / "de_ids" columns to every split.
train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

# Request the "torch" format for the id columns; output_all_columns=True keeps
# the remaining columns in each returned example as well.
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_data = train_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

test_data = test_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

def get_collate_fn(pad_index):
    """Build a DataLoader ``collate_fn`` that pads id sequences with ``pad_index``.

    The returned function takes a list of examples, gathers their ``"en_ids"``
    and ``"de_ids"`` tensors and pads each group with
    ``nn.utils.rnn.pad_sequence`` (default layout, i.e. sequence dimension
    first). It returns a dict holding only those two padded tensors.
    """

    def collate_fn(batch):
        padded = {}
        for column in ("en_ids", "de_ids"):
            sequences = [example[column] for example in batch]
            padded[column] = nn.utils.rnn.pad_sequence(
                sequences, padding_value=pad_index
            )
        return padded

    return collate_fn



# NOTE(review): this module-level value is not read by any function in this
# file — create_dataloader/train/evaluate take `batch_size` as a parameter and
# main_train defaults it to 32. Confirm whether 128 was the intended setting.
batch_size = 128


class Encoder(nn.Module):
    """Embedding + single-layer bidirectional GRU encoder.

    ``forward`` returns the GRU outputs for every source position together
    with an initial decoder state. That state is the concatenation of the last
    two slices of the GRU's final hidden state (the two directions), passed
    through a linear layer and ``tanh``.
    """

    def __init__(
        self, input_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, dropout
    ):
        super().__init__()
        both_directions = 2 * encoder_hidden_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, encoder_hidden_dim, bidirectional=True)
        self.fc = nn.Linear(both_directions, decoder_hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` (token ids, sequence dimension first)."""
        token_vectors = self.dropout(self.embedding(src))
        outputs, final_states = self.rnn(token_vectors)
        # Join the final state of each direction along the feature axis.
        second_last, last = final_states[-2], final_states[-1]
        merged = torch.cat([second_last, last], dim=1)
        return outputs, torch.tanh(self.fc(merged))

class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    The decoder state is repeated for every source position, concatenated
    with the encoder outputs, passed through ``attn_fc`` + ``tanh``, reduced
    to one score per position by ``v_fc``, and normalised with a softmax over
    the source positions.
    """

    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        super().__init__()
        combined_dim = (encoder_hidden_dim * 2) + decoder_hidden_dim
        self.attn_fc = nn.Linear(combined_dim, decoder_hidden_dim)
        self.v_fc = nn.Linear(decoder_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape (batch, source length)."""
        src_length = encoder_outputs.shape[0]
        # One copy of the decoder state per source position.
        tiled_state = hidden.unsqueeze(1).repeat(1, src_length, 1)
        # Put the batch dimension first so both tensors line up.
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)
        features = torch.cat((tiled_state, batch_first_outputs), dim=2)
        energy = torch.tanh(self.attn_fc(features))
        scores = self.v_fc(energy).squeeze(2)
        return torch.softmax(scores, dim=1)

class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Each call consumes one target token per batch element and returns the
    vocabulary scores for the next token, the updated hidden state and the
    attention weights that were used.
    """

    def __init__(
        self,
        output_dim,
        embedding_dim,
        encoder_hidden_dim,
        decoder_hidden_dim,
        dropout,
        attention,
    ):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.GRU((encoder_hidden_dim * 2) + embedding_dim, decoder_hidden_dim)
        self.fc_out = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim + embedding_dim, output_dim
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: Current target token ids, one per batch element.
            hidden: Previous decoder hidden state, (batch, decoder_hidden_dim).
            encoder_outputs: Encoder outputs, sequence dimension first.

        Returns:
            ``(prediction, hidden, attention)`` — scores over the output
            vocabulary, the new hidden state, and the attention weights.
        """
        # Add a length-1 sequence dimension so the GRU sees a single step.
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # Attention-weighted sum of the encoder outputs (the context vector).
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        # For a one-step, single-layer, unidirectional GRU `output` and
        # `hidden` are the same tensor by construction. The former
        # `assert (output == hidden).all()` only re-checked that identity,
        # but on CUDA it forced a host-device synchronisation at every decoding
        # step, which distorts the throughput this script measures.
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden.squeeze(0), a.squeeze(1)

class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        # Kept so existing callers can still read `model.device`; forward()
        # no longer relies on it (see below).
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        """Decode ``trg`` conditioned on ``src``.

        Args:
            src: Source token ids, sequence dimension first.
            trg: Target token ids, sequence dimension first; ``trg[0]`` is
                fed as the first decoder input.
            teacher_forcing_ratio: Probability, drawn once per time step, of
                feeding the ground-truth token instead of the model's own
                arg-max prediction.

        Returns:
            Tensor of shape (target length, batch, output vocabulary size).
            Row 0 is left at zero because decoding starts at step 1.
        """
        batch_size = src.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # Allocate on the device the inputs actually live on. The stored
        # `self.device` can be stale: create_model builds the model on a bare
        # "cuda" device and main_train then moves it with `.to(rank)`, so on
        # rank > 0 the old `.to(self.device)` could place this buffer on a
        # different GPU than the decoder outputs written into it.
        outputs = torch.zeros(
            trg_length, batch_size, trg_vocab_size, device=src.device
        )
        encoder_outputs, hidden = self.encoder(src)
        input = trg[0, :]
        for t in range(1, trg_length):
            output, hidden, _ = self.decoder(input, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

def create_model(de_vocab,en_vocab):
    """Build the German-to-English Seq2Seq model and initialise its parameters.

    The source vocabulary size comes from ``de_vocab`` and the target size
    from ``en_vocab``. The model is placed on "cuda" when available, else
    "cpu". After placement, every parameter whose name contains "weight" is
    drawn from N(0, 0.01) and every other parameter is set to 0.
    """
    source_vocab_size = len(de_vocab)
    target_vocab_size = len(en_vocab)
    embedding_size = 256  # shared by encoder and decoder
    hidden_size = 512  # shared by encoder and decoder
    dropout_rate = 0.5  # shared by encoder and decoder
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def init_weights(m):
        # Applied through `model.apply`, so it runs once per sub-module and
        # once for the whole model; the last pass determines the final values.
        for name, param in m.named_parameters():
            if "weight" in name:
                nn.init.normal_(param.data, mean=0, std=0.01)
            else:
                nn.init.constant_(param.data, 0)

    # Construction order (attention, encoder, decoder) is kept as-is so the
    # random number stream consumed by default initialisation is unchanged.
    attention = Attention(hidden_size, hidden_size)
    encoder = Encoder(
        source_vocab_size, embedding_size, hidden_size, hidden_size, dropout_rate
    )
    decoder = Decoder(
        target_vocab_size,
        embedding_size,
        hidden_size,
        hidden_size,
        dropout_rate,
        attention,
    )

    model = Seq2Seq(encoder, decoder, device).to(device)
    model.apply(init_weights)
    return model

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (seconds) into minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated towards zero.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

def create_dataloader(rank, world_size, batch_size, root=ROOT):
    """Create the train / validation / test DataLoaders.

    With ``world_size > 1`` each loader gets a ``DistributedSampler`` so every
    process sees its own shard; otherwise the loaders iterate the full split.
    Only the training data is shuffled. The validation and test loaders keep
    dataset order in both modes. Previously they were shuffled in the
    single-process path, which was inconsistent with the distributed path's
    ``shuffle=False`` and made evaluation batches depend on the RNG state.

    Args:
        rank: Index of the current process (used by the sampler).
        world_size: Number of participating processes.
        batch_size: Per-process batch size.
        root: Unused; kept so existing callers keep working.

    Returns:
        ``(train_loader, val_loader, test_loader)``.
    """
    collate_fn = get_collate_fn(pad_index)
    distributed = world_size > 1

    def _build(split, shuffle):
        sampler = (
            DistributedSampler(
                split, num_replicas=world_size, rank=rank, shuffle=shuffle
            )
            if distributed
            else None
        )
        return torch.utils.data.DataLoader(
            dataset=split,
            batch_size=batch_size,
            collate_fn=collate_fn,
            # DataLoader rejects shuffle=True together with a sampler, so the
            # sampler owns shuffling whenever one is present.
            shuffle=shuffle and sampler is None,
            sampler=sampler,
        )

    train_loader = _build(train_data, shuffle=True)
    val_loader = _build(valid_data, shuffle=False)
    test_loader = _build(test_data, shuffle=False)
    return train_loader, val_loader, test_loader

# JSON file that collects the results of every process, keyed by scenario.
RESULTS_FILE = f"{ROOT}/project4-2gpus.json"

def log_results(scenario, results, rank):
    """Store ``results`` under ``scenario`` in the shared JSON results file.

    The read-modify-write cycle runs under a file lock so several processes
    can log into the same file. A missing file, or one that does not contain
    valid JSON, is treated as empty. The stored entry is a copy of ``results``
    with a ``'rank'`` key added; the caller's dict is left untouched
    (previously it was modified in place).

    Args:
        scenario: Key under which the entry is stored; an existing entry with
            the same key is replaced.
        results: JSON-serialisable dict of metrics.
        rank: Rank of the calling process, recorded in the entry.
    """
    lock = FileLock(f"{RESULTS_FILE}.lock")
    with lock:
        all_results = {}
        if os.path.exists(RESULTS_FILE):
            with open(RESULTS_FILE, 'r') as f:
                try:
                    all_results = json.load(f)
                except json.JSONDecodeError:
                    # Unreadable file: start over instead of failing the run.
                    all_results = {}

        # Copy instead of mutating the caller's dict.
        all_results[scenario] = {**results, 'rank': rank}
        with open(RESULTS_FILE, 'w') as f:
            json.dump(all_results, f, indent=4)

def calculate_accuracy(y_pred, y):
    """Return the fraction of rows of ``y_pred`` whose arg-max equals ``y``.

    The result is a 0-dim float tensor: number of matches divided by
    ``y.shape[0]``.
    """
    predicted = y_pred.argmax(dim=1, keepdim=True)
    matches = (predicted == y.view_as(predicted)).sum()
    return matches.float() / y.shape[0]

def train(model, iterator, optimizer, criterion, clip, teacher_forcing_ratio, device, batch_size):
    """Run one training epoch.

    For every batch, the German ids are the source and the English ids the
    target. The first target position is dropped from both the model output
    and the target before the loss is computed, and gradients are clipped to
    norm ``clip`` before the optimizer step.

    Args:
        batch_size: Unused; the sample count is read from each target batch.

    Returns:
        ``(mean loss per batch, samples processed per second)``.
    """
    model.train()
    running_loss = 0
    samples_seen = 0
    started = time.monotonic()

    for batch in iterator:
        src = batch["de_ids"].to(device)
        trg = batch["en_ids"].to(device)
        samples_seen += trg.shape[1]

        optimizer.zero_grad()
        logits = model(src, trg, teacher_forcing_ratio)
        vocab_size = logits.shape[-1]
        # Skip position 0, then flatten for the criterion.
        flat_logits = logits[1:].view(-1, vocab_size).to(device)
        flat_targets = trg[1:].view(-1).to(device)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += loss.item()

    elapsed = time.monotonic() - started
    samples_per_second = samples_seen / elapsed
    return running_loss / len(iterator), samples_per_second


def evaluate(model, iterator, criterion, device, batch_size):
    """Compute the loss over ``iterator`` without updating the model.

    The model is switched to eval mode, gradients are disabled and teacher
    forcing is turned off (ratio 0). As in ``train``, the first target
    position is dropped before the loss is computed.

    Args:
        batch_size: Unused; the sample count is read from each target batch.

    Returns:
        ``(mean loss per batch, samples processed per second)``.
    """
    model.eval()
    running_loss = 0
    samples_seen = 0
    started = time.monotonic()

    with torch.no_grad():
        for batch in iterator:
            src = batch["de_ids"].to(device)
            trg = batch["en_ids"].to(device)
            samples_seen += trg.shape[1]

            logits = model(src, trg, 0)
            vocab_size = logits.shape[-1]
            flat_logits = logits[1:].view(-1, vocab_size).to(device)
            flat_targets = trg[1:].view(-1).to(device)

            running_loss += criterion(flat_logits, flat_targets).item()

    elapsed = time.monotonic() - started
    samples_per_second = samples_seen / elapsed
    return running_loss / len(iterator), samples_per_second

def main_train(rank, world_size, root=ROOT, num_epochs=3, batch_size=32):
    """Train, validate and test the model on one process and log its metrics.

    This is the per-process entry point: it is called directly for a single
    GPU and through ``mp.spawn`` (which supplies ``rank``) for several.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Number of participating processes. Values above 1 enable
            the process group and wrap the model in DistributedDataParallel.
        root: Directory the best checkpoint is written to.
        num_epochs: Number of epochs to run.
        batch_size: Per-process batch size.
    """
    if world_size > 1:
        setup(rank, world_size)
        print(f"Process {rank} initialized.")

    train_dataloader, val_dataloader, test_dataloader = create_dataloader(rank, world_size, batch_size=batch_size, root=root)
    model = create_model(de_vocab,en_vocab).to(rank)
    ddp_model = DDP(model, device_ids=[rank]) if world_size > 1 else model
    LR = 5e-4
    clip = 1.0
    teacher_forcing_ratio = 0.5
    criterion = nn.CrossEntropyLoss(ignore_index=pad_index).to(rank)
    optimizer = optim.Adam(ddp_model.parameters(), lr=LR)
    best_valid_loss = float('inf')
    # The old f'{root}tut4-model.pt' had no path separator, so with root="."
    # the checkpoint ended up as the hidden file ".tut4-model.pt".
    checkpoint_path = os.path.join(root, 'tut4-model.pt')
    training_times = []
    train_losses = []
    validation_times = []
    validation_losses = []
    epoch_times = []
    train_throughputs = []
    validation_throughputs = []
    test_throughputs = []
    test_losses = []

    for epoch in trange(num_epochs, desc="Epochs", leave = False):
        start_epoch_time = time.monotonic()

        # A DistributedSampler derives its shuffle from (seed, epoch); without
        # set_epoch every epoch would iterate the data in the same order.
        if isinstance(train_dataloader.sampler, DistributedSampler):
            train_dataloader.sampler.set_epoch(epoch)

        start_time = time.monotonic()
        train_loss, train_throughput = train(ddp_model, train_dataloader, optimizer, criterion, clip, teacher_forcing_ratio, rank, batch_size)
        train_time = time.monotonic() - start_time
        training_times.append(train_time)
        train_losses.append(train_loss)
        train_throughputs.append(train_throughput)

        start_time = time.monotonic()
        valid_loss,  valid_throughput = evaluate(ddp_model, val_dataloader, criterion, rank, batch_size)
        val_time = time.monotonic() - start_time
        validation_times.append(val_time)
        validation_losses.append(valid_loss)
        validation_throughputs.append(valid_throughput)

        # Only rank 0 writes the checkpoint, so processes do not race on the file.
        if valid_loss < best_valid_loss and rank==0:
            best_valid_loss = valid_loss
            torch.save(ddp_model.state_dict(), checkpoint_path)

        test_loss, test_throughput = evaluate(ddp_model, test_dataloader, criterion, rank, batch_size)
        test_losses.append(test_loss)
        test_throughputs.append(test_throughput)


        end_time = time.monotonic()
        epoch_mins, epoch_secs = epoch_time(start_epoch_time, end_time)
        epoch_times.append(end_time - start_epoch_time)

        print(f'--------------|     On process {rank}      |----------------')
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f}')
        print(f'\t Test Loss: {test_loss:.3f}')


    results = {
        "world_size": world_size,
        "training_times": training_times,
        "train_losses": train_losses,
        "validation_times": validation_times,
        "validation_losses": validation_losses,
         "test_losses": test_losses,
        "epoch_times": epoch_times,
        "train_throughputs": train_throughputs,
        "validation_throughputs": validation_throughputs,
        "test_throughputs": test_throughputs,
    }

    model_name = "seq2seq"
    scenario = f"{model_name}_{world_size}_GPUs_rank_{rank}"  # Create unique scenario name
    log_results(scenario, results, rank)

    if world_size > 1:
        # Wait until every rank has logged before tearing the group down.
        dist.barrier()
        cleanup()
    print(f'Process {rank} finished training.')

if __name__ == "__main__":
    def main():
        """Detect the available GPUs and launch one training process per GPU."""
        world_size = torch.cuda.device_count()
        print(f'Total number of devices detected: {world_size}')

        if world_size <= 0:
            print('no GPUs found. Please make sure you have configured CUDA correctly')
            return

        # To force a single-GPU run on a multi-GPU machine, decrement
        # world_size here (this line was previously left commented out).
        if world_size == 1:
            main_train(rank=0, world_size=1)
            return

        # mp.spawn passes the process index as the first argument (rank).
        mp.spawn(
            main_train,
            args=(world_size, ROOT),
            nprocs=world_size,
            join=True,
        )

    main()

Overwriting main.py


In [10]:
!python main.py

Map: 100%|█████████████████████████| 1014/1014 [00:00<00:00, 2692.77 examples/s]
Map: 100%|█████████████████████████| 1000/1000 [00:00<00:00, 3678.72 examples/s]
Map: 100%|█████████████████████████| 1014/1014 [00:00<00:00, 8779.39 examples/s]
Map: 100%|█████████████████████████| 1000/1000 [00:00<00:00, 9274.18 examples/s]
Total number of devices detected: 2
Map: 100%|███████████████████████| 29000/29000 [00:06<00:00, 4632.28 examples/s]
Map: 100%|███████████████████████| 29000/29000 [00:06<00:00, 4594.35 examples/s]
Map: 100%|█████████████████████████| 1014/1014 [00:00<00:00, 4603.32 examples/s]
Map: 100%|█████████████████████████| 1000/1000 [00:00<00:00, 5253.50 examples/s]
Map: 100%|█████████████████████████| 1000/1000 [00:00<00:00, 5201.37 examples/s]
Map: 100%|███████████████████████| 29000/29000 [00:03<00:00, 8398.76 examples/s]
Map: 100%|█████████████████████████| 1014/1014 [00:00<00:00, 4735.96 examples/s]
Map: 100%|███████████████████████| 29000/29000 [00:03<00:00, 7805.43 exam