<a href="https://colab.research.google.com/github/KiranVarghese25/LLM_LabFIles/blob/main/Lab6_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Installing necessary packages
!pip install torch torchvision torchaudio


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
#Libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np


# Data Preparation

In [None]:
def tokenize(text, vocab=None):
    """Lower-case ``text``, split it on whitespace and optionally map to ids.

    Args:
        text: Sentence to split.
        vocab: Optional token -> id mapping. When it is ``None`` (or empty)
            the raw string tokens are returned unchanged.

    Returns:
        A list of string tokens, or a list of ids when ``vocab`` is given.
        Tokens missing from ``vocab`` are mapped to ``vocab["<unk>"]``.
    """
    words = text.lower().split()
    if not vocab:
        return words

    ids = []
    for word in words:
        # The "<unk>" entry is only looked up when a word is actually missing.
        key = word if word in vocab else "<unk>"
        ids.append(vocab[key])
    return ids

def build_vocab(sentences):
    """Build a token -> id mapping from an iterable of sentences.

    Ids 0-3 are reserved for the special tokens ``<pad>``, ``<sos>``,
    ``<eos>`` and ``<unk>``. Every other lower-cased, whitespace-separated
    word receives the next free id in order of first appearance.

    Args:
        sentences: Iterable of strings.

    Returns:
        dict mapping each token to its integer id.
    """
    specials = ("<pad>", "<sos>", "<eos>", "<unk>")
    vocab = dict(zip(specials, range(len(specials))))
    for sentence in sentences:
        for word in sentence.lower().split():
            # setdefault leaves known words alone; len(vocab) is the next id.
            vocab.setdefault(word, len(vocab))
    return vocab


# Dataset Creation

In [None]:
class TranslationDataset(Dataset):
    """Parallel corpus that yields (source ids, target ids) tensor pairs.

    Target sequences are wrapped in ``<sos>`` ... ``<eos>`` so that the
    decoder is trained to start from ``<sos>`` and to emit ``<eos>`` when the
    sentence is finished. Greedy decoding in this notebook seeds the decoder
    with ``<sos>`` and stops on ``<eos>``, so both must appear in training.

    Args:
        src_sentences: Sequence of source-language sentences.
        trg_sentences: Sequence of target-language sentences, aligned by
            index with ``src_sentences``.
        src_vocab: token -> id mapping for the source language.
        trg_vocab: token -> id mapping for the target language; must contain
            the ``<sos>`` and ``<eos>`` keys.
    """

    def __init__(self, src_sentences, trg_sentences, src_vocab, trg_vocab):
        self.src_sentences = src_sentences
        self.trg_sentences = trg_sentences
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

    def __len__(self):
        """Number of sentence pairs (taken from the source side)."""
        return len(self.src_sentences)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D ``torch.long`` tensors."""
        src = tokenize(self.src_sentences[idx], self.src_vocab)
        trg = tokenize(self.trg_sentences[idx], self.trg_vocab)
        # Without these markers the model never observes <sos>/<eos>, so it
        # can neither be started correctly nor learn when to stop.
        trg = [self.trg_vocab["<sos>"], *trg, self.trg_vocab["<eos>"]]
        return torch.tensor(src, dtype=torch.long), torch.tensor(trg, dtype=torch.long)

# Toy parallel corpus: sentence i of the source list translates to
# sentence i of the target list.
src_sentences = ["I am a student", "You are a teacher"]
trg_sentences = ["Je suis un étudiant", "Vous êtes un enseignant"]

# Each language gets its own vocabulary (and therefore its own id space).
src_vocab = build_vocab(src_sentences)
trg_vocab = build_vocab(trg_sentences)

dataset = TranslationDataset(src_sentences, trg_sentences, src_vocab, trg_vocab)
# No DataLoader is created here. An identity collate function would hand the
# training loop a list of (src, trg) tuples instead of padded batch tensors.
# The loader is built further down, together with the padding collate_fn.


# Model Creation

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Wraps ``nn.Transformer`` with token embeddings, a learned positional
    table and an output projection onto the target vocabulary.

    Args:
        src_vocab_size: Number of entries in the source vocabulary.
        trg_vocab_size: Number of entries in the target vocabulary.
        d_model: Embedding / hidden width.
        nhead: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Width of the position-wise feed-forward sublayer.
        dropout: Dropout probability inside ``nn.Transformer``.
        max_len: Longest sequence (source or target) the positional table
            supports.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, max_len=100):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        # Learned positional table shared by encoder and decoder inputs.
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, d_model))
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
        self.fc_out = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg):
        """Compute target-vocabulary logits.

        Args:
            src: ``[batch, src_len]`` tensor of source token ids.
            trg: ``[batch, trg_len]`` tensor of decoder-input token ids.

        Returns:
            ``[trg_len, batch, trg_vocab_size]`` logits (sequence-first).

        Raises:
            ValueError: if either sequence is longer than the positional
                table.
        """
        src_len, trg_len = src.size(1), trg.size(1)
        max_len = self.positional_encoding.size(1)
        if src_len > max_len or trg_len > max_len:
            raise ValueError(
                f"sequence length ({src_len} source / {trg_len} target) exceeds max_len={max_len}"
            )

        src = self.src_embedding(src) + self.positional_encoding[:, :src_len, :]
        trg = self.trg_embedding(trg) + self.positional_encoding[:, :trg_len, :]
        # nn.Transformer is used in its default sequence-first layout.
        src = src.permute(1, 0, 2)
        trg = trg.permute(1, 0, 2)

        # Causal mask: position t may only attend to positions <= t. Without
        # it the decoder can read the token it is supposed to predict during
        # teacher forcing, and then fails at inference time.
        causal_mask = torch.triu(
            torch.full((trg_len, trg_len), float("-inf"), dtype=trg.dtype, device=trg.device),
            diagonal=1,
        )

        out = self.transformer(src, trg, tgt_mask=causal_mask)
        out = self.fc_out(out)
        return out


In [None]:
src_vocab_size = len(src_vocab)
trg_vocab_size = len(trg_vocab)

# Fix the RNG so weight initialisation (and later batch shuffling) is
# reproducible under Restart & Run All.
torch.manual_seed(0)

model = Transformer(src_vocab_size, trg_vocab_size)
# The loss is computed over *target* tokens, so the padding index to ignore
# must come from the target vocabulary.
criterion = nn.CrossEntropyLoss(ignore_index=trg_vocab["<pad>"])
optimizer = optim.Adam(model.parameters(), lr=0.001)




In [None]:
def collate_fn(batch):
    """Pad a list of (src, trg) tensor pairs into two batch-first tensors.

    Source sequences are padded with ``src_vocab["<pad>"]`` and target
    sequences with ``trg_vocab["<pad>"]``, each up to the longest sequence
    on its side of the batch.

    Returns:
        ``(src_batch, trg_batch)``, each shaped ``[batch, max_len]``.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    sources, targets = zip(*batch)
    padded = [
        pad(sequences, batch_first=True, padding_value=vocab["<pad>"])
        for sequences, vocab in ((sources, src_vocab), (targets, trg_vocab))
    ]
    return padded[0], padded[1]


# Loader that yields padded (src, trg) batches in random order.
dataloader = DataLoader(dataset, collate_fn=collate_fn, shuffle=True, batch_size=2)


In [None]:
# Teacher-forced training: the decoder receives the target without its last
# token and is scored against the target shifted left by one.
for epoch in range(20):  # number of epochs
    model.train()
    epoch_loss = 0
    for src, trg in dataloader:  # both are [batch, seq_len]
        trg_input = trg[:, :-1]  # remove the last token for the input to the decoder
        trg_output = trg[:, 1:].reshape(-1)  # shifted targets, flattened batch-major

        optimizer.zero_grad()
        output = model(src, trg_input)  # [trg_len, batch, vocab]
        # The model is sequence-first while the targets are flattened
        # batch-first. Move batch to the front before flattening so that row
        # i of the logits lines up with target i.
        output = output.permute(1, 0, 2).reshape(-1, output.shape[-1])
        loss = criterion(output, trg_output)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {epoch_loss/len(dataloader)}')


Epoch 1, Loss: 2.4602720737457275
Epoch 2, Loss: 2.7627227306365967
Epoch 3, Loss: 3.8383877277374268
Epoch 4, Loss: 3.3567817211151123
Epoch 5, Loss: 2.1389198303222656
Epoch 6, Loss: 2.137369394302368
Epoch 7, Loss: 1.8135299682617188
Epoch 8, Loss: 1.6698728799819946
Epoch 9, Loss: 1.6930867433547974
Epoch 10, Loss: 1.6150245666503906
Epoch 11, Loss: 1.6497007608413696
Epoch 12, Loss: 1.5102475881576538
Epoch 13, Loss: 1.42676842212677
Epoch 14, Loss: 1.400692343711853
Epoch 15, Loss: 1.3583372831344604
Epoch 16, Loss: 1.3886338472366333
Epoch 17, Loss: 1.3676843643188477
Epoch 18, Loss: 1.2407984733581543
Epoch 19, Loss: 1.2871986627578735
Epoch 20, Loss: 1.2250608205795288


In [None]:
# Greedy decoding: start from <sos> and repeatedly append the most likely
# next token until <eos> is produced or the length limit is reached.
model.eval()
# Reverse lookup built once. It avoids rebuilding list(trg_vocab.keys()) for
# every output token and does not depend on dict order matching the ids.
index_to_token = {index: token for token, index in trg_vocab.items()}
with torch.no_grad():
    src_sentence = "I am a student"
    src = torch.tensor(tokenize(src_sentence, src_vocab), dtype=torch.long).unsqueeze(0)  # shape: [1, src_len]
    trg = torch.tensor([trg_vocab["<sos>"]], dtype=torch.long).unsqueeze(0)  # shape: [1, 1]

    for _ in range(100):  # maximum output sentence length
        output = model(src, trg)  # output shape: [trg_len, batch_size, vocab_size]
        next_word = output[-1, 0, :].argmax(0).item()  # prediction at the last position

        trg = torch.cat((trg, torch.tensor([[next_word]], dtype=torch.long)), dim=1)
        if next_word == trg_vocab["<eos>"]:
            break

    translated_sentence = " ".join(index_to_token[idx] for idx in trg.squeeze(0).tolist())
    print(f'Translated Sentence: {translated_sentence}')


Translated Sentence: <sos> étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant étudiant
