<a href="https://colab.research.google.com/github/Ibrahim-Khalil-Github/Deep-Learning/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4606977%2F7854872%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240417%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240417T081315Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da327e722105a68daa864f9d179c9506da11cb2457d0e8ae9e9dc700973101d3bee85da449ed99883a999e62eea0fa97169ae2f4afa923d3acd250de8dff9dd99ea866edd70ba8d044c28c7c193cb4350aa15fac934d1a299f3402baa529daf08319360f6ae3c9d9165eafd17143a0920bd2b88cffcec03c599579ee89112f82ff97ac354f7dc46a455695501f70f43913385e95717f81ca0fd9f5b2ce41384f64ee9ca34e366afe63e10e9e1550e710e17128e853b7165815342e126adc462b8b5bd35c1b69111832fbda415e4a8090db09ccc3168ae1c48ef1dd74e27e0d8d8b36ad980b3d010301d38ffd00507b1ed6caa6e50b60e79255dce047b32ce02ba'

# Directories that mimic Kaggle's filesystem layout inside this notebook environment.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

# IPython shell escape (not plain Python): unmount any previous input mount, discarding errors.
!umount /kaggle/input/ 2> /dev/null
# Start from a clean input directory, then (re)create both directories.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working; an already-existing link is left as is.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING into
# KAGGLE_INPUT_PATH/<directory> and unpack it (zip or tar). Failures are reported and
# the loop moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' later in the entry cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent; in that case the progress bar simply stays empty.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # tarfile reopens the file by name, so buffered bytes must reach disk first.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name: binding the archive to `tarfile` would shadow the module.
                # NOTE(review): extractall trusts member paths inside the archive.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
from collections import Counter
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.utils.data
import math
import torch.nn.functional as F

# Input TSV files and the maximum number of words kept per utterance.
corpus_movie_conv = '/kaggle/input/dataset/movie_conversations.tsv'
corpus_movie_lines = '/kaggle/input/dataset/movie_lines.tsv'
max_len = 25

# Read both files fully into memory, one list entry per text line.
with open(corpus_movie_conv, 'r') as conv_file:
    conv = list(conv_file)
with open(corpus_movie_lines, 'r', encoding = 'utf8') as lines_file:
    lines = list(lines_file)

# Map the first tab-separated field of each line to its last field,
# after stripping every double-quote character from the line.
split_lines = (raw_line.replace('"', '').split('\t') for raw_line in lines)
lines_dict = {fields[0]: fields[-1] for fields in split_lines}

def remove_punc(string):
    """Return *string* lower-cased with all listed punctuation characters removed.

    Args:
        string: Text to clean.

    Returns:
        A new lower-case string without any of the characters in
        ``!()-{}[];:'"\\,<>./?@#$%^&*_~``.
    """
    # Raw string so the backslash is a literal character rather than an
    # (invalid) escape sequence.
    punctuations = r'''!()-{}[];:'"\,<>./?@#$%^&*_~'''
    # One translate pass instead of character-by-character concatenation.
    return string.translate(str.maketrans('', '', punctuations)).lower()

import ast  # cell-local import: literal_eval parses the id list without executing code

# Build [question_words, reply_words] pairs from every two consecutive lines
# of each conversation; each side is truncated to max_len words.
pairs = []

for con in conv:
    # The last tab-separated field is turned into a Python list literal by replacing
    # spaces with commas (assumes it looks like "['L1' 'L2' ...]" — TODO confirm).
    # literal_eval only accepts literals, unlike eval which would run arbitrary code.
    ids = ast.literal_eval(con.replace(' ', ',').split('\t')[-1])

    # Pair each line id with the one that follows it.
    for first_id, second_id in zip(ids, ids[1:]):
        first = remove_punc(lines_dict[first_id].strip())
        second = remove_punc(lines_dict[second_id].strip())

        pairs.append([first.split()[:max_len], second.split()[:max_len]])

# Count how often each word occurs across all questions and replies.
word_freq = Counter()
for question_words, reply_words in pairs:
    word_freq.update(question_words)
    word_freq.update(reply_words)

min_word_freq = 5

# Keep only words seen more than min_word_freq times; ids start at 1 because 0 is padding.
words = [word for word, count in word_freq.items() if count > min_word_freq]
word_map = {word: index for index, word in enumerate(words, start=1)}

# Special tokens take the next free ids, in this order.
for special_token in ('<unk>', '<start>', '<end>'):
    word_map[special_token] = len(word_map) + 1
word_map['<pad>'] = 0

with open('/kaggle/working/WORDMAP_corpus.json', 'w') as j:
    json.dump(word_map, j)

def encode_question(words, word_map):
    """Convert question words to ids and right-pad with '<pad>' up to max_len.

    Words missing from word_map are encoded as '<unk>'.
    """
    encoded = []
    for token in words:
        encoded.append(word_map.get(token, word_map['<unk>']))
    encoded.extend([word_map['<pad>']] * (max_len - len(words)))
    return encoded

def encode_reply(words, word_map):
    """Convert reply words to ids wrapped in '<start>' / '<end>', then right-pad.

    Words missing from word_map are encoded as '<unk>'. The padding count is
    max_len - len(words), so the result holds two more ids than max_len
    whenever len(words) <= max_len.
    """
    encoded = [word_map['<start>']]
    for token in words:
        encoded.append(word_map.get(token, word_map['<unk>']))
    encoded.append(word_map['<end>'])
    encoded.extend([word_map['<pad>']] * (max_len - len(words)))
    return encoded

# Encode every (question, reply) pair to id lists and persist them as JSON.
pairs_encoded = [
    [encode_question(question_words, word_map), encode_reply(reply_words, word_map)]
    for question_words, reply_words in pairs
]

with open('/kaggle/working/pairs_encoded.json', 'w') as w:
    json.dump(pairs_encoded, w)

In [None]:
class Dataset(Dataset):
    """Dataset of encoded (question, reply) id sequences loaded from a JSON file.

    Args:
        pairs_path: JSON file holding a list of ``[question_ids, reply_ids]``
            entries. Defaults to the location used by this notebook.
    """

    def __init__(self, pairs_path='/kaggle/input/transformer/pairs_encoded.json'):
        # Context manager so the file handle is closed once the JSON is parsed.
        with open(pairs_path, 'r') as pairs_file:
            self.pairs = json.load(pairs_file)
        self.dataset_size = len(self.pairs)

    def __getitem__(self, i):
        """Return the i-th pair as two LongTensors: (question, reply)."""
        question = torch.LongTensor(self.pairs[i][0])
        reply = torch.LongTensor(self.pairs[i][1])

        return question, reply

    def __len__(self):
        """Return the number of pairs loaded."""
        return self.dataset_size

In [None]:
# Shuffled mini-batches of 100 pairs, with pinned host memory enabled.
train_loader = torch.utils.data.DataLoader(
    dataset = Dataset(),
    batch_size = 100,
    shuffle = True,
    pin_memory = True,
)

In [None]:
def create_masks(question, reply_input, reply_target):
    """Build the boolean masks used for attention and for the loss.

    Id 0 is treated as padding. Inputs are assumed to be (batch, seq) id
    tensors — TODO confirm against callers.

    Returns:
        question_mask: ``question != 0`` with two singleton dims inserted at dim 1.
        reply_input_mask: padding mask of reply_input AND-ed with a
            lower-triangular (no look-ahead) mask, with one singleton dim at dim 1.
        reply_target_mask: ``reply_target != 0``.
    """
    seq_len = reply_input.shape[-1]

    # Lower-triangular matrix: position i may only see positions <= i.
    causal = torch.tril(torch.ones(seq_len, seq_len)).type(dtype = torch.uint8)

    reply_padding = (reply_input != 0).unsqueeze(1)
    # type_as brings the triangular mask to the padding mask's tensor type.
    causal = causal.type_as(reply_padding.data)
    reply_input_mask = (reply_padding & causal).unsqueeze(1)

    question_mask = (question != 0).unsqueeze(1).unsqueeze(1)
    reply_target_mask = reply_target != 0

    return question_mask, reply_input_mask, reply_target_mask

In [None]:
class Embeddings(nn.Module):
    """Token embedding scaled by sqrt(d_model) plus a fixed positional encoding.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding width; the encoding loop writes columns i and i+1
            for even i, so it must be even.
        max_len: Number of positions the positional table covers.
    """

    def __init__(self, vocab_size, d_model, max_len = 50):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(0.1)
        self.embed = nn.Embedding(vocab_size, d_model)
        # Registered as a non-persistent buffer: it follows the module across
        # .to()/.cuda() calls but adds no key to state_dict, so existing saved
        # state dicts still load.
        self.register_buffer('pe', self.create_positional_encoding(max_len, self.d_model), persistent = False)

    def create_positional_encoding(self, max_len, d_model):
        """Return a (max_len, d_model) table of sine/cosine position features.

        The table is created on the CPU; device placement is handled by the
        module because the table is stored as a buffer.
        """
        pe = torch.zeros(max_len, d_model)

        # NOTE(review): base 1000 and the 2*i exponent differ from the
        # 10000^(2i/d_model) form in "Attention Is All You Need"; kept as is so
        # that previously trained weights see the same encoding.
        for pos in range(max_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos/(1000**(2*i/d_model)))
                pe[pos, i+1] = math.cos(pos/(1000**(2*(i+1)/d_model)))

        return pe

    def forward(self, encoded_words):
        """Embed id tensor *encoded_words* and add the encoding of each position.

        Dim 1 of the input is used as the sequence axis when slicing the table.
        """
        embedding = self.embed(encoded_words) * math.sqrt(self.d_model)
        embedding = embedding + self.pe[:embedding.size(1), :]
        embedding = self.dropout(embedding)

        return embedding

In [None]:
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed over several heads in parallel.

    d_model is split evenly across the heads (d_k = d_model // heads).
    """

    def __init__(self, heads, d_model):
        super().__init__()
        assert d_model % heads ==0

        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = nn.Dropout(0.1)

        # Input projections followed by the output projection.
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.concat = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, d_model) to (batch, heads, seq, d_k)."""
        return projected.view(batch_size, -1, self.heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask):
        """Attend from *query* over *key*/*value*.

        Score positions where ``mask == 0`` are filled with -1e9 before the
        softmax. The result has d_model features on the last axis.
        """
        batch_size = query.shape[0]

        q = self._split_heads(self.query(query), batch_size)
        k = self._split_heads(self.key(key), batch_size)
        v = self._split_heads(self.value(value), batch_size)

        # Dot products scaled by sqrt(d_k).
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        scores = scores.masked_fill(mask == 0, -1e9)
        weights = self.dropout(F.softmax(scores, dim = -1))

        # Merge the heads back into a single d_model-wide feature axis.
        context = torch.matmul(weights, v).transpose(1, 2)
        context = context.reshape(batch_size, -1, self.heads * self.d_k)

        return self.concat(context)

In [None]:
class FeedForward(nn.Module):
    """Two linear layers (d_model -> middle_dim -> d_model) with ReLU and dropout between."""

    def __init__(self, d_model, middle_dim = 2048):
        super().__init__()

        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Apply fc1, ReLU, dropout and fc2 to *x*, in that order."""
        hidden = F.relu(self.fc1(x))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each with a residual add.

    A single LayerNorm instance is shared by both sub-layers.
    """

    def __init__(self, d_model, heads):
        super().__init__()

        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)

        self.layernorm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, embeddings, mask):
        """Return *embeddings* after the attention and feed-forward sub-layers."""
        # Self-attention sub-layer: dropout, residual add, norm.
        attended = self.self_multihead(embeddings, embeddings, embeddings, mask)
        normed = self.layernorm(self.dropout(attended) + embeddings)

        # Feed-forward sub-layer with the same dropout / residual / norm pattern.
        transformed = self.dropout(self.feed_forward(normed))
        return self.layernorm(normed + transformed)

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, attention over the encoder output, feed-forward.

    Every sub-layer is followed by dropout, a residual add and the same
    shared LayerNorm instance.
    """

    def __init__(self, d_model, heads):
        super().__init__()

        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.src_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, embeddings, encoded, src_mask, target_mask):
        """Decode *embeddings* while attending to *encoded*.

        target_mask is applied in the self-attention, src_mask in the
        attention over *encoded*.
        """
        # Self-attention over the target sequence.
        self_attended = self.dropout(
            self.self_multihead(embeddings, embeddings, embeddings, target_mask)
        )
        query = self.layernorm(self_attended + embeddings)

        # Attention from the target positions over the encoder output.
        cross_attended = self.dropout(self.src_multihead(query, encoded, encoded, src_mask))
        interacted = self.layernorm(query + cross_attended)

        # Position-wise feed-forward.
        transformed = self.dropout(self.feed_forward(interacted))
        return self.layernorm(transformed + interacted)

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder model with one embedding module shared by source and target.

    Args:
        d_model: Feature width used throughout the model.
        heads: Number of attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        word_map: Vocabulary mapping; only its length is used here.
    """

    def __init__(self, d_model, heads, num_layers, word_map):
        super().__init__()

        self.d_model = d_model
        self.vocab_size = len(word_map)

        self.embed = Embeddings(self.vocab_size, d_model)

        self.encoder = nn.ModuleList(EncoderLayer(d_model, heads) for _ in range(num_layers))
        self.decoder = nn.ModuleList(DecoderLayer(d_model, heads) for _ in range(num_layers))

        # Projects decoder features to one score per vocabulary entry.
        self.logit = nn.Linear(d_model, self.vocab_size)

    def encode(self, src_words, src_mask):
        """Embed *src_words* and run them through every encoder layer."""
        hidden = self.embed(src_words)
        for encoder_layer in self.encoder:
            hidden = encoder_layer(hidden, src_mask)
        return hidden

    def decode(self, target_words, target_mask, src_embeddings, src_mask):
        """Embed *target_words* and run them through every decoder layer."""
        hidden = self.embed(target_words)
        for decoder_layer in self.decoder:
            hidden = decoder_layer(hidden, src_embeddings, src_mask, target_mask)
        return hidden

    def forward(self, src_words, src_mask, target_words, target_mask):
        """Return log-probabilities over the vocabulary (log_softmax along dim 2)."""
        memory = self.encode(src_words, src_mask)
        decoded = self.decode(target_words, target_mask, memory, src_mask)
        return F.log_softmax(self.logit(decoded), dim = 2)

In [None]:
class AdamWarmup:
    """Wrap an optimizer and set its learning rate from a warm-up schedule on every step."""

    def __init__(self, model_size, warmup_steps, optimizer):
        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer

        # No step taken yet; lr mirrors the most recently applied rate.
        self.current_step = 0
        self.lr = 0

    def get_lr(self):
        """Return the learning rate for current_step.

        The rate is model_size^-0.5 times the smaller of step^-0.5 and
        step * warmup_steps^-1.5. Must not be called while current_step is 0.
        """
        step = self.current_step
        decay = step ** (-0.5)
        warmup = step * self.warmup_steps ** (-1.5)
        return self.model_size ** (-0.5) * min(decay, warmup)

    def step(self):
        """Advance the step counter, apply the new rate to all param groups, step the optimizer."""
        self.current_step += 1
        new_lr = self.get_lr()

        for group in self.optimizer.param_groups:
            group['lr'] = new_lr

        self.lr = new_lr
        self.optimizer.step()

In [None]:
class LossWithLS(nn.Module):
    """KL-divergence loss against label-smoothed targets, averaged over unmasked tokens.

    Args:
        size: Number of classes (vocabulary size); must be greater than 1.
        smooth: Probability mass spread evenly over the size - 1 wrong classes.
    """

    def __init__(self, size, smooth):
        super(LossWithLS, self).__init__()

        # reduction='none' is the current spelling of the deprecated
        # size_average=False, reduce=False pair: keep the element-wise losses.
        self.criterion = nn.KLDivLoss(reduction = 'none')
        self.size = size
        self.smooth = smooth
        self.confidence = 1.0 - smooth

    def forward(self, prediction, target, mask):
        """Return the mean smoothed loss over positions where *mask* is set.

        Args:
            prediction: Log-probabilities with the class axis last.
            target: Class ids, one per prediction row once flattened.
            mask: Same number of elements as *target*; non-zero marks tokens to count.

        Returns:
            A scalar tensor; 0 when no position is selected by *mask*.
        """
        prediction = prediction.reshape(-1, prediction.size(-1))
        target = target.reshape(-1)
        mask = mask.float().reshape(-1)

        # Smoothed target distribution: `confidence` on the true class and
        # smooth / (size - 1) elsewhere. full_like yields a tensor outside the
        # autograd graph, which replaces the former `.data.clone()`.
        labels = torch.full_like(prediction, self.smooth / (self.size - 1))
        labels.scatter_(1, target.unsqueeze(1), self.confidence)

        loss = self.criterion(prediction, labels)

        # Mask entries are 0/1, so the clamp only matters for an all-padding
        # batch, where it avoids a 0/0 = NaN.
        loss = (loss.sum(1) * mask).sum() / mask.sum().clamp(min = 1.0)

        return loss

In [None]:
# Model and schedule hyper-parameters.
d_model, heads, num_layers = 512, 8, 6
epochs = 10
warmup_steps = 4000
model_size = d_model  # the warm-up schedule scales by d_model ** -0.5

# Vocabulary mapping; its length is used as the number of classes.
with open('/kaggle/input/transformer/WORDMAP_corpus.json', 'r') as j:
    word_map = json.load(j)

size = len(word_map)
smooth = 0.1

# Model and loss are moved to the GPU.
transformer = Transformer(d_model, heads, num_layers, word_map).cuda()
criterion = LossWithLS(size, smooth).cuda()

# Adam starts at lr 0; AdamWarmup overwrites the rate on every step.
adam_optimizer = torch.optim.Adam(
    transformer.parameters(),
    lr = 0,
    betas = (0.9, 0.98),
    eps = 1e-9,
)
transformer_optimizer = AdamWarmup(model_size, warmup_steps, adam_optimizer)

In [None]:
def train(train_loader, transformer, criterion, epoch):
    """Run one pass over *train_loader*, updating *transformer* after every batch.

    Uses the module-level ``transformer_optimizer`` for the updates and prints
    the running sample-weighted mean loss every 100 iterations. Batches are
    moved to the GPU.
    """
    transformer.train()

    running_loss = 0
    seen = 0

    for step, batch in enumerate(train_loader):
        question, reply = batch
        batch_size = question.shape[0]

        question = question.cuda()
        reply = reply.cuda()

        # Teacher forcing: the decoder input drops the last token and the
        # target drops the first, so position t is trained to predict t + 1.
        reply_input, reply_target = reply[:, :-1], reply[:, 1:]

        masks = create_masks(question, reply_input, reply_target)
        question_mask, reply_input_mask, reply_target_mask = masks

        out = transformer(question, question_mask, reply_input, reply_input_mask)
        loss = criterion(out, reply_target, reply_target_mask)

        transformer_optimizer.optimizer.zero_grad()
        loss.backward()
        transformer_optimizer.step()

        running_loss += loss.item() * batch_size
        seen += batch_size

        if step % 100 != 0:
            continue
        print('Epoch: {}, Iterations: {}/{}, Loss: {:.3f}'.format(epoch, step, len(train_loader), running_loss/seen))


In [None]:
def evaluate(transformer, question, question_mask, max_len, word_map):
    """Greedily decode a reply to *question* and return it as a space-joined string.

    Args:
        transformer: Model exposing ``eval``, ``encode``, ``decode`` and ``logit``.
        question: Id tensor holding a single question (batch of one).
        question_mask: Mask passed to ``encode`` and ``decode`` for the question.
        max_len: At most ``max_len - 1`` tokens are generated.
        word_map: Mapping from word to id; must contain '<start>' and '<end>'.

    Returns:
        The generated words joined by spaces, without '<start>' and '<end>'.
    """
    rev_word_map = {v: k for k, v in word_map.items()}

    transformer.eval()

    start_token = word_map['<start>']
    end_token = word_map['<end>']
    # Create new tensors next to the question instead of assuming CUDA.
    device = question.device

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        encoded = transformer.encode(question, question_mask)

        words = torch.tensor([[start_token]], dtype = torch.long, device = device)

        for _ in range(max_len - 1):
            # words is (1, generated_length): the mask must be sized by the
            # sequence axis (dim 1). Dim 0 is the batch and is always 1, which
            # previously produced a 1x1 mask that never hid future positions.
            size = words.shape[1]

            target_mask = torch.tril(torch.ones(size, size, device = device)).type(dtype = torch.uint8)
            target_mask = target_mask.unsqueeze(0).unsqueeze(1)

            decoded = transformer.decode(words, target_mask, encoded, question_mask)

            # Only the scores of the last position decide the next word.
            prediction = transformer.logit(decoded[:, -1])
            _, next_word = torch.max(prediction, dim = -1)
            next_word = next_word.item()

            if next_word == end_token:
                break
            next_token = torch.tensor([[next_word]], dtype = torch.long, device = device)
            words = torch.cat([words, next_token], dim = -1)

    generated = words.squeeze(0).tolist()

    # Drop the '<start>' marker before mapping ids back to words.
    return " ".join(rev_word_map[w] for w in generated if w != start_token)

In [None]:
# Resume from a saved checkpoint: it is indexed by 'epoch', 'transformer'
# and 'transformer_optimizer'.
# NOTE(review): recent PyTorch releases may need weights_only=False to load
# whole pickled objects — confirm against the installed version.
checkpoint = torch.load('/kaggle/input/transformer/checkpoint9.pth')
epoch = checkpoint['epoch']
transformer = checkpoint['transformer']
transformer_optimizer = checkpoint['transformer_optimizer']

In [None]:
# Train for `epochs` more epochs after the current one, saving a checkpoint
# with the epoch number, model and optimizer wrapper after each.
for epoch in range(epoch + 1, epoch + epochs + 1):
    train(train_loader, transformer, criterion, epoch)
    state = dict(epoch=epoch, transformer=transformer, transformer_optimizer=transformer_optimizer)
    torch.save(state, f'checkpoint{epoch}.pth')

In [None]:
    # Encode one question with the vocabulary, decode a reply greedily and print it.
    max_len = 21
    question = 'nothing'
    enc_ques = []
    for word in question.split():
        enc_ques.append(word_map.get(word, word_map['<unk>']))
    # Add a batch dimension of one; id 0 is treated as padding in the mask.
    question = torch.tensor(enc_ques).cuda().unsqueeze(0)
    question_mask = (question != 0).cuda().unsqueeze(1).unsqueeze(1)
    sentence = evaluate(transformer, question, question_mask, int(max_len), word_map)
    print(sentence)