In [None]:
pip install wandb

In [None]:
from io import open
import unicodedata
import string
import re
import random
import time
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import wandb
wandb.login()

In [None]:
# Reserved vocabulary indices used for every language.
SOS_token = 0  # start-of-sequence marker
EOS_token = 1  # end-of-sequence marker

class Lang:
    """Character-level vocabulary of one language.

    Keeps a two-way mapping between letters and integer indices together with
    an occurrence count per letter. Indices 0 and 1 are reserved for the
    "SOS" and "EOS" markers, so the first registered letter gets index 2.
    """

    def __init__(self, name):
        self.name = name
        self.letter2index = {}
        self.letter2count = {}
        self.index2letter = {0: "SOS", 1: "EOS"}
        self.n_letters = 2  # Count SOS and EOS

    def addletter(self, letter):
        """Register one letter: assign a fresh index on first sight, else bump its count."""
        if letter in self.letter2index:
            self.letter2count[letter] += 1
            return
        self.letter2index[letter] = self.n_letters
        self.letter2count[letter] = 1
        self.index2letter[self.n_letters] = letter
        self.n_letters += 1

    def addword(self, letter):
        """Register every character of a word.

        The parameter keeps its original name ``letter`` so keyword callers
        keep working, but it holds the whole word; each of its characters is
        passed to :meth:`addletter`.
        """
        for char in letter:
            self.addletter(char)

    def decode(self, target):
        """Turn a sequence of indices back into a space-separated string of letters.

        ``target`` may contain plain ints or single-element tensors; each item
        is converted with ``int()`` before the lookup. Raises ``KeyError`` for
        an index that was never registered.
        """
        return ' '.join(self.index2letter[int(i)] for i in target)


In [None]:
def _read_pairs(path):
    """Read a UTF-8 comma-separated file and return one list of fields per non-blank line."""
    # The context manager closes the file handle even if reading fails.
    with open(path, encoding='utf-8') as handle:
        lines = handle.read().strip().split('\n')
    # Blank lines would otherwise yield a one-element "pair" and break indexing later.
    return [line.split(',') for line in lines if line.strip()]


def readLang(lang1, lang2, reverse=False, data_dir='/content/drive/MyDrive'):
    """Load the train/validation/test word pairs and build both vocabularies.

    Args:
        lang1: name given to the input-side ``Lang`` (first CSV column).
        lang2: name given to the output-side ``Lang`` (second CSV column).
        reverse: accepted for backward compatibility; it has no effect.
        data_dir: directory containing ``hin_train.csv``, ``hin_valid.csv``
            and ``hin_test.csv``. Defaults to the original hard-coded location.

    Returns:
        ``(train_pairs, val_pairs, test_pairs, input_lang, output_lang)``.
        Each ``*_pairs`` is a list of per-line field lists. The vocabularies
        are built from all three splits, so every letter in any split has an
        index.
    """
    train_pairs = _read_pairs('%s/hin_train.csv' % data_dir)
    val_pairs = _read_pairs('%s/hin_valid.csv' % data_dir)
    test_pairs = _read_pairs('%s/hin_test.csv' % data_dir)

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    # Same registration order as before (train, then validation, then test),
    # so letter indices are unchanged.
    for pairs in (train_pairs, val_pairs, test_pairs):
        for pair in pairs:
            input_lang.addword(pair[0])
            output_lang.addword(pair[1])

    return train_pairs, val_pairs, test_pairs, input_lang, output_lang

In [None]:
def indexesFromword(lang, word):
    """Return the list of ``lang.letter2index`` entries for the characters of ``word``."""
    indexes = []
    for character in word:
        indexes.append(lang.letter2index[character])
    return indexes


def tensorFromword(lang, word):
    """Encode ``word`` as a long column tensor of letter indices followed by EOS_token."""
    token_ids = indexesFromword(lang, word)
    token_ids.append(EOS_token)
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    # One index per row: shape (len(word) + 1, 1).
    return flat.view(-1, 1)


def tensorsFromPair(pair, input_lang, output_lang):
    """Encode a (source word, target word) pair as an (input tensor, target tensor) tuple."""
    return (
        tensorFromword(input_lang, pair[0]),
        tensorFromword(output_lang, pair[1]),
    )

In [None]:
class Encoder(nn.Module):
    """Character encoder: embedding -> dropout -> one recurrent layer stack.

    Args:
        type: recurrent cell to use, one of ``'RNN'``, ``'GRU'``, ``'LSTM'``.
        input_size: number of entries in the embedding table.
        emb_size: embedding dimension.
        hidden_size: hidden-state dimension of the recurrent layer.
        p: dropout probability (applied to the embedding and, when
            ``num_layers > 1``, between recurrent layers).
        num_layers: number of stacked recurrent layers.

    Raises:
        ValueError: if ``type`` is not a supported cell name.
    """

    _CELL_TYPES = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}

    def __init__(self, type, input_size, emb_size, hidden_size, p, num_layers):
        super(Encoder, self).__init__()
        if type not in self._CELL_TYPES:
            # Previously an unknown type silently skipped the recurrent layer.
            raise ValueError("type must be one of %s, got %r" % (sorted(self._CELL_TYPES), type))
        self.type_t = type
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, emb_size)
        # Recurrent dropout only acts between stacked layers; passing a
        # non-zero value with a single layer merely triggers a warning.
        recurrent_dropout = p if num_layers > 1 else 0
        # Build only the selected cell instead of all three variants.
        self.recurrent = self._CELL_TYPES[type](emb_size, hidden_size, num_layers, dropout=recurrent_dropout)

    def forward(self, input, hidden):
        """Encode one input index; returns ``(output, hidden)`` from the recurrent layer."""
        # Reshape to (seq_len=1, batch=1, emb_size) as the recurrent layer expects.
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))
        output, hidden = self.recurrent(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial state: a tensor, or an ``(h, c)`` tuple for LSTM."""
        # Allocate on the module's own device so the state always matches the weights.
        module_device = self.embedding.weight.device
        shape = (self.num_layers, 1, self.hidden_size)
        if self.type_t == 'LSTM':
            return (torch.zeros(*shape, device=module_device), torch.zeros(*shape, device=module_device))
        return torch.zeros(*shape, device=module_device)

In [None]:
class AttnDecoder(nn.Module):
    """Attention decoder that emits one output letter per call.

    Args:
        type: recurrent cell to use, one of ``'RNN'``, ``'GRU'``, ``'LSTM'``.
        output_size: size of the output vocabulary (embedding rows and logits).
        hidden_size: embedding and hidden-state dimension.
        p: dropout probability (applied to the embedding and, when
            ``num_layers > 1``, between recurrent layers).
        num_layers: number of stacked recurrent layers.
        max_length: number of encoder positions the attention spans; must equal
            the first dimension of ``encoder_outputs`` passed to ``forward``.
            Defaults to 50, the previously hard-coded value.

    Raises:
        ValueError: if ``type`` is not a supported cell name.
    """

    _CELL_TYPES = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}

    def __init__(self, type, output_size, hidden_size, p, num_layers, max_length=50):
        super(AttnDecoder, self).__init__()
        if type not in self._CELL_TYPES:
            # Previously an unknown type silently skipped the recurrent layer.
            raise ValueError("type must be one of %s, got %r" % (sorted(self._CELL_TYPES), type))
        self.type_t = type
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.max_length = max_length
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(hidden_size * 2, max_length)
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        # Recurrent dropout only acts between stacked layers; passing a
        # non-zero value with a single layer merely triggers a warning.
        recurrent_dropout = p if num_layers > 1 else 0
        # Build only the selected cell instead of all three variants.
        self.recurrent = self._CELL_TYPES[type](hidden_size, hidden_size, num_layers, dropout=recurrent_dropout)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns ``(log_probs, hidden, attn_weights)`` where ``log_probs`` has
        shape ``(1, output_size)`` and ``attn_weights`` has shape
        ``(1, max_length)``.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # An LSTM state is an (h, c) tuple; attention is conditioned on h only.
        # Indexing the tuple directly (as before) produced a 3-D tensor and
        # made torch.cat fail for every LSTM model.
        state = hidden[0] if isinstance(hidden, tuple) else hidden
        # state[0] is the first layer's hidden state, shape (1, hidden_size).
        attn_scores = self.attn(torch.cat((embedded[0], state[0]), 1))
        attn_weights = F.softmax(attn_scores, dim=1)

        # Weighted sum of encoder outputs: (1, 1, L) x (1, L, H) -> (1, 1, H).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        combined = torch.cat((embedded[0], attn_applied[0]), 1)
        output = F.relu(self.attn_combine(combined).unsqueeze(0))

        output, hidden = self.recurrent(output, hidden)

        # Log-probabilities over the output vocabulary (pairs with NLLLoss).
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial state: a tensor, or an ``(h, c)`` tuple for LSTM."""
        # Allocate on the module's own device so the state always matches the weights.
        module_device = self.embedding.weight.device
        shape = (self.num_layers, 1, self.hidden_size)
        if self.type_t == 'LSTM':
            return (torch.zeros(*shape, device=module_device), torch.zeros(*shape, device=module_device))
        return torch.zeros(*shape, device=module_device)

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for work started at ``since``.

    ``percent`` is the completed fraction; the remaining time is extrapolated
    linearly from the elapsed time.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
class Train():
    """Trains and evaluates an encoder / attention-decoder pair on word pairs.

    Args:
        train_data: stored on the instance but not used; the pairs actually
            trained on are re-read through ``readLang('eng', 'hin')``.
        encoder: module exposing ``initHidden()``, ``hidden_size`` and a
            ``(input, hidden) -> (output, hidden)`` forward.
        decoder: module with a ``(input, hidden, encoder_outputs) ->
            (log_probs, hidden, attention)`` forward.
        criterion: loss applied to each decoder step.
        tfr: teacher-forcing ratio, the probability that a training word is
            decoded with ground-truth inputs.
        max_length: number of encoder positions kept for attention and the
            maximum number of decoding steps during evaluation. Must match the
            decoder's attention width. Defaults to 50, the previously
            hard-coded value.
    """

    def __init__(self, train_data, encoder, decoder, criterion, tfr = 0.5, max_length = 50):
        self.train_data = train_data
        self.encoder = encoder
        self.decoder = decoder
        self.criterion = criterion
        self.tfr = tfr
        self.max_length = max_length
        self.train_pairs, self.val_pairs, self.test_pairs, self.input_lang, self.output_lang = readLang('eng', 'hin')
        self.training_pairs = [tensorsFromPair(pair, self.input_lang, self.output_lang) for pair in self.train_pairs]

    def _encode(self, input_tensor):
        """Run the encoder over one word; return ``(encoder_outputs, final_hidden)``."""
        encoder_hidden = self.encoder.initHidden()
        # Fixed-size buffer: rows beyond the word length stay zero.
        encoder_outputs = torch.zeros(self.max_length, self.encoder.hidden_size, device=device)
        for i in range(input_tensor.size(0)):
            encoder_output, encoder_hidden = self.encoder(input_tensor[i], encoder_hidden)
            encoder_outputs[i] += encoder_output[0, 0]
        return encoder_outputs, encoder_hidden

    def _enter_eval_mode(self):
        """Switch both modules to eval mode; return their previous training flags."""
        previous = (self.encoder.training, self.decoder.training)
        self.encoder.eval()
        self.decoder.eval()
        return previous

    def _restore_modes(self, previous):
        """Restore the training flags captured by ``_enter_eval_mode``."""
        self.encoder.train(previous[0])
        self.decoder.train(previous[1])

    def _build_optimizers(self, optimizer, learning_rate):
        """Create the (encoder, decoder) optimizers for the given optimizer name."""
        supported = ('SGD', 'Adam', 'RMSprop', 'NAdam')
        if optimizer not in supported:
            # Previously this surfaced later as an UnboundLocalError.
            raise ValueError("optimizer must be one of %s, got %r" % (supported, optimizer))
        optimizer_cls = getattr(optim, optimizer)
        return (optimizer_cls(self.encoder.parameters(), lr = learning_rate),
                optimizer_cls(self.decoder.parameters(), lr = learning_rate))

    def _run_epoch(self, encoder_optimizer, decoder_optimizer, n_iters, print_every):
        """Train on the first ``n_iters`` pairs once; return the summed per-word loss."""
        # Timer and print window are local, so nothing leaks across epochs.
        start = time.time()
        epoch_loss = 0
        window_loss = 0
        for step in tqdm(range(1, n_iters+1)):
            input_tensor, target_tensor = self.training_pairs[step - 1]
            loss = self.train(input_tensor, target_tensor, encoder_optimizer, decoder_optimizer)
            epoch_loss += loss
            window_loss += loss

            if step % print_every == 0:
                print('%s (%d %d%%) %.4f' % (timeSince(start, step/n_iters), step, step/n_iters*100, window_loss / print_every))
                window_loss = 0
        return epoch_loss

    def train(self, input_tensor, target_tensor, encoder_optimizer, decoder_optimizer):
        """Do one optimisation step on a single word pair.

        Returns the summed step loss divided by the target length.
        """
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, decoder_hidden = self._encode(input_tensor) # decoder starts from the encoder's final state
        decoder_input = torch.tensor([[SOS_token]], device=device)
        target_length = target_tensor.size(0)
        use_teacher_forcing = random.random() < self.tfr

        loss = 0
        for i in range(target_length):
            decoder_output, decoder_hidden, _ = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            loss += self.criterion(decoder_output, target_tensor[i])
            if use_teacher_forcing:
                decoder_input = target_tensor[i] # feed the ground-truth letter
            else:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach() # feed the model's own prediction
                if decoder_input.item() == EOS_token: # stop once EOS is predicted
                    break

        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        return loss.item() / target_length

    def trainIters(self, optimizer, learning_rate, n_iters = 69, print_every = 69, epochs=-1):
        """Train the model and report accuracies.

        With ``epochs == -1`` a single pass over the first ``n_iters`` pairs is
        made and ``(train_acc, valid_acc)`` is returned. Otherwise ``n_iters``
        is replaced by the size of the training set, ``epochs`` full passes are
        made, per-epoch metrics are printed and logged to wandb, and
        ``(train_losses, train_accs, valid_accs)`` lists are returned.

        Raises:
            ValueError: if ``optimizer`` is not 'SGD', 'Adam', 'RMSprop' or 'NAdam'.
        """
        encoder_optimizer, decoder_optimizer = self._build_optimizers(optimizer, learning_rate)

        if epochs == -1:
            self._run_epoch(encoder_optimizer, decoder_optimizer, n_iters, print_every)
            train_acc = self.evaluateData(self.train_pairs)
            valid_acc = self.evaluateData(self.val_pairs)
            return train_acc, valid_acc

        n_iters = len(self.train_pairs)
        train_losss = []
        valid_accs = []
        train_accs = []
        for _ in range(epochs):
            train_loss_total = self._run_epoch(encoder_optimizer, decoder_optimizer, n_iters, print_every)
            train_acc = self.evaluateData(self.train_pairs)
            valid_acc = self.evaluateData(self.val_pairs)
            mean_loss = train_loss_total / n_iters if n_iters else 0.0
            train_losss.append(mean_loss)
            valid_accs.append(valid_acc)
            train_accs.append(train_acc)
            metrics = {'train_loss': mean_loss, 'train_acc': train_acc, 'valid_acc': valid_acc}
            print(metrics)
            wandb.log(metrics)
        return train_losss, train_accs, valid_accs

    def evaluate(self, word):
        """Greedily decode ``word``; return ``(decoded_word, attention_rows)``.

        Runs with gradients disabled and with both modules in eval mode, so
        dropout does not perturb predictions. The previous training flags are
        restored afterwards.
        """
        previous_modes = self._enter_eval_mode()
        try:
            with torch.no_grad():
                input_tensor = tensorFromword(self.input_lang, word)
                encoder_outputs, decoder_hidden = self._encode(input_tensor)

                decoder_input = torch.tensor([[SOS_token]], device=device)
                decoded_word = ''
                decoder_attentions = torch.zeros(self.max_length, self.max_length)

                for j in range(self.max_length):
                    decoder_output, decoder_hidden, decoder_attention = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
                    decoder_attentions[j] = decoder_attention.data
                    _, topi = decoder_output.topk(1)
                    if topi.item() == EOS_token:
                        break
                    decoded_word += self.output_lang.index2letter[topi.item()]
                    decoder_input = topi.squeeze().detach()

                return decoded_word, decoder_attentions[:j+1]
        finally:
            self._restore_modes(previous_modes)

    def evaluateData(self, data):
        """Return the fraction of ``(word, target)`` pairs decoded exactly right.

        Returns ``0.0`` for empty ``data``. Both modules are held in eval mode
        for the whole pass and their previous training flags are restored.
        """
        if len(data) == 0:
            return 0.0
        previous_modes = self._enter_eval_mode()
        try:
            correct = 0
            for word, target in data:
                output_word, _ = self.evaluate(word)
                correct += (output_word == target)
        finally:
            self._restore_modes(previous_modes)
        return correct / len(data)
            

In [None]:
# Load the train/validation/test word pairs and build the vocabularies of both
# languages; these module-level names are read by the sweep's `run` function.
train_pairs, val_pairs, test_pairs, input_lang, output_lang = readLang('eng', 'hin')

# Disabled example: building a single GRU encoder/decoder pair by hand.
# encoder = Encoder('GRU', input_lang.n_letters, 512, 512, 0, 1).to(device)
# decoder = AttnDecoder('GRU', output_lang.n_letters, 512, 0, 1).to(device)

In [None]:
# train = Train(train_pairs, encoder, decoder, nn.NLLLoss())
# train.trainIters('SGD', 0.01, print_every=1000, epochs=10)

In [None]:
# Sweep definition passed to wandb.sweep: hyper-parameter values are sampled
# with the 'random' method and runs are ranked by the 'valid_acc' metric.
sweep_config = {
    'method': 'random',
    # goal is to maximize the validation accuracy
    'metric': {'name': 'valid_acc', 'goal': 'maximize'},
    'parameters': {
        'optimizer': {'values': ['SGD', 'Adam', 'RMSprop']},
        'learning_rate': {'values': [1e-4, 5e-4, 0.001, 0.005]},
        'epochs': {'values': [10]},
        'hid_layers': {'values': [1]},
        'emb_size': {'values': [64, 128, 256, 512]},
        'hidden_size': {'values': [64, 128, 256, 512]},
        'dropout': {'values': [0, 0.1, 0.2, 0.3]},
        'type_t': {'values': ['RNN', 'LSTM', 'GRU']},
    },
}


In [None]:
def run():
    """Execute one sweep trial: build the model from the wandb config and train it.

    Reads the module-level ``train_pairs``, ``input_lang``, ``output_lang``
    and ``device``. Hyper-parameters come from ``wandb.config``; the values in
    ``config_defaults`` apply when the sweep does not override them.
    """
    # Default values for hyper-parameters
    config_defaults = {
        'optimizer': 'Adam',
        'learning_rate': 0.005,
        'epochs': 10,
        'hid_layers': 1,
        'emb_size': 256,
        'hidden_size': 256,
        'dropout': 0.1,
        'type_t': 'GRU'
    }
    # Using the run as a context manager finishes it even when training
    # raises, instead of leaving the run open for the next trial.
    with wandb.init(config=config_defaults):
        config = wandb.config # hyper-parameters chosen for this trial
        encoder = Encoder(config.type_t, input_lang.n_letters, config.emb_size, config.hidden_size, config.dropout, config.hid_layers).to(device)
        decoder = AttnDecoder(config.type_t, output_lang.n_letters, config.hidden_size, config.dropout, config.hid_layers).to(device)
        trainer = Train(train_pairs, encoder, decoder, nn.NLLLoss())
        trainer.trainIters(config.optimizer, config.learning_rate, print_every=1000, epochs=config.epochs)




# Register the sweep under the 'assignment-3-attention' project, then start an
# agent that calls `run` for this sweep with count=10.
sweep_id = wandb.sweep(sweep_config, project='assignment-3-attention')
wandb.agent(sweep_id, function=run, count=10)