<a href="https://colab.research.google.com/github/JayantJharkhande3000/CS6910_Assignment_03_EtoH/blob/main/Question_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.models import Sequential,Model,load_model
from keras.layers import Dense,LSTM,GRU,SimpleRNN,Input,Dropout,TimeDistributed,RepeatVector,dot,BatchNormalization,concatenate,multiply,Activation
from keras.layers.embeddings import Embedding
from keras.layers import Layer
from keras.preprocessing import sequence
from tensorflow.keras.optimizers import Adam,Adadelta,Nadam,SGD
from keras.losses import SparseCategoricalCrossentropy

In [2]:
# Mount Google Drive inside the Colab VM; the lexicon paths used below live
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the Hindi transliteration lexicon splits on the mounted Drive.
_LEXICON_DIR = "/content/drive/MyDrive/DATA/lexicons"
_LEXICON_FILE_TEMPLATE = "hi.translit.sampled.{split}.tsv"

train_data_path = _LEXICON_DIR + "/" + _LEXICON_FILE_TEMPLATE.format(split="train")
test_data_path = _LEXICON_DIR + "/" + _LEXICON_FILE_TEMPLATE.format(split="test")
validation_data_path = _LEXICON_DIR + "/" + _LEXICON_FILE_TEMPLATE.format(split="dev")

In [13]:
class English_to_Hindi_Attention(Layer):
  """Additive attention over a sequence of values.

  Each position is scored as V(tanh(W1(query) + W2(values))), where W1, W2
  and V are Dense layers; the scores are softmax-normalised along axis 1.
  """

  def __init__(self, units):
    """Create the projections: W1 and W2 have `units` outputs, V has one."""
    super().__init__()
    self.W1 = Dense(units)
    self.W2 = Dense(units)
    self.V = Dense(1)

  def call(self, query, values):
    """Return `(context_vector, attention_weights)` for `query` over `values`.

    assumes `query` is (batch, hidden) and `values` is (batch, time, hidden)
    -- TODO confirm against the callers.
    """
    # Add an axis at position 1 so the projected query broadcasts over `values`.
    expanded_query = tf.expand_dims(query, 1)
    projected_query = self.W1(expanded_query)
    projected_values = self.W2(values)
    score = self.V(tf.nn.tanh(projected_query + projected_values))

    # Normalise along axis 1, then collapse that axis with a weighted sum.
    attention_weights = tf.nn.softmax(score, axis=1)
    weighted_values = attention_weights * values
    context_vector = tf.reduce_sum(weighted_values, axis=1)

    return context_vector, attention_weights

In [14]:
class Encoder(Model):
  """Embedding followed by a single recurrent layer (GRU, LSTM or SimpleRNN).

  The recurrent layer is stored under the attribute named after `cell`
  (`self.gru`, `self.lstm` or `self.rnn`).
  """

  # Cell names accepted by the constructor.
  SUPPORTED_CELLS = ("gru", "lstm", "rnn")

  def __init__(self,cell,vocab_size, embedding_dim, latent_dim, batch_size,initializer,dropouts):
    """Build the embedding and the recurrent layer selected by `cell`.

    Args:
      cell: one of "gru", "lstm", "rnn".
      vocab_size: number of rows of the embedding table.
      embedding_dim: width of the embedding vectors.
      latent_dim: number of units of the recurrent layer.
      batch_size: batch size used by `initialize_hidden_state`.
      initializer: passed as `recurrent_initializer` to the recurrent layer.
      dropouts: passed as `dropout` to the recurrent layer.

    Raises:
      ValueError: if `cell` is not a supported cell name.
    """
    # Fail fast: without this check an unknown cell built a model with no
    # recurrent layer, which only surfaced later as an error inside `call`.
    if cell not in self.SUPPORTED_CELLS:
      raise ValueError(
          "Unsupported cell %r; expected one of %s" % (cell, ", ".join(self.SUPPORTED_CELLS)))

    super(Encoder, self).__init__()
    self.cell = cell
    self.batch_size = batch_size
    self.latent_dim = latent_dim
    self.embedding = Embedding(vocab_size, embedding_dim)
    if cell == "gru":
        self.gru = GRU(latent_dim,return_sequences=True,return_state=True,recurrent_initializer = initializer,dropout=dropouts)
    elif cell == "lstm":
        self.lstm = LSTM(latent_dim,return_sequences=True,return_state=True,recurrent_initializer = initializer,dropout=dropouts)
    else:
        self.rnn = SimpleRNN(latent_dim,return_sequences=True,return_state=True,recurrent_initializer = initializer,dropout=dropouts)


  def call(self, x, hidden):
    """Embed `x`, run the recurrent layer from `hidden`, return `(output, state)`.

    For the LSTM only the first returned state is passed back; the second
    state tensor is discarded.
    """
    x = self.embedding(x)
    if self.cell == "gru":
        output, state = self.gru(x, initial_state=hidden)
    elif self.cell == "lstm":
        output, state, _ = self.lstm(x, initial_state=hidden)
    else:
        output, state = self.rnn(x, initial_state=hidden)
    return output, state

  def initialize_hidden_state(self):
      """Return an all-zero initial state: a list of two tensors for LSTM, else one tensor."""
      if self.cell == 'lstm':
          return [tf.zeros((self.batch_size, self.latent_dim)),tf.zeros((self.batch_size, self.latent_dim))]
      return tf.zeros((self.batch_size, self.latent_dim))

In [15]:
class Decoder(Model):
  """Single-step decoder: attention, embedding, recurrent layer, Dense projection.

  The recurrent layer is stored under the attribute named after `cell`
  (`self.gru`, `self.lstm` or `self.rnn`).
  """

  # Cell names accepted by the constructor.
  SUPPORTED_CELLS = ("gru", "lstm", "rnn")

  def __init__(self, cell, vocab_size, embedding_dim, latent_dim, batch_size,initializer,dropouts):
    """Build the attention, embedding, output and recurrent layers.

    Args:
      cell: one of "gru", "lstm", "rnn".
      vocab_size: embedding rows and width of the output projection.
      embedding_dim: width of the embedding vectors.
      latent_dim: units of the recurrent layer and of the attention projections.
      batch_size: stored on the instance; not read by `call`.
      initializer: passed as `recurrent_initializer` to the recurrent layer.
      dropouts: passed as `dropout` to the recurrent layer.

    Raises:
      ValueError: if `cell` is not a supported cell name.
    """
    # Fail fast: without this check an unknown cell built a model with no
    # recurrent layer, which only surfaced later as an error inside `call`.
    if cell not in self.SUPPORTED_CELLS:
      raise ValueError(
          "Unsupported cell %r; expected one of %s" % (cell, ", ".join(self.SUPPORTED_CELLS)))

    super(Decoder, self).__init__()
    self.cell = cell
    self.batch_size = batch_size
    self.attention = English_to_Hindi_Attention(latent_dim)
    self.embedding = Embedding(vocab_size, embedding_dim)
    self.dense = Dense(vocab_size)
    if cell == "gru":
        self.gru = GRU(latent_dim,return_sequences=True,return_state=True,recurrent_initializer = initializer,dropout=dropouts)
    elif cell == "lstm":
        self.lstm = LSTM(latent_dim,return_sequences=True,return_state=True,recurrent_initializer = initializer,dropout=dropouts)
    else:
        self.rnn = SimpleRNN(latent_dim,return_sequences=True,return_state=True,recurrent_initializer = initializer,dropout=dropouts)

  def call(self, x, hidden, enc_output):
    """Run one decoding step and return `(logits, state, attention_weights)`.

    `hidden` is the attention query over `enc_output`; the resulting context
    vector is concatenated in front of the embedded input on the last axis.
    """
    context_vector, attention_weights = self.attention(hidden, enc_output)

    x = self.embedding(x)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    # NOTE(review): the recurrent layer is called without `initial_state`, so
    # `hidden` influences this step only through the attention context --
    # confirm this is intended.
    if self.cell == "gru":
        output, state = self.gru(x)
    elif self.cell == "lstm":
        output, state, _ = self.lstm(x)
    else:
        output, state = self.rnn(x)

    # Merge the leading axes so the Dense layer sees one row per position.
    output = tf.reshape(output, (-1, output.shape[2]))
    x = self.dense(output)

    return x, state, attention_weights

In [16]:
class Attention:
    """Character-level transliteration trainer built on `Encoder`/`Decoder`.

    Typical use: construct, call `create_data()`, then `run()`, then one of
    the `percentage_of_correct_*_predictions()` methods.
    """

    # Default lexicon locations used by `create_data` when no path is passed.
    DEFAULT_TRAIN_PATH = "/content/drive/MyDrive/DATA/lexicons/hi.translit.sampled.train.tsv"
    DEFAULT_CV_PATH = "/content/drive/MyDrive/DATA/lexicons/hi.translit.sampled.dev.tsv"
    DEFAULT_TEST_PATH = "/content/drive/MyDrive/DATA/lexicons/hi.translit.sampled.test.tsv"

    def __init__(self,cell,embedding_size,latent_dim,optimizer,dropouts,batch_size,epochs,initializer):
        """Store the hyper-parameters; no model or data is created here.

        Args:
            cell: recurrent cell name forwarded to `Encoder`/`Decoder`.
            embedding_size: embedding width for both encoder and decoder.
            latent_dim: recurrent units for both encoder and decoder.
            optimizer: "nadam", "sgd" or "adadelta"; any other value selects Adam.
            dropouts: dropout rate forwarded to the recurrent layers.
            batch_size: training batch size.
            epochs: number of training epochs.
            initializer: recurrent initializer forwarded to the recurrent layers.
        """
        self.cell = cell
        self.embedding_dim = embedding_size
        self.latent_dim = latent_dim
        self.BATCH_SIZE = batch_size
        self.epochs = epochs
        self.opt = optimizer
        self.dropouts=dropouts
        self.initializer=initializer

    @tf.function()
    def train_step(self, inp, targ, enc_hidden):
        """Run one teacher-forced optimisation step and return the batch loss.

        Args:
            inp: batch of padded input index sequences.
            targ: batch of padded target index sequences.
            enc_hidden: initial encoder state.

        Returns:
            The summed per-step loss divided by the target sequence length.
        """
        loss = 0
        with tf.GradientTape() as tape:
            # training=True so the `dropout` configured on the recurrent
            # layers is actually applied while fitting.
            enc_output, enc_hidden = self.encoder(inp, enc_hidden, training=True)
            dec_hidden = enc_hidden
            # The decoder consumes target-vocabulary indices, so the start
            # token is looked up there (as `evaluate` does).
            dec_input = tf.expand_dims([self.target_token_index['\t']] * self.BATCH_SIZE, 1)

            for t in range(1, targ.shape[1]):
                predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output, training=True)
                loss += self.loss_function(targ[:, t], predictions)
                # Teacher forcing: feed the ground-truth character next.
                dec_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = (loss / int(targ.shape[1]))
        # `decoder.trainable_variables` already contains the attention layer's
        # weights; listing them again would apply their gradients twice.
        variables = self.encoder.trainable_variables + self.decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        return batch_loss

    @staticmethod
    def _read_pairs(path):
        """Read a header-less tab-separated file, skipping malformed lines and rows with missing values."""
        # NOTE(review): read_csv's default NA parsing may turn literal words
        # such as "nan" or "null" into missing values that dropna() then
        # removes -- confirm this is acceptable for the lexicon.
        try:
            # `error_bad_lines` was deprecated in pandas 1.3 and later removed;
            # "warn" keeps the skip-and-warn behaviour it had.
            frame = pd.read_csv(path,sep="\t",header=None,on_bad_lines="warn")
        except TypeError:
            # pandas older than 1.3 does not know `on_bad_lines`.
            frame = pd.read_csv(path,sep="\t",header=None,error_bad_lines=False)
        return frame.dropna()

    def get_data(self,path):
        """Encode one lexicon file with the vocabularies built by `create_data`.

        Column 1 is encoded with `input_token_index` and column 0 with
        `target_token_index`; every word is wrapped in '\\t' ... '\\n'.

        Returns:
            `(encoder_sequences, decoder_sequences, decoder_target_data)`:
            two lists of index lists, and a float32 one-hot array of shape
            `(rows, max_length_y, decoder_tokens)` holding the target shifted
            one step left and filled with '\\n' after the end of the word.
        """
        d = self._read_pairs(path)

        decoder_target_data = np.zeros((d.shape[0],self.max_length_y,self.decoder_tokens), dtype="float32")

        for i,target_text in enumerate(d[0]):
            target_text = '\t'+target_text+'\n'
            for t, char in enumerate(target_text):
                if t > 0:
                    decoder_target_data[i, t - 1, self.target_token_index[char]] = 1.0
            # `t` is now the index of the final '\n'; fill the remaining steps with it.
            decoder_target_data[i, t:, self.target_token_index["\n"]] = 1.0

        encoder_sequences = [[self.input_token_index[letter] for letter in '\t'+word+'\n'] for word in d[1]]
        decoder_sequences = [[self.target_token_index[letter] for letter in '\t'+word+'\n'] for word in d[0]]
        return encoder_sequences, decoder_sequences, decoder_target_data

    def create_vocab(self,path):
        """Collect the character sets and maximum wrapped word lengths of a lexicon file.

        Returns:
            `(hindi_list, english_list, max_length_x, max_length_y)` where the
            lists are the sorted characters of column 0 and column 1, and the
            lengths are the longest '\\t'+word+'\\n' in column 1 and column 0.
        """
        d = self._read_pairs(path)

        x = [list('\t'+word+'\n') for word in np.array(d[1])]
        y = [list('\t'+word+'\n') for word in np.array(d[0])]

        english_list = sorted({char for word in x for char in word})
        hindi_list = sorted({char for word in y for char in word})

        max_length_x = (np.max([len(i) for i in x]))
        max_length_y = (np.max([len(i) for i in y]))

        return hindi_list,english_list,max_length_x,max_length_y

    def create_data(self, train_path=None, cv_path=None, test_path=None):
        """Build vocabularies from the training file and encode all three splits.

        Args:
            train_path, cv_path, test_path: optional file locations; each
                falls back to the matching `DEFAULT_*_PATH` when omitted.

        Sets the token dictionaries, the padded `encoder_*`/`decoder_*` arrays,
        the one-hot `decoder_target_*` arrays and the shuffled, batched
        training `dataset`.
        """
        train_path = self.DEFAULT_TRAIN_PATH if train_path is None else train_path
        cv_path = self.DEFAULT_CV_PATH if cv_path is None else cv_path
        test_path = self.DEFAULT_TEST_PATH if test_path is None else test_path

        hindi_list,english_list,self.max_length_x,self.max_length_y = self.create_vocab(train_path)
        self.encoder_tokens = len(english_list)
        self.decoder_tokens = len(hindi_list)

        # Dict for char to index
        self.input_token_index = {char: i for i, char in enumerate(english_list)}
        self.target_token_index = {char: i for i, char in enumerate(hindi_list)}

        # Dict for index to char
        self.inv_input_token_index = {i: char for char, i in self.input_token_index.items()}
        self.inv_target_token_index = {i: char for char, i in self.target_token_index.items()}

        encoder_train,decoder_train,self.decoder_target_train = self.get_data(train_path)
        encoder_cv,decoder_cv,self.decoder_target_cv = self.get_data(cv_path)
        encoder_test,decoder_test,self.decoder_target_test = self.get_data(test_path)

        self.encoder_train = sequence.pad_sequences(encoder_train,maxlen=self.max_length_x,padding="post")
        self.decoder_train = sequence.pad_sequences(decoder_train,maxlen=self.max_length_y,padding="post")
        self.encoder_cv = sequence.pad_sequences(encoder_cv,maxlen=self.max_length_x,padding="post")
        self.decoder_cv = sequence.pad_sequences(decoder_cv,maxlen=self.max_length_y,padding="post")
        self.encoder_test = sequence.pad_sequences(encoder_test,maxlen=self.max_length_x,padding="post")
        self.decoder_test = sequence.pad_sequences(decoder_test,maxlen=self.max_length_y,padding="post")

        self.BUFFER_SIZE = len(self.encoder_train)
        self.steps_per_epoch = len(self.encoder_train)//self.BATCH_SIZE

        self.dataset = tf.data.Dataset.from_tensor_slices((self.encoder_train, self.decoder_train)).shuffle(self.BUFFER_SIZE)
        self.dataset = self.dataset.batch(self.BATCH_SIZE, drop_remainder=True)


    def loss_function(self,real, pred):
        """Mean of the per-example loss with entries where `real == 0` zeroed out.

        The padded sequences are filled with 0 by `pad_sequences`; index 0 is
        presumably also '\\t', the lowest-sorting vocabulary character -- verify.
        """
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = self.loss_object(real, pred)
        loss_ *= tf.cast(mask, dtype=loss_.dtype)

        return tf.reduce_mean(loss_)

    def run(self):
        """Create the optimizer, loss, encoder and decoder, then train for `epochs` epochs.

        Prints the mean batch loss after every epoch. `create_data()` must
        have been called first, since the vocabulary sizes and the dataset
        are read from the instance.
        """
        # Compile & run training
        optimizer_classes = {"nadam": Nadam, "sgd": SGD, "adadelta": Adadelta}
        self.optimizer = optimizer_classes.get(self.opt, Adam)()

        self.loss_object = SparseCategoricalCrossentropy(from_logits=True,reduction='none')

        self.encoder = Encoder(self.cell,self.encoder_tokens, self.embedding_dim, self.latent_dim, self.BATCH_SIZE, self.initializer,self.dropouts)
        self.decoder = Decoder(self.cell,self.decoder_tokens, self.embedding_dim, self.latent_dim, self.BATCH_SIZE, self.initializer,self.dropouts)

        for epoch in range(self.epochs):
            enc_hidden = self.encoder.initialize_hidden_state()
            total_loss = 0

            for inp, targ in self.dataset.take(self.steps_per_epoch):
                total_loss += self.train_step(inp, targ, enc_hidden)

            print(f'Epoch {epoch+1} Loss {total_loss/self.steps_per_epoch:.4f}   ')



    def evaluate(self,sentence_vect,attention=False):
        """Greedily decode one encoded input sequence.

        Args:
            sentence_vect: one padded sequence of input indices.
            attention: when True, also return the attention weights.

        Returns:
            The decoded string, or `(string, att_plot)` when `attention` is
            True, where `att_plot` is a `(max_length_y, max_length_x)` array
            with one row per decoding step. Decoding stops at the first
            predicted '\\n' or after `max_length_y` steps.
        """
        if attention:
            att_plot = np.zeros((self.max_length_y,self.max_length_x))
        inputs = tf.convert_to_tensor(sentence_vect)
        inputs = tf.expand_dims(inputs,0)
        result = ''
        if self.cell == "lstm":
            hidden = [tf.zeros((1, self.latent_dim)),tf.zeros((1, self.latent_dim))]
        else:
            hidden = [tf.zeros((1, self.latent_dim))]
        enc_out, enc_hidden = self.encoder(inputs, hidden)

        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([self.target_token_index['\t']], 0)

        for t in range(self.max_length_y):
            predictions, dec_hidden, attention_weights = self.decoder(dec_input, dec_hidden, enc_out)

            if attention:
                att_plot[t] = (tf.reshape(attention_weights,(-1,))).numpy()

            predicted_id = tf.argmax(predictions[0]).numpy()
            predicted_char = self.inv_target_token_index[predicted_id]

            if predicted_char == "\n":
                break
            result += predicted_char

            # Feed the prediction back in as the next decoder input.
            dec_input = tf.expand_dims([predicted_id], 0)

        if attention:
            return result,att_plot
        return result

    def _exact_match_accuracy(self, encoder_seqs, decoder_seqs):
        """Fraction of sequences whose greedy decoding equals the reference word exactly."""
        total = len(decoder_seqs)
        if total == 0:
            # Nothing to score; avoid dividing by zero.
            return 0.0

        count = 0
        for source_ids, target_ids in zip(encoder_seqs, decoder_seqs):
            # Rebuild the reference word: skip the leading '\t', stop at '\n'.
            actual = ""
            for x in target_ids[1:]:
                char = self.inv_target_token_index[x]
                if char == "\n":
                    break
                actual += char

            if actual == self.evaluate(source_ids):
                count += 1

        return count/total

    def percentage_of_correct_test_predictions(self):
        """Exact-match accuracy on the test split, as a fraction in [0, 1]."""
        return self._exact_match_accuracy(self.encoder_test, self.decoder_test)

    def percentage_of_correct_cv_predictions(self):
        """Exact-match accuracy on the validation split, as a fraction in [0, 1]."""
        return self._exact_match_accuracy(self.encoder_cv, self.decoder_cv)


In [9]:
# Install or upgrade the Weights & Biases client in the Colab VM (the recorded
# run downloaded wandb 0.12.16).
# NOTE(review): the version is not pinned, so a later re-run may pick up a
# different release -- consider pinning.
!pip install wandb --upgrade

Collecting wandb
  Downloading wandb-0.12.16-py2.py3-none-any.whl (1.8 MB)
[?25l[K     |▏                               | 10 kB 28.4 MB/s eta 0:00:01[K     |▍                               | 20 kB 31.2 MB/s eta 0:00:01[K     |▌                               | 30 kB 33.5 MB/s eta 0:00:01[K     |▊                               | 40 kB 17.0 MB/s eta 0:00:01[K     |█                               | 51 kB 14.5 MB/s eta 0:00:01[K     |█                               | 61 kB 16.4 MB/s eta 0:00:01[K     |█▎                              | 71 kB 16.0 MB/s eta 0:00:01[K     |█▌                              | 81 kB 15.3 MB/s eta 0:00:01[K     |█▋                              | 92 kB 16.6 MB/s eta 0:00:01[K     |█▉                              | 102 kB 17.0 MB/s eta 0:00:01[K     |██                              | 112 kB 17.0 MB/s eta 0:00:01[K     |██▏                             | 122 kB 17.0 MB/s eta 0:00:01[K     |██▍                             | 133 kB 17.0 MB/s eta

In [10]:
import wandb
from wandb.keras import WandbCallback

In [11]:
# Interactive login: prompts for a W&B API key and stores it in the netrc file
# (see the recorded output), so no key is written into the notebook itself.
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [17]:
# Start a W&B run in the "Question5" project. train() further down calls
# wandb.init again for each sweep trial.
# NOTE(review): this run looks like it is not used by the sweep itself -- confirm it is needed.
wandb.init(project="Question5", entity="jharkhandejayant")

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [18]:
# Random-search space for the attention model. Every parameter key matches an
# attribute that train() reads from wandb.config.
sweep_config={
    'method': 'random',
    'metric': {
        # Must equal the key passed to wandb.log in train(); the runs log
        # 'validation accuracy', so that is the metric the sweep maximises.
        'name': 'validation accuracy',
        'goal': 'maximize'
    },
    'parameters':{
        'epochs':{
            'values':[3,5]
        },
        'embedding_size':{
            'values':[8,12,16,20]
        },
        'cell':{
            'values':["gru","lstm","rnn"]
        },
        'dropouts':{
            'values':[0,0.2,0.3]
        },
        'latent_dim':{
            'values':[16,32,64,128]
        },
        'batch_size':{
            'values':[32,64]
        },
        'optimizers':{
            'values':["nadam","adam","sgd","adadelta"]
        },
       'initializer':{
            'values':["orthogonal","glorot_uniform"]
        }
    }
}

In [20]:
# Register the sweep defined by sweep_config with W&B and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project="Question5", entity="jharkhandejayant")


Create sweep with ID: 3qngqjc6
Sweep URL: https://wandb.ai/jharkhandejayant/Question5/sweeps/3qngqjc6


In [21]:
def train():
    """Train one model from the active W&B config and log its validation accuracy.

    The defaults below apply to any hyper-parameter the W&B config does not
    supply. The exact-match score on the validation split is logged under the
    key 'validation accuracy'.
    """
    default_hyperparameters = dict(
        epochs=5,
        embedding_size=16,
        dropouts=0.3,
        optimizers="adam",
        cell="lstm",
        latent_dim=64,
        batch_size=64,
        initializer="glorot_uniform",
    )

    wandb.init(config=default_hyperparameters)
    cfg = wandb.config

    transliterator = Attention(
        cell=cfg.cell,
        embedding_size=cfg.embedding_size,
        latent_dim=cfg.latent_dim,
        optimizer=cfg.optimizers,
        dropouts=cfg.dropouts,
        batch_size=cfg.batch_size,
        epochs=cfg.epochs,
        initializer=cfg.initializer,
    )
    transliterator.create_data()
    transliterator.run()

    validation_accuracy = transliterator.percentage_of_correct_cv_predictions()
    wandb.log({'validation accuracy': validation_accuracy})


In [22]:
# Launch a sweep agent that calls train() once per configuration it receives.
# NOTE(review): no `count` is passed; the recorded output ends with a manual
# Ctrl+C, so the agent apparently keeps requesting runs until interrupted.
wandb.agent(sweep_id,train)


[34m[1mwandb[0m: Agent Starting Run: t72nobgm with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell: gru
[34m[1mwandb[0m: 	dropouts: 0
[34m[1mwandb[0m: 	embedding_size: 8
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	initializer: orthogonal
[34m[1mwandb[0m: 	latent_dim: 64
[34m[1mwandb[0m: 	optimizers: adadelta












Epoch 1 Loss 1.4440   
Epoch 2 Loss 1.4415   
Epoch 3 Loss 1.4381   


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
validation accuracy,▁

0,1
validation accuracy,0.0


[34m[1mwandb[0m: Agent Starting Run: cr2quijt with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell: lstm
[34m[1mwandb[0m: 	dropouts: 0
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	initializer: glorot_uniform
[34m[1mwandb[0m: 	latent_dim: 128
[34m[1mwandb[0m: 	optimizers: adam


Epoch 1 Loss 0.9588   
Epoch 2 Loss 0.7183   
Epoch 3 Loss 0.4190   
Epoch 4 Loss 0.2848   
Epoch 5 Loss 0.2300   


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
validation accuracy,▁

0,1
validation accuracy,0.27994


[34m[1mwandb[0m: Agent Starting Run: 3momfzjf with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell: rnn
[34m[1mwandb[0m: 	dropouts: 0
[34m[1mwandb[0m: 	embedding_size: 12
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	initializer: orthogonal
[34m[1mwandb[0m: 	latent_dim: 64
[34m[1mwandb[0m: 	optimizers: nadam


Epoch 1 Loss 0.8039   
Epoch 2 Loss 0.4716   
Epoch 3 Loss 0.3304   
Epoch 4 Loss 0.2847   
Epoch 5 Loss 0.2600   


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
validation accuracy,▁

0,1
validation accuracy,0.23313


[34m[1mwandb[0m: Agent Starting Run: 0crwpmci with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell: gru
[34m[1mwandb[0m: 	dropouts: 0.3
[34m[1mwandb[0m: 	embedding_size: 12
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	initializer: glorot_uniform
[34m[1mwandb[0m: 	latent_dim: 128
[34m[1mwandb[0m: 	optimizers: sgd


Epoch 1 Loss 1.2020   
Epoch 2 Loss 1.1117   
Epoch 3 Loss 1.0147   


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
validation accuracy,▁

0,1
validation accuracy,0.0


[34m[1mwandb[0m: Agent Starting Run: 3eku3hbi with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell: rnn
[34m[1mwandb[0m: 	dropouts: 0
[34m[1mwandb[0m: 	embedding_size: 20
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	initializer: glorot_uniform
[34m[1mwandb[0m: 	latent_dim: 32
[34m[1mwandb[0m: 	optimizers: sgd


Epoch 1 Loss 1.1955   
Epoch 2 Loss 1.1201   
Epoch 3 Loss 1.0335   


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
validation accuracy,▁

0,1
validation accuracy,0.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3yoxesef with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell: lstm
[34m[1mwandb[0m: 	dropouts: 0.3
[34m[1mwandb[0m: 	embedding_size: 16
[34m[1mwandb[0m: 	epochs: 3
[34m[1mwandb[0m: 	initializer: glorot_uniform
[34m[1mwandb[0m: 	latent_dim: 64
[34m[1mwandb[0m: 	optimizers: adadelta


Epoch 1 Loss 1.4446   


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
