<a href="https://www.kaggle.com/code/jaswanth431/dl-assignment-3-atten?scriptVersionId=177735535" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [31]:
import torch
from torch import nn
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import copy
from torch.utils.data import Dataset, DataLoader
import gc
import random
import wandb

In [32]:
# SECURITY: never commit a W&B API key to a notebook. The key that used to be
# hardcoded here is public and must be revoked in the W&B account settings.
# With no explicit key, wandb.login() picks up credentials from the
# WANDB_API_KEY environment variable or ~/.netrc, and prompts otherwise.
wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [33]:
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Special symbols added to every word: start marker, end marker and padding.
START_TOKEN = '<'
END_TOKEN = '>'
PAD_TOKEN = '_'

# Probability with which a decoder pass uses teacher forcing when it is enabled.
TEACHER_FORCING_RATIO = 0.5

# Header-less CSV files; column 0 is read as the source word, column 1 as the target word.
train_csv = "/kaggle/input/aksh11/aksharantar_sampled/tel/tel_train.csv"
test_csv = "/kaggle/input/aksh11/aksharantar_sampled/tel/tel_test.csv"
val_csv = "/kaggle/input/aksh11/aksharantar_sampled/tel/tel_valid.csv"

train_df = pd.read_csv(train_csv, header=None)
test_df = pd.read_csv(test_csv, header=None)
val_df = pd.read_csv(val_csv, header=None)

train_source = train_df[0].to_numpy()
train_target = train_df[1].to_numpy()
val_source = val_df[0].to_numpy()
val_target = val_df[1].to_numpy()

cuda


In [34]:
def add_padding(source_data, MAX_LENGTH):
    """Wrap every string in START/END tokens and force it to exactly MAX_LENGTH characters.

    Args:
        source_data: indexable collection of strings.
        MAX_LENGTH: length of every returned string.

    Returns:
        A list with one fixed-length string per input item. Strings that are
        too long after wrapping are cut on the right (which can drop the
        END_TOKEN); short ones are filled with PAD_TOKEN on the right.
    """
    def _fit(raw):
        wrapped = (START_TOKEN + raw + END_TOKEN)[:MAX_LENGTH]
        return wrapped + PAD_TOKEN * (MAX_LENGTH - len(wrapped))

    return [_fit(source_data[pos]) for pos in range(len(source_data))]


def generate_string_to_sequence(source_data, source_char_index_dict):
    """Encode every string as a tensor of character indices and batch them.

    Args:
        source_data: indexable collection of strings.
        source_char_index_dict: mapping from character to integer index.

    Returns:
        The tensor built by pad_sequence with batch_first=True; shorter rows
        are filled with the value 2.
    """
    encoded = [
        get_chars(source_data[pos], source_char_index_dict)
        for pos in range(len(source_data))
    ]
    return pad_sequence(encoded, batch_first=True, padding_value=2)


def get_chars(str, char_index_dict):
    """Return a tensor (on the global `device`) holding the index of each character of `str`.

    Raises:
        KeyError: if a character is missing from `char_index_dict`.
    """
    return torch.tensor([char_index_dict[symbol] for symbol in str], device=device)


def preprocess_data(source_data, target_data):
    """Build the vocabularies and the padded index tensors for a parallel corpus.

    Args:
        source_data: collection of source-side strings.
        target_data: collection of target-side strings, aligned with source_data.

    Returns:
        A dict with, for each side ("source"/"target"): the character list
        (`*_chars`), both lookup tables (`*_char_index`, `*_index_char`), the
        vocabulary size (`*_len`), the raw strings (`*_data`) and the encoded
        sequences (`*_data_seq`), plus `INPUT_MAX_LENGTH` / `OUTPUT_MAX_LENGTH`
        (longest raw string + 2 for the start and end tokens).
    """
    specials = [START_TOKEN, END_TOKEN, PAD_TOKEN]
    data = {
        "source_chars": list(specials),
        "target_chars": list(specials),
        "source_char_index": {token: pos for pos, token in enumerate(specials)},
        "source_index_char": {pos: token for pos, token in enumerate(specials)},
        "target_char_index": {token: pos for pos, token in enumerate(specials)},
        "target_index_char": {pos: token for pos, token in enumerate(specials)},
        "source_len": 3,
        "target_len": 3,
        "source_data": source_data,
        "target_data": target_data,
        "source_data_seq": [],
        "target_data_seq": [],
    }

    # +2 leaves room for the start and end tokens added by add_padding.
    data["INPUT_MAX_LENGTH"] = max(map(len, source_data)) + 2
    data["OUTPUT_MAX_LENGTH"] = max(map(len, target_data)) + 2

    padded_source_strings = add_padding(source_data, data["INPUT_MAX_LENGTH"])
    padded_target_strings = add_padding(target_data, data["OUTPUT_MAX_LENGTH"])

    def _register(symbols, side):
        # Give every not-yet-seen character of `symbols` the next free index.
        chars = data[side + "_chars"]
        to_index = data[side + "_char_index"]
        to_char = data[side + "_index_char"]
        for symbol in symbols:
            if to_index.get(symbol) is None:
                chars.append(symbol)
                new_index = len(chars) - 1
                to_index[symbol] = new_index
                to_char[new_index] = symbol

    for row in range(len(padded_source_strings)):
        _register(padded_source_strings[row], "source")
        _register(padded_target_strings[row], "target")

    data['source_data_seq'] = generate_string_to_sequence(padded_source_strings, data['source_char_index'])
    data['target_data_seq'] = generate_string_to_sequence(padded_target_strings, data['target_char_index'])

    data["source_len"] = len(data["source_chars"])
    data["target_len"] = len(data["target_chars"])

    return data

# data = preprocess_data(copy.copy(train_source), copy.copy(train_target))

In [35]:
def get_cell_type(cell_type):
    """Return the torch.nn recurrent layer class named by `cell_type`.

    Args:
        cell_type: one of "RNN", "LSTM" or "GRU".

    Returns:
        nn.RNN, nn.LSTM or nn.GRU (the class, not an instance).

    Raises:
        ValueError: for any other name. Failing here keeps the caller from
            receiving None and crashing later with "NoneType is not callable".
    """
    cells = {"RNN": nn.RNN, "LSTM": nn.LSTM, "GRU": nn.GRU}
    try:
        return cells[cell_type]
    except KeyError:
        raise ValueError(
            f"Specify correct cell type: expected one of {sorted(cells)}, got {cell_type!r}"
        ) from None
        
class Attention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Attend over `keys` with `query`.

        Args:
            query: tensor of shape (batch, hidden).
            keys: tensor of shape (seq_len, batch, hidden).

        Returns:
            context: (batch, 1, hidden) weighted sum of the keys.
            weights: (batch, 1, seq_len) attention weights, summing to 1 over seq_len.
        """
        # (seq_len, batch, 1); the query broadcasts over the seq_len axis.
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        # Drop only the trailing singleton axis. A bare squeeze() would also
        # remove the batch or seq_len axis whenever its size is 1.
        scores = scores.squeeze(-1)
        # Normalise over the sequence axis.
        weights = F.softmax(scores, dim=0)
        # (seq_len, batch) -> (batch, 1, seq_len)
        weights = weights.transpose(0, 1).unsqueeze(1)
        # (seq_len, batch, hidden) -> (batch, seq_len, hidden)
        keys = keys.permute(1, 0, 2)
        context = torch.bmm(weights, keys)
        return context, weights


class Encoder(nn.Module):
    """Character-level recurrent encoder that records its state after every input step.

    Args:
        h_params: mapping with "char_embd_dim", "hidden_layer_neurons",
            "number_of_layers", "cell_type" and "batch_size".
        data: mapping with "source_len" (vocabulary size) and "INPUT_MAX_LENGTH".
        device: device on which state tensors are allocated.
    """

    def __init__(self, h_params, data, device ):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(data["source_len"], h_params["char_embd_dim"])
        self.cell = get_cell_type(h_params["cell_type"])(h_params["char_embd_dim"], h_params["hidden_layer_neurons"],num_layers=h_params["number_of_layers"], batch_first=True)
        self.device=device
        self.h_params = h_params
        self.data = data

    def forward(self, input , encoder_curr_state):
        """Run the encoder one character at a time.

        Args:
            input: (batch, INPUT_MAX_LENGTH) tensor of character indices.
            encoder_curr_state: initial recurrent state (a tuple for LSTM).

        Returns:
            encoder_states: (INPUT_MAX_LENGTH, layers, batch, hidden) tensor
                holding the state recorded after each step.
            encoder_curr_state: the recurrent state after the last step.
        """
        input_length = self.data["INPUT_MAX_LENGTH"]
        # Read the batch size from the data so a short final batch is shaped correctly.
        batch_size = input.size(0)
        hidden_neurons = self.h_params["hidden_layer_neurons"]
        layers = self.h_params["number_of_layers"]
        is_lstm = self.h_params["cell_type"] == "LSTM"
        # Allocate on the device given to the constructor, not on a module-level global.
        encoder_states = torch.zeros(input_length, layers, batch_size, hidden_neurons, device=self.device)
        for i in range(input_length):
            current_input = input[:, i].view(batch_size, 1)
            _, encoder_curr_state = self.forward_step(current_input, encoder_curr_state)
            # NOTE(review): for LSTM the state is a tuple and element 1 is recorded;
            # per the torch.nn.LSTM docs that is the cell state c_n, not h_n.
            # Decoder.forward_step queries with the same element. Confirm this is intended.
            encoder_states[i] = encoder_curr_state[1] if is_lstm else encoder_curr_state
        return encoder_states, encoder_curr_state

    def forward_step(self, current_input, prev_state):
        """Embed one column of character indices and advance the recurrent cell by one step."""
        embd_input = self.embedding(current_input)
        output, prev_state = self.cell(embd_input, prev_state)
        return output, prev_state

    def getInitialState(self, batch_size=None):
        """Return an all-zero state of shape (layers, batch, hidden).

        Args:
            batch_size: number of sequences; defaults to h_params["batch_size"].
        """
        if batch_size is None:
            batch_size = self.h_params["batch_size"]
        return torch.zeros(self.h_params["number_of_layers"], batch_size, self.h_params["hidden_layer_neurons"], device=self.device)

    
class Decoder(nn.Module):
    """Recurrent decoder that attends over the encoder states at every output step.

    Args:
        h_params: mapping with "char_embd_dim", "hidden_layer_neurons",
            "number_of_layers" and "cell_type".
        data: mapping with "target_len", "target_char_index" and "OUTPUT_MAX_LENGTH".
        device: device on which the start-token input is allocated.
    """

    def __init__(self, h_params, data,device):
        super(Decoder, self).__init__()
        self.attention = Attention(h_params["hidden_layer_neurons"]).to(device)
        self.embedding = nn.Embedding(data["target_len"], h_params["char_embd_dim"])
        self.cell = get_cell_type(h_params["cell_type"])(h_params["hidden_layer_neurons"] +h_params["char_embd_dim"], h_params["hidden_layer_neurons"],num_layers=h_params["number_of_layers"], batch_first=True)
        self.fc = nn.Linear(h_params["hidden_layer_neurons"], data["target_len"])
        self.softmax = nn.LogSoftmax(dim=2)
        self.h_params = h_params
        self.data = data
        self.device = device

    def forward(self, decoder_current_state, encoder_final_layers, target_batch, loss_fn, teacher_forcing_enabled=True):
        """Decode OUTPUT_MAX_LENGTH characters for every sequence in the batch.

        Args:
            decoder_current_state: initial recurrent state (a tuple for LSTM).
            encoder_final_layers: (input_len, batch, hidden) states to attend over.
            target_batch: (batch, OUTPUT_MAX_LENGTH) gold indices, or None for
                pure inference (no loss, no accuracy).
            loss_fn: callable(log_probs, targets); only used when target_batch is given.
            teacher_forcing_enabled: when True, the whole call uses teacher
                forcing with probability TEACHER_FORCING_RATIO.

        Returns:
            decoder_actual_output: (batch, OUTPUT_MAX_LENGTH) predicted indices.
            attentions: list with the attention weights of every step.
            loss: loss summed over all steps (0 when target_batch is None).
            correct: number of sequences predicted exactly (0 when target_batch is None).
        """
        # Read the batch size from the data so any batch size (including 1) works.
        batch_size = encoder_final_layers.size(1)
        output_length = self.data["OUTPUT_MAX_LENGTH"]
        has_targets = target_batch is not None

        decoder_current_input = torch.full((batch_size, 1), self.data["target_char_index"][START_TOKEN], device=self.device)
        decoder_actual_output = []
        attentions = []
        loss = 0

        use_teacher_forcing = False
        if teacher_forcing_enabled:
            use_teacher_forcing = random.random() < TEACHER_FORCING_RATIO

        for i in range(output_length):
            decoder_output, decoder_current_state, attn_weights = self.forward_step(decoder_current_input, decoder_current_state, encoder_final_layers)
            attentions.append(attn_weights)
            _, topi = decoder_output.topk(1)
            # view() instead of a bare squeeze() keeps the batch axis when batch_size == 1.
            predicted = topi.view(batch_size).detach()
            decoder_actual_output.append(predicted)

            if has_targets:
                # Step i is scored against target position i.
                loss += loss_fn(decoder_output[:, -1, :], target_batch[:, i])

            if has_targets and use_teacher_forcing:
                # The next step predicts position i+1, so it must receive the gold
                # character of position i. Feeding target[:, i+1] would hand the
                # model the very label it is about to be scored on.
                decoder_current_input = target_batch[:, i].view(batch_size, 1)
            else:
                decoder_current_input = predicted.view(batch_size, 1)

        # List of (batch,) tensors -> (batch, OUTPUT_MAX_LENGTH).
        decoder_actual_output = torch.stack(decoder_actual_output, dim=1)

        if has_targets:
            # A sequence counts as correct only when every position matches.
            correct = (decoder_actual_output == target_batch).all(dim=1).sum().item()
        else:
            correct = 0
        return decoder_actual_output, attentions, loss, correct

    def forward_step(self, current_input, prev_state, encoder_final_layers):
        """Advance the decoder by one character.

        Returns the log-probabilities over the target vocabulary, the new
        recurrent state and the attention weights of this step.
        """
        embd_input = self.embedding(current_input)
        # The attention query is the last layer of the previous state.
        # NOTE(review): for LSTM, element 1 of the state tuple is used; per the
        # torch.nn.LSTM docs that is the cell state c_n. Confirm this is intended.
        if self.h_params["cell_type"] == "LSTM":
            context , attn_weights = self.attention(prev_state[1][-1,:,:], encoder_final_layers)
        else:
            context , attn_weights = self.attention(prev_state[-1,:,:], encoder_final_layers)
        curr_embd = F.relu(embd_input)
        # The recurrent cell input is the embedded character concatenated with the context.
        input_gru = torch.cat((curr_embd, context), dim=2)
        output, prev_state = self.cell(input_gru, prev_state)
        output = self.softmax(self.fc(output))
        return output, prev_state, attn_weights

In [36]:
class MyDataset(Dataset):
    """Dataset pairing the i-th source sequence with the i-th target sequence."""

    def __init__(self, data):
        # data[0] holds the source sequences, data[1] the target sequences.
        self.source_data_seq, self.target_data_seq = data[0], data[1]

    def __len__(self):
        """Number of source sequences."""
        return len(self.source_data_seq)

    def __getitem__(self, idx):
        """Return the (source, target) pair stored at position `idx`."""
        return self.source_data_seq[idx], self.target_data_seq[idx]


In [37]:
   
def evaluate(encoder, decoder, data, dataloader, device, h_params, loss_fn, use_teacher_forcing = False):
    """Compute exact-match accuracy and mean per-batch loss over `dataloader`.

    The pass runs under torch.no_grad(): evaluation needs no gradients, and
    summing losses that carry their graphs would keep every batch's
    activations alive.

    Args:
        encoder, decoder: the modules to evaluate.
        data: vocabulary mapping passed to make_strings for the sample printout.
        dataloader: yields (source_batch, target_batch) pairs.
        device: unused here; kept for interface compatibility.
        h_params: mapping; "cell_type" selects the LSTM state layout.
        loss_fn: loss callable forwarded to the decoder.
        use_teacher_forcing: forwarded to the decoder.

    Returns:
        (accuracy, total_loss): fraction of sequences predicted exactly, and
        the decoder loss (summed over output steps) averaged over batches.
    """
    correct_predictions = 0
    total_loss = 0
    total_predictions = len(dataloader.dataset)
    number_of_batches = len(dataloader)

    with torch.no_grad():
        for batch_num, (source_batch, target_batch) in enumerate(dataloader):
            encoder_initial_state = encoder.getInitialState()
            if h_params["cell_type"] == "LSTM":
                # LSTM expects a tuple of two state tensors.
                encoder_initial_state = (encoder_initial_state, encoder.getInitialState())
            encoder_states, encoder_final_state = encoder(source_batch, encoder_initial_state)

            decoder_current_state = encoder_final_state
            # Attend only over the last layer's state at every input step.
            encoder_final_layer_states = encoder_states[:, -1, :, :]

            decoder_output, attentions, loss, correct = decoder(decoder_current_state, encoder_final_layer_states, target_batch, loss_fn, use_teacher_forcing)

            if batch_num == 0:
                # Show up to 20 samples; the batch may hold fewer.
                for j in range(min(20, len(source_batch))):
                    print(make_strings(data, source_batch[j], target_batch[j], decoder_output[j]))

            correct_predictions += correct
            total_loss += loss

    accuracy = correct_predictions / total_predictions
    total_loss /= number_of_batches

    return accuracy, total_loss


In [38]:
def make_strings(data, source, target, output):
    """Decode three index sequences back into text.

    `source` is decoded with data['source_index_char']; `target` and `output`
    with data['target_index_char']. Every element must provide `.item()`.

    Returns:
        (source_string, target_string, output_string)
    """
    source_string = "".join(data['source_index_char'][token.item()] for token in source)
    target_string = "".join(data['target_index_char'][token.item()] for token in target)
    output_string = "".join(data['target_index_char'][token.item()] for token in output)
    return source_string, target_string, output_string
                        

def train_loop(encoder, decoder,h_params, data, data_loader, device, val_dataloader, use_teacher_forcing=True):
    """Train the encoder/decoder pair, validating and logging to wandb after every epoch.

    Args:
        encoder, decoder: modules to optimise (each gets its own optimizer).
        h_params: mapping with "learning_rate", "epochs", "cell_type",
            "batch_size" and optionally "optimizer" ("nadam" or "adam").
        data: mapping from preprocess_data; "OUTPUT_MAX_LENGTH" normalises the loss.
        data_loader: training batches of (source_batch, target_batch).
        device: forwarded to evaluate.
        val_dataloader: validation batches.
        use_teacher_forcing: forwarded to the decoder during training.
    """
    # Honour the "optimizer" hyper-parameter instead of always using Adam.
    requested = str(h_params["optimizer"]).lower() if "optimizer" in h_params else "adam"
    if requested == "nadam":
        # Fall back to Adam on torch builds that do not ship NAdam.
        optimizer_cls = getattr(optim, "NAdam", optim.Adam)
    else:
        optimizer_cls = optim.Adam
    encoder_optimizer = optimizer_cls(encoder.parameters(), lr=h_params["learning_rate"])
    decoder_optimizer = optimizer_cls(decoder.parameters(), lr=h_params["learning_rate"])

    loss_fn = nn.NLLLoss()
    output_length = data["OUTPUT_MAX_LENGTH"]

    total_predictions = len(data_loader.dataset)
    total_batches = len(data_loader)

    for ep in range(h_params["epochs"]):
        total_correct = 0
        total_loss = 0
        for batch_num, (source_batch, target_batch) in enumerate(data_loader):
            encoder_initial_state = encoder.getInitialState()
            if h_params["cell_type"] == "LSTM":
                # LSTM expects a tuple of two state tensors.
                encoder_initial_state = (encoder_initial_state, encoder.getInitialState())
            encoder_states, encoder_final_state = encoder(source_batch, encoder_initial_state)

            decoder_current_state = encoder_final_state
            # Attend only over the last layer's state at every input step.
            encoder_final_layer_states = encoder_states[:, -1, :, :]

            decoder_output, attentions, loss, correct = decoder(decoder_current_state, encoder_final_layer_states, target_batch, loss_fn, use_teacher_forcing)

            # The decoder sums the loss over output steps; report it per step.
            batch_loss = loss.item() / output_length
            total_correct += correct
            total_loss += batch_loss

            batch_len = len(source_batch)
            if batch_num == 0:
                # Show up to 20 samples; the batch may hold fewer.
                for j in range(min(20, batch_len)):
                    print(make_strings(data, source_batch[j], target_batch[j], decoder_output[j]))
            if batch_num % 20 == 0:
                print("ep:", ep, " bt:", batch_num, " loss:", batch_loss, " acc: ", correct / batch_len)

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

        train_acc = total_correct / total_predictions
        train_loss = total_loss / total_batches
        val_acc, val_loss = evaluate(encoder, decoder, data, val_dataloader, device, h_params, loss_fn, False)
        # Convert to a plain float on the same per-step scale as train_loss, so
        # the printed and the logged validation loss agree.
        val_loss = float(val_loss) / output_length
        print("ep: ", ep, " train acc:", train_acc, " train loss:", train_loss, " val acc:", val_acc, " val loss:", val_loss)
        wandb.log({"train_accuracy":train_acc, "train_loss":train_loss, "val_accuracy":val_acc, "val_loss":val_loss, "epoch":ep})



In [39]:
# h_params={
#     "char_embd_dim" : 128, 
#     "hidden_layer_neurons":256,
#     "batch_size":32,
#     "number_of_layers":2,
#     "learning_rate":0.0001,
#     "epochs":20,
#     "cell_type":"GRU",
#     "dropout":0.3,
#     "optimizer":"nadam"
# }

def prepare_dataloaders(train_source, train_target, val_source, val_target, h_params):
    """Build the shuffled training and validation DataLoaders.

    The vocabulary and the maximum lengths come from the training data only;
    the validation strings are padded and encoded with those same settings.

    Returns:
        (train_dataloader, val_dataloader, data), where `data` is the dict
        produced by preprocess_data for the training set.
    """
    data = preprocess_data(copy.copy(train_source), copy.copy(train_target))
    batch_size = h_params["batch_size"]

    train_dataset = MyDataset([data["source_data_seq"], data['target_data_seq']])
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Validation data reuses the training vocabulary and sequence lengths.
    val_source_sequences = generate_string_to_sequence(
        add_padding(val_source, data["INPUT_MAX_LENGTH"]), data['source_char_index']
    )
    val_target_sequences = generate_string_to_sequence(
        add_padding(val_target, data["OUTPUT_MAX_LENGTH"]), data['target_char_index']
    )
    val_dataset = MyDataset([val_source_sequences, val_target_sequences])
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    return train_dataloader, val_dataloader, data


In [40]:
def train(h_params, data, device, data_loader, val_dataloader, use_teacher_forcing=True):
    """Create a fresh encoder/decoder pair, train it, then release its memory.

    Args:
        h_params: hyper-parameter mapping forwarded to the models and train_loop.
        data: mapping produced by preprocess_data.
        device: device the models are moved to.
        data_loader: training DataLoader.
        val_dataloader: validation DataLoader.
        use_teacher_forcing: forwarded to train_loop.
    """
    encoder = Encoder(h_params, data, device).to(device)
    decoder = Decoder(h_params, data, device).to(device)
    try:
        train_loop(encoder, decoder,h_params, data, data_loader,device, val_dataloader, use_teacher_forcing)
    finally:
        # Drop the references first: empty_cache() can only return memory that
        # no live tensor is still using. The finally block also cleans up when
        # a sweep trial fails part-way.
        del encoder
        del decoder
        gc.collect()
        torch.cuda.empty_cache()

In [41]:
# config = h_params
# # run = wandb.init(project="DL Assignment 3 With Attention", name=f"{config['cell_type']}_{config['optimizer']}_ep_{config['epochs']}_lr_{config['learning_rate']}_embd_{config['char_embd_dim']}_hid_lyr_neur_{config['hidden_layer_neurons']}_bs_{config['batch_size']}_enc_layers_{config['number_of_layers']}_dec_layers_{config['number_of_layers']}_dropout_{config['dropout']}", config=config)
# train_dataloader, val_dataloader, data = prepare_dataloaders(train_source, train_target, val_source, val_target, h_params)
# train(h_params, data, device, train_dataloader, val_dataloader, True)

In [42]:
#Run this cell to run a sweep with appropriate parameters
# Bayesian hyper-parameter search that maximises the logged `val_accuracy`.
sweep_params = {
    'method': 'bayes',
    'name': 'DL Assignment 3 With Attention',
    'metric': {'goal': 'maximize', 'name': 'val_accuracy'},
    'parameters': {
        # training schedule
        'epochs': {'values': [15, 20]},
        'learning_rate': {'values': [0.001, 0.0001]},
        'batch_size': {'values': [32, 64, 128]},
        # model size
        'char_embd_dim': {'values': [64, 128, 256]},
        'number_of_layers': {'values': [1, 2, 3, 4]},
        # optimisation and architecture choices
        'optimizer': {'values': ['nadam', 'adam']},
        'cell_type': {'values': ["RNN", "LSTM", "GRU"]},
        'hidden_layer_neurons': {'values': [128, 256, 512]},
        'dropout': {'values': [0, 0.2, 0.3]},
    },
}

# Register the sweep with W&B; the returned id identifies it to agents.
sweep_id = wandb.sweep(sweep=sweep_params, project="DL Assignment 3 With Attention")
def main():
    """Run one sweep trial: start a wandb run, build the dataloaders and train a model."""
    torch.cuda.empty_cache()
    gc.collect()
    # Start exactly one run per trial. Calling wandb.init() twice would create
    # (or prematurely finish) an extra run before the named one starts.
    with wandb.init(project="DL Assignment 3") as run:
        config = wandb.config
        # The name depends on the sweep-assigned config, which is only
        # available once the run exists, so it is set after init.
        run.name = f"{config['cell_type']}_{config['optimizer']}_ep_{config['epochs']}_lr_{config['learning_rate']}_embd_{config['char_embd_dim']}_hid_lyr_neur_{config['hidden_layer_neurons']}_bs_{config['batch_size']}_enc_layers_{config['number_of_layers']}_dec_layers_{config['number_of_layers']}_dropout_{config['dropout']}"
        train_dataloader, val_dataloader, data = prepare_dataloaders(train_source, train_target, val_source, val_target, config)
        train(config, data, device, train_dataloader, val_dataloader, True)
        


Create sweep with ID: 5yt059tf
Sweep URL: https://wandb.ai/jaswanth431/DL%20Assignment%203%20With%20Attention/sweeps/5yt059tf


In [None]:
# Run trials for the sweep registered in the previous cell rather than a
# hardcoded id from an earlier session, so Restart & Run All stays consistent.
# To resume an older sweep on purpose, assign its id to `sweep_id` instead.
wandb.agent(sweep_id, function=main, project="DL Assignment 3 With Attention", count=100)

[34m[1mwandb[0m: Agent Starting Run: wdru7z28 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	char_embd_dim: 256
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_layer_neurons: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_of_layers: 1
[34m[1mwandb[0m: 	optimizer: nadam




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

('<kinkartavya>_________________', '<కింకర్తవ్య>___________', '>>>>>ఐ>>>>>>చచచచచచచచచచచ')
('<chejaripotaronani>___________', '<చేజారిపోతారోనని>______', '>>>>>ఐఐఋఋఖఖఈఖఖఖ>>చచచచచచ')
('<nichesukuntunna>_____________', '<నిచేసుకుంటున్న>_______', '>>>>>>>>>>>>>>>>>చచచచచచ')
('<veyalsinanta>________________', '<వేయాల్సినంత>__________', '>>>ోఈఋ>>>>ఛఖ>చచచచచచచచచచ')
('<erpatluchesamani>____________', '<ఏర్పాట్లుచేశామని>_____', '>>>>>ఈఈ>ఋ>>ఐదఈఈ>>>చచచచచ')
('<padyaran>____________________', '<పద్యరాన్>_____________', '>>>>>ఐఈఛ>>>చచచచచచచచచచచచ')
('<chigurlanu>__________________', '<చిగుర్లను>____________', '>>>>>ఐ>ఋఖ>>>చచచచచచచచచచచ')
('<abhipraayaanike>_____________', '<అభిప్రాయానికే>________', '>>ఖ>ఋ>ఐఈఈఈఛ>>>>>చచచచచచచ')
('<ennikalostaayannaaru>________', '<ఎన్నికలొస్తాయన్నారు>__', '>>>>>>>ఋఋఋ>>>ఈఖ>>>ఈ>>చచ')
('<papkapuram>__________________', '<పప్కాపురం>____________', '>>ఋ>>>ఓ>ఐఖ>చచచచచచచచచచచచ')
('<kaamyamutho>_________________', '<కామ్యముతో>____________', '>>>>>>త>ఖఖ>>చచచచచచచచచచచ')
('<repeaters>________

In [None]:
# torch.cuda.memory_summary(device=None, abbreviated=False)

