In [32]:
import torch
from torch import nn
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [34]:
train_csv = "/kaggle/input/akashar/aksharantar_sampled/tel/tel_train.csv"
test_csv = "/kaggle/input/akashar/aksharantar_sampled/tel/tel_test.csv"
val_csv = "/kaggle/input/akashar/aksharantar_sampled/tel/tel_valid.csv"

In [35]:
train_df = pd.read_csv(train_csv, header=None)
train_source, train_target = train_df[0].to_numpy(), train_df[1].to_numpy();
test_df = pd.read_csv(test_csv, header=None)
val_df = pd.read_csv(val_csv, header=None)

In [36]:
start_token = "<"
end_token  = ">"
def preprocess_data(source_data, target_data):
    data = {
            "source_chars" : [],
            "target_chars" : [],
            "source_char_index" : {},
            "source_index_char" : {},
            "target_char_index" :{},
            "target_index_char" : {},
            "source_len":0,
            "target":0,
            "source_data":source_data,
            "target_data":target_data
           }
    for i in range(len(source_data)):
        source_data[i] = start_token + source_data[i] + end_token
        target_data[i] = start_token + target_data[i] + end_token
        
        for c in source_data[i]:
            if(data["source_char_index"].get(c) == None):
                data["source_chars"].append(c)
                idx = len(data["source_chars"]) -1
                data["source_char_index"][c] = idx
                data["source_index_char"][idx] = c
        for c in target_data[i]:
            if(data["target_char_index"].get(c) == None):
                data["target_chars"].append(c)
                idx = len(data["target_chars"]) -1
                data["target_char_index"][c] = idx
                data["target_index_char"][idx] = c
    data["source_len"] = len(data["source_chars"])
    data["target_len"] = len(data["target_chars"])
    return data;


data = preprocess_data(train_source, train_target)
print(data["source_chars"])
print(data["target_chars"])
print(data["source_len"])
print(data["target_len"])

['<', 'v', 'a', 'r', 'g', 'l', 'i', 'n', 'e', '>', 's', 't', 'd', 'f', 'c', 'm', 'o', 'u', 'w', 'p', 'h', 'k', 'y', 'b', 'j', 'z', 'x', 'q']
['<', 'వ', 'ర', '్', 'గ', 'ా', 'ల', 'ి', 'న', 'ే', '>', 'స', 'త', 'ద', 'ఫ', 'య', 'క', 'ట', 'మ', 'ో', 'ూ', 'ళ', 'ప', 'ధ', 'ు', 'ె', 'ం', 'చ', 'ై', 'డ', 'ఖ', 'ఉ', 'ష', 'ఆ', 'ొ', 'శ', 'అ', 'భ', 'ృ', 'ణ', 'హ', 'జ', 'ీ', 'ఇ', 'బ', 'ఐ', 'ఒ', 'ఎ', 'ౌ', 'థ', 'ఈ', 'ఊ', 'ఏ', 'ఢ', 'ఓ', 'ఔ', 'ఞ', 'ఠ', 'ఘ', 'ఛ', 'ః', 'ఝ', 'ఋ', 'ఱ']
28
64


In [37]:
char_embd_dim=64
input_dim = data["source_len"]
output_dim = data["target_len"]
hidden_layer_neurons = 128
learning_rate  =0.0001

In [38]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, char_embd_dim)
        self.rnn = nn.RNN(char_embd_dim, hidden_layer_neurons,num_layers=1, batch_first=True)
    def forward(self, current_input, prev_state):
        embd_input = self.embedding(current_input).view(1,1,-1)
        output, prev_state = self.rnn(embd_input, prev_state)
        return output, prev_state
    def getInitialState(self):
        return torch.zeros(1,1,hidden_layer_neurons, device=device)

    
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, char_embd_dim)
        self.rnn = nn.RNN(char_embd_dim, hidden_layer_neurons,num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_layer_neurons, output_dim)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, current_input, prev_state):
        embd_input = self.embedding(current_input).view(1,1,-1)
        curr_embd = F.relu(embd_input)
        output, prev_state = self.rnn(curr_embd, prev_state)
        output = self.softmax(self.fc(output))
        return output, prev_state

In [39]:
h_params  ={
    "learning_rate":0.001,
    "epochs":10
}
def train(data, h_params):
    encoder = Encoder().to(device)
    decoder = Decoder().to(device)
    
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=h_params["learning_rate"])
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=h_params["learning_rate"])
    
    loss_fn = nn.NLLLoss()
    for ep in range(h_params["epochs"]):
        
#         len(data["source_data"])
        loss = 0;
        for data_item_idx in range(len(data["source_data"])):
            source_word = data["source_data"][data_item_idx]
            target_word = data["target_data"][data_item_idx]
#             print(source_word, target_word)
            encoder_initial_state = encoder.getInitialState()
            encoder_curr_state = encoder_initial_state
#             print(encoder_curr_state)
            for ch in source_word:
                char_idx = data["source_char_index"][ch]
                char_input_tensor = torch.tensor([char_idx], device=device)
                encoder_output, encoder_current_state = encoder(char_input_tensor,encoder_curr_state)
#                 print(encoder_current_state)
            
            decoder_curr_state = encoder_current_state
#             for i in range(50):
            for i in range(len(target_word)):
                if(i == 0):
                    tar_ch = target_word[i]
                    char_idx = data["target_char_index"][tar_ch]
                    char_input_tensor = torch.tensor([char_idx], device=device)

                target_char_idx = data['target_char_index'][target_word[i]]
                target_char_tensor = torch.tensor([target_char_idx], device=device)
#                 print(target_char_tensor.shape)
                decoder_output, decoder_curr_state = decoder(char_input_tensor, decoder_curr_state)
#                 print(decoder_output)
                topv, topi = decoder_output.topk(1)
#                 print(topv, topi)
                decoder_input_t = topi.squeeze().detach()
#                 decoder_input_t.reshape(1)
#                 print(decoder_input_t.shape)
                char_input_tensor = decoder_input_t;
#                 if(i!=len(target_word)-1):
#                     char_input_tensor = torch.tensor([data['target_char_index'][target_word[i+1]]]) 
#                 print(decoder_output.shape)
                decoder_output = decoder_output[:, -1, :]
#                 print(decoder_output.shape)
            
                loss+=(loss_fn(decoder_output, target_char_tensor)/len(target_word))
                
    
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        print(loss.item()/len(data["source_data"]))
            
    
train(data, h_params)

4259.36625


KeyboardInterrupt: 