In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# import other libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# define Lang
class Lang:
	def __init__(self, wordList):
		self.char2index = {'A': 0, 'Z': 1}
		self.char2count = {}
		self.index2char = {0: 'A', 1: 'Z'}
		self.n_chars = 2

		for word in wordList:
			self.addWord(word)

	def addWord(self, word):
		for char in word:
			self.addChar(char)

	def addChar(self, char):
		if char not in self.char2index:
			self.char2index[char] = self.n_chars
			self.char2count[char] = 1
			self.index2char[self.n_chars] = char
			self.n_chars += 1
		else:
			self.char2count[char] += 1

	def encode(self, word):
		embedded = []
		for i in range(len(word)):
			embedded.append([self.char2index[word[i]]])
		return Variable(torch.LongTensor(embedded))

	def one_hot_encode(self, word):
		one_hot = torch.zeros(len(word), self.n_chars)
		for i in range(len(word)):
			one_hot[i][self.char2index[word[i]]] = 1
		return one_hot
	
	def one_hot_encode_char(self, char):
		one_hot = torch.zeros(1, self.n_chars)
		one_hot[0][self.char2index[char]] = 1
		return one_hot
	
	def decode(self, word):
		decoded = ''
		for i in range(len(word)):
			decoded += self.index2char[word[i]]
		return decoded
	
	def decode_one_hot(self, word):
		decoded = ''
		for i in range(len(word)):
			decoded += self.index2char[word[i].argmax().item()]
		return decoded

In [3]:
# create dataset
class AksharantarDataset(Dataset):
	def __init__(self, data, inp_lang, out_lang):
		self.data = data
		self.inp_lang = inp_lang
		self.out_lang = out_lang

	def __len__(self):
		return len(self.data)

	def __getitem__(self, idx):
		if torch.is_tensor(idx):
			idx = idx.tolist()

		inp_seq = self.inp_lang.one_hot_encode(self.data['input_seq'][idx]).unsqueeze(1)
		out_seq = self.out_lang.one_hot_encode(self.data['target_seq'][idx]).unsqueeze(1)

		sample = {'input_seq': inp_seq, 'target_seq': out_seq}
		return sample

In [4]:
# create a seq2seq model using 2 RNNs
class Seq2Seq(nn.Module):
	def __init__(self, input_size, hidden_size, output_size, n_layers=1):
		super(Seq2Seq, self).__init__()
		self.input_size = input_size
		self.hidden_size = hidden_size
		self.output_size = output_size
		self.n_layers = n_layers
				
		# encoder and decoder
		self.encoder = nn.RNN(input_size, hidden_size, n_layers)
		self.decoder = nn.RNN(hidden_size, hidden_size, n_layers)

		# linear layer to get output
		self.linear = nn.Linear(hidden_size, output_size)

	def forward(self, input, hidden):
		# encoder
		output, hidden = self.encoder(input, hidden)
		
		# decoder
		output, hidden = self.decoder(output, hidden)
		
		# get output
		output = self.linear(output)
		return output, hidden
	
	def predict(self, input, inp_lang, out_lang):
		out, hidden = self.forward(inp_lang.one_hot_encode(input).unsqueeze(1), self.init_hidden(1))
		return out_lang.decode_one_hot(out)
	
	def init_hidden(self, batch_size):
		return Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))

In [5]:
class Translator:
	def __init__(self, lang):
		train_data = pd.read_csv(f'aksharantar_sampled/{lang}/{lang}_train.csv')
		test_data = pd.read_csv(f'aksharantar_sampled/{lang}/{lang}_test.csv')
		valid_data = pd.read_csv(f'aksharantar_sampled/{lang}/{lang}_valid.csv')

		train_data.columns = ['input_seq', 'target_seq']
		test_data.columns = ['input_seq', 'target_seq']
		valid_data.columns = ['input_seq', 'target_seq']

		self.inp_lang = Lang(train_data['input_seq'])
		self.out_lang = Lang(train_data['target_seq'])

		self.model = Seq2Seq(self.inp_lang.n_chars, 128, self.out_lang.n_chars, 1)
		self.criterion = nn.CrossEntropyLoss()
		self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

		train_dataset = AksharantarDataset(train_data, self.inp_lang, self.out_lang)
		test_dataset = AksharantarDataset(test_data, self.inp_lang, self.out_lang)
		valid_dataset = AksharantarDataset(valid_data, self.inp_lang, self.out_lang)

		self.train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
		self.test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)
		self.valid_dataloader = DataLoader(valid_dataset, batch_size=1, shuffle=True)

	def translate(self, word):
		return self.model.predict(word, self.inp_lang, self.out_lang)
	
	def train_one(self, inp, target):
		# zero gradients
		self.optimizer.zero_grad()
		
		# initialize hidden layer
		hidden = self.model.init_hidden(1)
		
		# get output
		output, hidden = self.model.forward(inp, hidden)
		
		mx_len = min(len(output), len(target))

		# append output and target with 'Z' to make them of mx_len
		while(len(output) < mx_len):
			output = torch.cat((output, self.out_lang.one_hot_encode_char('Z')), 0)

		while(len(target) < mx_len):
			target = torch.cat((target, self.out_lang.one_hot_encode_char('Z')), 0)
		
		# calculate loss 
		loss = self.criterion(torch.flatten(output[:mx_len], 0, 1), torch.flatten(target[:mx_len], 0, 1).max(1)[1])
			
		# backpropagate
		loss.backward()
		
		# update weights
		self.optimizer.step()

		return loss.data.item() / len(inp)
	
	def train_epoch(self, data_loader):
		loss = 0
		for i_batch, sample_batched in tqdm(enumerate(data_loader)):
			loss += self.train_one(sample_batched['input_seq'][0], sample_batched['target_seq'][0])
		print(' Loss: ', loss / len(data_loader))
		return loss / len(data_loader)
	
	def train(self, epochs):
		losses = []
		for epoch in range(epochs):
			print('Epoch ', epoch + 1)
			loss = self.train_epoch(self.train_dataloader)
			losses.append(loss)
		return losses

In [6]:
# hindi_trans = Translator('hin')
# hindi_trans.train(5)

In [7]:
kan_trans = Translator('kan')
kan_trans.train(1)

Epoch  1


51199it [05:19, 160.41it/s]

 Loss:  0.18516780389609597





[0.18516780389609597]

In [20]:
kan_trans.translate('abdullah')

'ಅಭ್ದಲಲಳಿ'