In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# import other libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

In [34]:
# define Lang
class Lang:
	def __init__(self, wordList):
		self.char2index = {'A': 0, 'Z': 1}
		self.char2count = {}
		self.index2char = {0: 'A', 1: 'Z'}
		self.n_chars = 2

		for word in wordList:
			self.addWord(word)

	def addWord(self, word):
		for char in word:
			self.addChar(char)

	def addChar(self, char):
		if char not in self.char2index:
			self.char2index[char] = self.n_chars
			self.char2count[char] = 1
			self.index2char[self.n_chars] = char
			self.n_chars += 1
		else:
			self.char2count[char] += 1

	def encode(self, word):
		embedded = []
		for i in range(len(word)):
			embedded.append([self.char2index[word[i]]])
		return Variable(torch.LongTensor(embedded))
	
	def one_hot_encode(self, word):
		one_hot = torch.zeros(len(word), self.n_chars)
		for i in range(len(word)):
			one_hot[i][self.char2index[word[i]]] = 1
		return one_hot
	
	def one_hot_encode_char(self, char):
		one_hot = torch.zeros(1, self.n_chars)
		one_hot[0][self.char2index[char]] = 1
		return one_hot
	
	def decode(self, word):
		decoded = ''
		for i in range(len(word)):
			decoded += self.index2char[word[i]]
		return decoded
	
	def decode_one_hot(self, word):
		decoded = ''
		for i in range(len(word)):
			decoded += self.index2char[word[i].argmax().item()]
		return decoded

In [108]:
def indexesFromSentence(lang, word):
    return [lang.char2index[char] for char in word]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(1)
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)


def tensorsFromPair(pair, inp_lang, out_lang):
    input_tensor = tensorFromSentence(inp_lang, pair[0])
    target_tensor = tensorFromSentence(out_lang, pair[1])
    return (input_tensor.unsqueeze(1), target_tensor)

In [109]:
# create dataset
class AksharantarDataset(Dataset):
	def __init__(self, data, inp_lang, out_lang):
		self.data = data
		self.inp_lang = inp_lang
		self.out_lang = out_lang

	def __len__(self):
		return len(self.data)

	def __getitem__(self, idx):
		if torch.is_tensor(idx):
			idx = idx.tolist()

		inp_seq = self.inp_lang.one_hot_encode(self.data['input_seq'][idx]).unsqueeze(1)
		out_seq = self.out_lang.one_hot_encode(self.data['target_seq'][idx]).unsqueeze(1)

		sample = {'input_seq': inp_seq, 'target_seq': out_seq}
		return sample

In [266]:
class EncoderRNN(nn.Module):
	def __init__(self, input_size, hidden_size):
		super(EncoderRNN, self).__init__()
		self.hidden_size = hidden_size

		self.embedding = nn.Embedding(input_size, hidden_size)
		self.gru = nn.GRU(hidden_size, hidden_size)

	def forward(self, input, hidden):
		embedded = self.embedding(input)
		output = embedded
		output, hidden = self.gru(output, hidden)
		return output, hidden

	def initHidden(self):
		return torch.zeros(1, 1, self.hidden_size)
	
class DecoderRNN(nn.Module):
	def __init__(self, hidden_size, output_size):
		super(DecoderRNN, self).__init__()
		self.hidden_size = hidden_size

		self.embedding = nn.Embedding(output_size, hidden_size)
		self.gru = nn.GRU(hidden_size, hidden_size)
		self.out = nn.Linear(hidden_size, output_size)
		self.softmax = nn.LogSoftmax(dim=1)

	def forward(self, input, hidden):
		output = self.embedding(input).view(1, 1, -1)
		output = F.relu(output)
		output, hidden = self.gru(output, hidden)
		output = self.softmax(self.out(output[0]))
		return output, hidden

	def initHidden(self):
		return torch.zeros(1, 1, self.hidden_size)



In [267]:
lang = 'hin'
train_data = pd.read_csv(f'aksharantar_sampled/{lang}/{lang}_train.csv')
test_data = pd.read_csv(f'aksharantar_sampled/{lang}/{lang}_test.csv')
valid_data = pd.read_csv(f'aksharantar_sampled/{lang}/{lang}_valid.csv')

train_data.columns = ['input_seq', 'target_seq']
test_data.columns = ['input_seq', 'target_seq']
valid_data.columns = ['input_seq', 'target_seq']

inp_lang = Lang(train_data['input_seq'])
out_lang = Lang(train_data['target_seq'])

In [268]:
pairs = [(train_data['input_seq'][i], train_data['target_seq'][i]) for i in range(len(train_data))]

In [269]:
import time
import math
import random


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [270]:
class Seq2Seq(nn.Module):
	def __init__(self, input_size, hidden_size, output_size, n_layers=1):
		super(Seq2Seq, self).__init__()
		self.input_size = input_size
		self.hidden_size = hidden_size
		self.output_size = output_size
		self.n_layers = n_layers

		# encoder and decoder
		self.encoder = EncoderRNN(input_size, hidden_size)
		self.decoder = DecoderRNN(hidden_size, output_size)

	def train(self, input_tensor, target_tensor, encoder_optimizer, decoder_optimizer, criterion, max_length=50):
		encoder_hidden = self.encoder.initHidden()

		encoder_optimizer.zero_grad()
		decoder_optimizer.zero_grad()

		input_length = input_tensor.size(0)
		target_length = target_tensor.size(0)

		encoder_outputs = torch.zeros(max_length, self.encoder.hidden_size)

		loss = 0

		for ei in range(input_length):
			encoder_output, encoder_hidden = self.encoder(
				input_tensor[ei], encoder_hidden)
			encoder_outputs[ei] = encoder_output[0, 0]

		decoder_input = torch.tensor([[0]])

		decoder_hidden = encoder_hidden
		for di in range(target_length):
			decoder_output, decoder_hidden = self.decoder(
				decoder_input, decoder_hidden)
			topv, topi = decoder_output.topk(1)
			decoder_input = topi.squeeze().detach()  # detach from history as input

			loss += criterion(decoder_output, target_tensor[di])
			if decoder_input.item() == 1:
				break

		loss.backward()

		encoder_optimizer.step()
		decoder_optimizer.step()

		return loss.item() / target_length

	def trainIters(self,n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
		start = time.time()
		plot_losses = []
		print_loss_total = 0  # Reset every print_every
		plot_loss_total = 0  # Reset every plot_every

		encoder_optimizer = optim.SGD(self.encoder.parameters(), lr=learning_rate)
		decoder_optimizer = optim.SGD(self.decoder.parameters(), lr=learning_rate)
		training_pairs = [tensorsFromPair(pairs[i], inp_lang, out_lang)
						for i in range(n_iters)]
		criterion = nn.NLLLoss()

		for iter in range(1, n_iters + 1):
			training_pair = training_pairs[iter - 1]
			input_tensor = training_pair[0]
			target_tensor = training_pair[1]

			loss = self.train(input_tensor, target_tensor, encoder_optimizer, decoder_optimizer, criterion)
			print_loss_total += loss
			plot_loss_total += loss

			if iter % print_every == 0:
				print_loss_avg = print_loss_total / print_every
				print_loss_total = 0
				print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
											iter, iter / n_iters * 100, print_loss_avg))

			if iter % plot_every == 0:
				plot_loss_avg = plot_loss_total / plot_every
				plot_losses.append(plot_loss_avg)
				plot_loss_total = 0

	def predict(self, word, max_length = 20):
		input_tensor = inp_lang.encode(word).unsqueeze(1)
		encoder_hidden = self.encoder.initHidden()

		input_length = input_tensor.size(0)

		encoder_outputs = torch.zeros(max_length, self.encoder.hidden_size)

		for ei in range(input_length):
			encoder_output, encoder_hidden = self.encoder(
				input_tensor[ei], encoder_hidden)
			encoder_outputs[ei] = encoder_output[0, 0]

		decoder_input = torch.tensor([[0]])

		decoded_word = ""

		decoder_hidden = encoder_hidden
		for di in range(max_length):
			decoder_output, decoder_hidden = self.decoder(
				decoder_input, decoder_hidden)
			topv, topi = decoder_output.data.topk(1)
			if topi.item() == 1:
				decoded_word+= ('Z')
				break
			else:
				decoded_word+= (out_lang.index2char[topi.item()])

			decoder_input = topi.squeeze().detach()

		return decoded_word


In [271]:
print(pairs[:4])

[('bindhya', 'बिन्द्या'), ('kirankant', 'किरणकांत'), ('yagyopaveet', 'यज्ञोपवीत'), ('ratania', 'रटानिया')]


In [274]:
model = Seq2Seq(inp_lang.n_chars, 256, out_lang.n_chars)

model.trainIters(10000, learning_rate=0.0001, print_every=100)

0m 2s (- 3m 45s) (100 1%) 4.2205
0m 4s (- 3m 24s) (200 2%) 4.0561
0m 5s (- 3m 12s) (300 3%) 3.3816
0m 7s (- 3m 10s) (400 4%) 3.9751
0m 9s (- 3m 4s) (500 5%) 3.2993
0m 11s (- 2m 58s) (600 6%) 2.8330
0m 13s (- 2m 54s) (700 7%) 2.8260
0m 15s (- 2m 53s) (800 8%) 3.0585
0m 17s (- 2m 54s) (900 9%) 3.2794
0m 19s (- 2m 51s) (1000 10%) 3.0329
0m 20s (- 2m 47s) (1100 11%) 3.0625
0m 22s (- 2m 45s) (1200 12%) 2.9987
0m 24s (- 2m 43s) (1300 13%) 3.0309
0m 26s (- 2m 42s) (1400 14%) 3.0220
0m 28s (- 2m 40s) (1500 15%) 3.0687
0m 30s (- 2m 41s) (1600 16%) 3.0023
0m 33s (- 2m 41s) (1700 17%) 3.0391
0m 35s (- 2m 40s) (1800 18%) 2.9802
0m 37s (- 2m 38s) (1900 19%) 2.8965
0m 38s (- 2m 35s) (2000 20%) 3.0440
0m 40s (- 2m 33s) (2100 21%) 2.8499
0m 42s (- 2m 30s) (2200 22%) 2.8577
0m 43s (- 2m 27s) (2300 23%) 2.9826
0m 45s (- 2m 24s) (2400 24%) 2.9859
0m 47s (- 2m 22s) (2500 25%) 2.9463
0m 49s (- 2m 20s) (2600 26%) 3.1061


KeyboardInterrupt: 

In [183]:
model.predict('lol')

'सा््ा<EOS>'

In [207]:
# create a seq2seq model using 2 RNNs
class Seq2Seq(nn.Module):
	def __init__(self, input_size, hidden_size, output_size, n_layers=1):
		super(Seq2Seq, self).__init__()
		self.input_size = input_size
		self.hidden_size = hidden_size
		self.output_size = output_size
		self.n_layers = n_layers

		# encoder and decoder
		self.encoder = nn.RNN(input_size, hidden_size, n_layers)
		self.decoder = nn.RNN(hidden_size, hidden_size, n_layers)

		# linear layer to get output
		self.linear = nn.Linear(hidden_size, output_size)

	def forward(self, input, hidden):
		# encoder
		output, hidden = self.encoder(input, hidden)
		
		# decoder
		output, hidden = self.decoder(output, hidden)
		
		# get output
		output = self.linear(output)
		return output, hidden
	
	def predict(self, input, inp_lang, out_lang):
		out, hidden = self.forward(inp_lang.one_hot_encode(input).unsqueeze(1), self.init_hidden(1))
		return out_lang.decode_one_hot(out)
	
	def init_hidden(self, batch_size):
		return Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))

In [208]:
class Translator:
	def __init__(self, lang):
		train_data = pd.read_csv(f'aksharantar_sampled/{lang}/{lang}_train.csv')
		test_data = pd.read_csv(f'aksharantar_sampled/{lang}/{lang}_test.csv')
		valid_data = pd.read_csv(f'aksharantar_sampled/{lang}/{lang}_valid.csv')

		train_data.columns = ['input_seq', 'target_seq']
		test_data.columns = ['input_seq', 'target_seq']
		valid_data.columns = ['input_seq', 'target_seq']

		self.inp_lang = Lang(train_data['input_seq'])
		self.out_lang = Lang(train_data['target_seq'])

		self.model = Seq2Seq(self.inp_lang.n_chars, 10, self.out_lang.n_chars, 1)
		self.criterion = nn.CrossEntropyLoss()
		self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

		train_dataset = AksharantarDataset(train_data, self.inp_lang, self.out_lang)
		test_dataset = AksharantarDataset(test_data, self.inp_lang, self.out_lang)
		valid_dataset = AksharantarDataset(valid_data, self.inp_lang, self.out_lang)

		self.train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
		self.test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)
		self.valid_dataloader = DataLoader(valid_dataset, batch_size=1, shuffle=True)

	def translate(self, word):
		return self.model.predict(word, self.inp_lang, self.out_lang)
	
	def train_one(self, inp, target):
		# zero gradients
		self.optimizer.zero_grad()
		
		# initialize hidden layer
		hidden = self.model.init_hidden(1)
		
		# get output
		output, hidden = self.model.forward(inp, hidden)
		
		mx_len = min(len(output), len(target))

		# append output and target with 'Z' to make them of mx_len
		while(len(output) < mx_len):
			output = torch.cat((output, self.out_lang.one_hot_encode_char('Z')), 0)

		while(len(target) < mx_len):
			target = torch.cat((target, self.out_lang.one_hot_encode_char('Z')), 0)
		
		# calculate loss 
		loss = self.criterion(torch.flatten(output[:mx_len], 0, 1), torch.flatten(target[:mx_len], 0, 1).max(1)[1])
			
		# backpropagate
		loss.backward()
		
		# update weights
		self.optimizer.step()

		return loss.data.item() / len(inp)
	
	def train_epoch(self, data_loader):
		loss = 0
		for i_batch, sample_batched in tqdm(enumerate(data_loader)):
			loss += self.train_one(sample_batched['input_seq'][0], sample_batched['target_seq'][0])
		print(' Loss: ', loss / len(data_loader))
		return loss / len(data_loader)
	
	def train(self, epochs):
		losses = []
		for epoch in range(epochs):
			print('Epoch ', epoch + 1)
			loss = self.train_epoch(self.train_dataloader)
			losses.append(loss)
		return losses

In [254]:
hindi_trans = Translator('urd')
hindi_trans.train(1)

Epoch  1


51199it [07:55, 107.77it/s]

 Loss:  0.2866536472512427





[0.2866536472512427]

In [264]:
hindi_trans.translate('saptarshi')

'ساریااااا'

In [265]:
pickle.dump(hindi_trans, open('urd_trans', 'wb'))

In [211]:
guj_trans = Translator('guj')
guj_trans.train(1)

Epoch  1


51199it [03:09, 270.60it/s]

 Loss:  0.2978701331155101





[0.2978701331155101]

In [252]:
guj_trans.translate('f')

'ફ'

In [223]:

pickle.dump(guj_trans, open('guj_trans', 'wb'))

In [None]:
kan_trans = Translator('kan')
kan_trans.train(1)

Epoch  1


51199it [05:19, 160.41it/s]

 Loss:  0.18516780389609597





[0.18516780389609597]

In [None]:
kan_trans.translate('abdullah')

'ಅಭ್ದಲಲಳಿ'