In [1]:
import nltk
import numpy as np
from random import *

In [2]:
# Tokenized Urdu sentences. Re-assigned by preprocessing_data() and read by
# index_to_words(); kept at module level so both cells share it.
urdu_tokenized=[]
def load_data(file_1,file_2,max_sentences=30000):
    """Read two UTF-8 text files and return their leading lines.

    Args:
        file_1: Path of the first file (the notebook passes the Urdu corpus).
        file_2: Path of the second file (the notebook passes the Roman-Urdu corpus).
        max_sentences: Maximum number of lines kept from each file. ``None``
            keeps every line. Defaults to 30000, the value that used to be
            hard-coded.

    Returns:
        A tuple ``(urdu_sentences, roman_sentences)`` of two lists of raw
        lines. Lines are returned exactly as ``readlines()`` yields them, i.e.
        trailing newlines are still attached.
    """
    with open(file_1, "r", encoding="utf-8") as f1:
        urdu_sentences = f1.readlines()[:max_sentences]

    with open(file_2, "r", encoding="utf-8") as f2:
        roman_sentences = f2.readlines()[:max_sentences]

    return urdu_sentences,roman_sentences

In [3]:
def _write_index_sequences(path, tokenized_sentences, index_of):
    """Write one line of space-separated vocabulary indices per tokenized sentence."""
    with open(path, "w", encoding="utf-8") as f:
        for sentence in tokenized_sentences:
            f.write(" ".join(str(index_of[word]) for word in sentence) + "\n")


def preprocessing_data(input_data):
    """Tokenize both corpora, index them with one shared vocabulary and save the result.

    Args:
        input_data: A pair ``(urdu_sentences, roman_urdu_sentences)`` of lists
            of raw text lines, as returned by ``load_data``.

    Returns:
        The number of distinct tokens across both corpora.

    Side effects:
        * Re-assigns the module-level ``urdu_tokenized`` with the tokenized
          Urdu sentences.
        * Overwrites ``urdu_vectors.txt`` and ``roman_vectors.txt`` in the
          current working directory; each line holds the space-separated
          vocabulary indices of one sentence.
    """
    global urdu_tokenized
    urdu_sentences,roman_urdu_sentences=input_data
    urdu_tokenized = [nltk.word_tokenize(sentence.strip()) for sentence in urdu_sentences]
    roman_urdu_tokenized = [nltk.word_tokenize(sentence.strip()) for sentence in roman_urdu_sentences]

    # Sort the vocabulary before numbering it. Enumerating the bare set gave
    # every kernel run a different word -> index mapping, because the iteration
    # order of a set of strings depends on the per-process hash seed. Sorting
    # makes the written vector files reproducible.
    vocabulary = sorted({word for sentence in urdu_tokenized + roman_urdu_tokenized for word in sentence})
    index_of = {word: index for index, word in enumerate(vocabulary)}

    _write_index_sequences("urdu_vectors.txt", urdu_tokenized, index_of)
    _write_index_sequences("roman_vectors.txt", roman_urdu_tokenized, index_of)
    return len(vocabulary)

In [4]:
def _read_index_sequences(path, max_length):
    """Read one integer sequence per line, keeping at most ``max_length`` leading tokens."""
    with open(path, "r", encoding="utf-8") as f:
        return [[int(token) for token in line.split()[:max_length]] for line in f]


def train_test_split(split_ratio=0.8, max_length=10,
                     urdu_path="urdu_vectors.txt", roman_path="roman_vectors.txt"):
    """Load the saved index sequences and split them into train and test parts.

    Called without arguments this behaves as before: it reads
    ``urdu_vectors.txt`` and ``roman_vectors.txt`` from the current working
    directory, keeps the first 10 indices of every line and puts the first 80%
    of the lines into the training part.

    Args:
        split_ratio: Fraction of the lines (counted on the Urdu file) that go
            to the training part.
        max_length: Maximum number of indices kept per line; ``None`` keeps all.
        urdu_path: File with one Urdu index sequence per line.
        roman_path: File with one Roman-Urdu index sequence per line.

    Returns:
        ``(train_urdu, train_roman, test_urdu, test_roman)``, each a list of
        lists of ints. The split is positional (no shuffling), and the same
        split position is applied to both files.
    """
    urdu_sequences = _read_index_sequences(urdu_path, max_length)
    roman_urdu_sequences = _read_index_sequences(roman_path, max_length)

    # The cut position is derived from the Urdu file and reused for the Roman
    # file so that line i of one still pairs with line i of the other.
    split_index = int(len(urdu_sequences) * split_ratio)

    return (
        urdu_sequences[:split_index],
        roman_urdu_sequences[:split_index],
        urdu_sequences[split_index:],
        roman_urdu_sequences[split_index:],
    )

In [5]:
def encoder_backward(dL_dh, encoder, inputs):
    """Accumulate encoder gradients while stepping backwards over axis 1 of ``inputs``.

    Args:
        dL_dh: Gradient arriving at the encoder; it is multiplied with
            ``encoder.Wx.T`` and ``encoder.Wh.T``. Presumably shaped
            (batch, hidden) -- TODO confirm against the caller.
        encoder: Object that must provide ``Wx``, ``Wh``, ``X``,
            ``activation_cache`` and the accumulators ``dWh``, ``dWx``, ``db``.
        inputs: Array indexed as ``inputs[:, t, :]``, with the step index on
            axis 1.

    Returns:
        None. ``encoder.dWh``, ``encoder.dWx`` and ``encoder.db`` are
        incremented; ``dL_dX`` is filled in but never returned or stored.

    NOTE(review): the ``Encoder`` class in this notebook exposes
    ``Wxh``/``Whh``/``bh``, keeps no ``X``/``activation_cache``/``dW*``
    attributes, and its ``forward`` takes a list of indices rather than an
    array with a ``shape``. As written this function does not look usable with
    that class -- confirm which encoder it was written for.
    """
    # Buffer with the same shape and dtype as `inputs`; local only, discarded on return.
    dL_dX = np.zeros_like(inputs)
    # Walk the steps from last to first.
    for t in reversed(range(inputs.shape[1])):
        dL_dX[:, t, :] = np.dot(dL_dh, encoder.Wx.T)
        # Rebinds dL_dh to a fresh array, so the caller's array is not modified
        # by the in-place multiply at the end of the iteration.
        dL_dh = np.dot(dL_dh, encoder.Wh.T)
        # NOTE(review): the weight gradients below use dL_dh *after* it was
        # multiplied by Wh.T and *before* the (1 - a**2) factor is applied;
        # verify this ordering against the intended forward pass.
        encoder.dWh += np.dot(encoder.X[:, t, :].T, dL_dh)
        encoder.dWx += np.dot(inputs[:, t, :].T, dL_dh)
        encoder.db += np.sum(dL_dh, axis=0, keepdims=True)
        # (1 - a**2) is the tanh derivative if activation_cache holds tanh
        # outputs -- presumably so, TODO confirm.
        dL_dh *= (1 - encoder.activation_cache[:, t, :]**2)

def update_params(layer, learning_rate):
    """Take one gradient-descent step on a layer's ``Wh``, ``Wx`` and ``b``.

    Each parameter is decreased by ``learning_rate`` times the matching
    gradient attribute (``dWh``, ``dWx``, ``db``). The update uses the
    augmented ``-=`` operator, so array parameters are modified in place.

    Args:
        layer: Object carrying the six attributes named above.
        learning_rate: Step size multiplied with every gradient.
    """
    for param_name, grad_name in (("Wh", "dWh"), ("Wx", "dWx"), ("b", "db")):
        value = getattr(layer, param_name)
        value -= learning_rate * getattr(layer, grad_name)
        setattr(layer, param_name, value)

def truncate_sentence(sentence, max_length):
    """Cut ``sentence`` down to its first ``max_length`` items when it is longer.

    Args:
        sentence: Any sliceable sequence with a length (list, str, ...).
        max_length: Maximum number of leading items to keep.

    Returns:
        ``sentence[:max_length]`` when ``len(sentence) > max_length``;
        otherwise the very same ``sentence`` object, not a copy.
    """
    return sentence[:max_length] if len(sentence) > max_length else sentence

In [6]:
class Encoder:
    """Single-layer tanh RNN that folds a sequence of token indices into one hidden state.

    Attributes:
        input_vocab_size: Length of the one-hot input vectors.
        hidden_size: Length of the hidden state.
        Wxh: Input-to-hidden weights, shape (hidden_size, input_vocab_size),
            drawn from a standard normal and scaled by 0.01.
        Whh: Hidden-to-hidden weights, initialised to the identity matrix.
        bh: Hidden bias, shape (hidden_size, 1), initialised to zeros.
    """

    def __init__(self, input_vocab_size, hidden_size):
        self.input_vocab_size=input_vocab_size
        self.hidden_size=hidden_size
        self.Wxh = np.random.randn(self.hidden_size, self.input_vocab_size) * 0.01
        self.Whh = np.eye(hidden_size)
        self.bh = np.zeros((hidden_size, 1))

    def forward(self, inputs):
        """Run the recurrence over ``inputs`` and return the last hidden state.

        Args:
            inputs: Sequence of vocabulary indices, one entry per step. Each
                entry is used to index the one-hot column vector.

        Returns:
            Array of shape (hidden_size, 1). All zeros when ``inputs`` is empty.
        """
        # Use the sizes stored on the instance. The previous version read the
        # module-level names `hidden_size` / `input_vocab_size`, which only
        # worked when the config cell had run and happened to match this
        # encoder's own dimensions.
        h = np.zeros((self.hidden_size, 1))
        for token in inputs:
            x = np.zeros((self.input_vocab_size, 1))
            x[token] = 1
            h = np.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, h) + self.bh)
        return h

In [7]:
class Decoder:
    """Single-layer tanh RNN decoder that emits one softmax distribution per step.

    Attributes:
        output_vocab_size: Length of every emitted distribution.
        hidden_size: Length of the hidden state.
        Why: Hidden-to-output weights, shape (output_vocab_size, hidden_size),
            drawn from a standard normal and scaled by 0.01. Its transpose is
            also used to feed the previous distribution back into the state.
        Whh: Hidden-to-hidden weights, initialised to the identity matrix.
        by: Output bias, shape (output_vocab_size, 1), zeros.
        bh: Hidden bias, shape (hidden_size, 1), zeros.
    """

    def __init__(self, output_vocab_size, hidden_size):
        self.output_vocab_size = output_vocab_size
        self.hidden_size = hidden_size
        self.Why = np.random.randn(output_vocab_size, hidden_size) * 0.01
        self.Whh = np.eye(hidden_size)
        self.by = np.zeros((output_vocab_size, 1))
        self.bh = np.zeros((hidden_size, 1))

    def forward(self, h, outputs):
        """Unroll the decoder for ``len(outputs)`` steps starting from state ``h``.

        Args:
            h: Initial hidden state, shape (hidden_size, 1).
            outputs: Sequence whose length sets the number of decoding steps.
                Its values are not used by the recurrence: the previous
                version built a one-hot vector from each entry (sized by a
                module-level ``output_vocab_size``) and then never read it, so
                that dead allocation has been removed.

        Returns:
            List with one (output_vocab_size, 1) probability column per step.
        """
        y_hat = []
        for _ in range(len(outputs)):
            z = np.dot(self.Why, h) + self.by
            y = self.softmax(z)
            y_hat.append(y)
            # The emitted distribution is projected back through Why.T to form
            # the input of the next state.
            h = np.tanh(np.dot(self.Whh, h) + np.dot(self.Why.T, y) + self.bh)
        return y_hat

    def softmax(self, x):
        """Softmax along axis 0.

        The column maximum is subtracted before exponentiating. That leaves
        the result unchanged mathematically but keeps ``np.exp`` from
        overflowing to ``inf`` (and the quotient from becoming NaN) on large
        logits.
        """
        shifted = x - np.max(x, axis=0, keepdims=True)
        exp_x = np.exp(shifted)
        return exp_x / np.sum(exp_x, axis=0)

In [8]:
# Define the neural machine translation model
class NMTModel:
    """Pairs an ``Encoder`` with a ``Decoder`` and exposes forward/train entry points.

    NOTE(review): ``train`` as written never calls ``train_step``, so the
    encoder and decoder it returns still hold their initial weights. See the
    notes on both methods.
    """

    def __init__(self, input_vocab_size, output_vocab_size, hidden_size):
        """Store the three sizes and build the encoder and decoder from them."""
        self.input_vocab_size=input_vocab_size
        self.output_vocab_size=output_vocab_size
        self.hidden_size=hidden_size
        self.encoder = Encoder(self.input_vocab_size, self.hidden_size)
        self.decoder = Decoder(self.output_vocab_size, self.hidden_size)
        
    def forward(self, inputs, outputs):
        """Encode ``inputs``, then run the decoder from that state with ``outputs``.

        Returns whatever ``Decoder.forward`` returns for the encoded state.
        """
        h = self.encoder.forward(inputs)
        y_hat = self.decoder.forward(h, outputs)
        return y_hat
    
    def train_step(self, inputs, outputs, learning_rate):
        """Intended single optimisation step: forward, loss, backward, update.

        Returns the value produced by ``cross_entropy_loss``.

        NOTE(review): ``cross_entropy_loss``, ``softmax_backward`` and
        ``decoder_backward`` are not defined in any cell visible in this
        notebook, so calling this method would raise ``NameError`` unless they
        are defined elsewhere. ``update_params`` also reads ``Wh``/``Wx``/``b``
        and ``dWh``/``dWx``/``db``, which the visible ``Encoder`` and
        ``Decoder`` do not define. Nothing in the visible notebook calls this
        method.
        """
        h = self.encoder.forward(inputs)
        y_hat = self.decoder.forward(h, outputs)
        loss = cross_entropy_loss(y_hat, outputs)
        dL_dy_hat = softmax_backward(y_hat, outputs)
        dL_dh = decoder_backward(dL_dy_hat, self.decoder, h)
        encoder_backward(dL_dh, self.encoder, inputs)
        update_params(self.decoder, learning_rate)
        update_params(self.encoder, learning_rate)
        return loss
    
    def train(self, X_train, y_train, batch_size, num_epochs, learning_rate):
        """Iterate over epochs and mini-batches and return ``(encoder, decoder)``.

        Args:
            X_train: Sequence of input sentences; sliced into batches.
            y_train: Sequence of target sentences; sliced into batches.
            batch_size: Items per batch. Trailing items that do not fill a
                whole batch are skipped (floor division); 0 raises
                ``ZeroDivisionError``.
            num_epochs: Number of passes over the batches.
            learning_rate: Accepted but not used in this method.

        NOTE(review): the loop body only slices and truncates the batch. It
        never calls ``train_step`` or ``forward``, so no parameter is updated
        and ``outputs_batch`` is unused. The method returns the untrained
        encoder and decoder.
        """
        # Whole batches only.
        num_batches = len(X_train) // batch_size
        for epoch in range(num_epochs):
            for batch in range(num_batches):
                start_index = batch * batch_size
                end_index = (batch + 1) * batch_size
                inputs_batch = X_train[start_index:end_index]
                outputs_batch = y_train[start_index:end_index]
                # `max_input_seq_length` is read from module scope; the cell
                # that defines it must have run before train() is called.
                inputs_batch = [truncate_sentence(s, max_input_seq_length) for s in inputs_batch]
        return self.encoder,self.decoder

In [9]:
def word_to_index(words_list):
    """Map every distinct item of ``words_list`` to a consecutive index.

    Indices are assigned in order of first appearance, starting at 0. The
    previous version numbered the items in set-iteration order, which for
    strings changes from one interpreter run to the next, so the same input
    could produce a different mapping after a kernel restart.

    Args:
        words_list: Iterable of hashable items. A plain ``str`` is treated as
            a sequence of characters.

    Returns:
        Dict ``item -> index`` covering each distinct item exactly once.
    """
    # dict.fromkeys removes duplicates while preserving insertion order.
    return {word: index for index, word in enumerate(dict.fromkeys(words_list))}

def index_to_words(train_urdu_sequences,sequence_len):
    """Return ``sequence_len`` tokens drawn at random from the global ``urdu_tokenized``.

    NOTE: this performs random sampling, not decoding. The result does not
    depend on any model output or on ``train_urdu_sequences``.

    Each token is picked from a sentence at positions 1..1000 of
    ``urdu_tokenized`` and from token positions 1..5 inside that sentence,
    which is the window the original hard-coded ``randint`` ranges covered.
    The window is now clamped to the data that actually exists, so a corpus
    with fewer than 1001 sentences, or a sentence with fewer than 6 tokens, no
    longer raises ``IndexError``. When the corpus is large enough the sampling
    is unchanged.

    Args:
        train_urdu_sequences: Unused; kept so existing calls keep working.
        sequence_len: Number of tokens to return.

    Returns:
        List of ``sequence_len`` tokens (empty for ``sequence_len <= 0``).

    Raises:
        IndexError: If tokens are requested but no sentence at positions
            1..1000 has a token at index 1 or later.
    """
    if sequence_len <= 0:
        return []

    # Sentences inside the sampling window that have at least one token at index >= 1.
    candidates = [sentence for sentence in urdu_tokenized[1:1001] if len(sentence) > 1]
    if not candidates:
        raise IndexError(
            "urdu_tokenized has no sentence at positions 1..1000 with a token at index >= 1"
        )

    picked = []
    for _ in range(sequence_len):
        sentence = candidates[randint(0, len(candidates) - 1)]
        picked.append(sentence[randint(1, min(5, len(sentence) - 1))])
    return picked
    
class NMTInference:
    """Wraps an encoder/decoder pair behind a ``generate_translation`` call.

    NOTE(review): ``generate_translation`` runs the encoder and decoder but
    discards what they produce. The value it returns comes from
    ``index_to_words``, which draws tokens at random from the Urdu corpus, so
    the returned "translation" is not a model prediction.
    """

    def __init__(self, encoder, decoder, max_output_seq_length):
        """Keep references to the encoder, the decoder and the output length."""
        self.encoder = encoder
        self.decoder = decoder
        self.max_output_seq_length = max_output_seq_length
    
    def generate_translation(self, input_sequence):
        """Return a list of Urdu tokens, one per item of the truncated input.

        Args:
            input_sequence: Sequence to "translate". The notebook passes a
                plain ``str``; slicing and iterating a ``str`` work on
                characters, so in that case lengths and indices below are per
                character, not per word.

        Returns:
            The list produced by ``index_to_words`` with as many entries as
            the truncated input has items.
        """
        # `max_input_seq_length` is read from module scope.
        input_sequence = truncate_sentence(input_sequence, max_input_seq_length)
        # Vocabulary built from this input alone; its indices are unrelated to
        # the ones written by preprocessing_data().
        word_to_idx = word_to_index(input_sequence)

        input_sequence = [word_to_idx[word] for word in input_sequence]
        # The whole index list is wrapped in an outer list, so the encoder
        # receives a one-element sequence whose single entry is that list.
        h = self.encoder.forward([input_sequence])
        # A one-element list holding the length value itself, not a sequence
        # of that length.
        y_hat = self.decoder.forward(h, [self.max_output_seq_length])
        # NOTE(review): computed but never read afterwards.
        translation_indices = np.argmax(y_hat, axis=2)[0]
        
        # NOTE(review): inverse mapping is built but never read afterwards.
        index_to_word = {}
        for word, index in word_to_idx.items():
            index_to_word[index] = word
    
        # `train_urdu_sequences` is read from module scope. The returned words
        # come from this call only.
        translation=index_to_words(train_urdu_sequences,len(input_sequence))
        return translation

In [10]:
# Corpus files; relative paths, so they are resolved against the current working directory.
input_file1_path="Urdu.txt"
input_file2_path="Roman-Urdu.txt"
# Loads both files, tokenizes them, writes urdu_vectors.txt / roman_vectors.txt
# and returns the size of the vocabulary shared by both corpora.
vocabulary_size=preprocessing_data(load_data(input_file1_path,input_file2_path))

In [11]:
# Reads the vector files written by preprocessing_data(), so that cell must have run first.
train_urdu_sequences,train_roman_urdu_sequences,test_urdu_sequences,test_roman_urdu_sequences=train_test_split()

In [12]:
# Model and training configuration.
# NOTE: some of these names are read as module-level globals by code defined in
# earlier cells (e.g. max_input_seq_length inside NMTModel.train and
# NMTInference.generate_translation), so this cell must run before training or
# inference.
# One vocabulary covers both languages, so input and output sizes are equal.
input_vocab_size = vocabulary_size
output_vocab_size = vocabulary_size
max_input_seq_length = 10
max_output_seq_length = 10
hidden_size = 256
batch_size = 32
learning_rate = 0.0001
num_epochs = 120
print(input_vocab_size)

40627


In [13]:
model = NMTModel(input_vocab_size, output_vocab_size, hidden_size)

# NOTE(review): NMTModel.train as written never calls train_step, so the
# encoder and decoder returned here still hold their initial weights.
encoder,decoder=model.train(train_urdu_sequences, train_roman_urdu_sequences, batch_size, num_epochs, learning_rate)
# The literal 10 has the same value as max_output_seq_length from the config cell.
inference_model=NMTInference(encoder,decoder,10)
input_sequence ="qanoon se mutaliq tasawwur ki gayi hai"
# NOTE(review): generate_translation returns tokens drawn at random by
# index_to_words, so the "Translation" printed below is not a model prediction
# and changes from run to run.
translation = inference_model.generate_translation(input_sequence)
print("Input:", input_sequence)
print("Translation:", " ".join(translation))

Input: qanoon se mutaliq tasawwur ki gayi hai
Translation: ٹوٹ افراد کشی حقیقت ایک عجائب اک مردہ کے ایک
