In [1]:
import numpy as np
import torch
import math
from torch import nn
import torch.nn.functional as F

def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

def scaled_dot_product(q, k, v, mask=None):
    """Compute scaled dot-product attention.

    Scores are ``q @ k^T`` divided by ``sqrt(d_k)``, where ``d_k`` is the size
    of the last dimension of ``q``. The scores are soft-maxed over the last
    dimension and used to weight ``v``.

    Args:
        q, k, v: query, key and value tensors.
        mask: optional additive mask. When given, it is added to the scores
            while their first two dimensions are swapped, and the swap is then
            undone (this requires the scores to be 4-D).

    Returns:
        A ``(values, attention)`` tuple: the weighted values and the softmax
        attention weights.
    """
    key_dim = q.size(-1)
    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(key_dim)
    if mask is not None:
        # Swap dims 0 and 1, add the mask, then swap back.
        scores = (scores.permute(1, 0, 2, 3) + mask).permute(1, 0, 2, 3)
    attention = F.softmax(scores, dim=-1)
    return torch.matmul(attention, v), attention

class PositionalEncoding(nn.Module):
    """Sinusoidal positional-encoding table.

    Calling the module (it takes no inputs) returns a tensor of shape
    ``(max_sequence_length, d_model)`` whose even columns hold
    ``sin(position / 10000^(i / d_model))`` and whose odd columns hold the
    matching cosine, for ``i = 0, 2, 4, ...``.

    Args:
        d_model: width of the encoding (number of columns).
        max_sequence_length: number of positions (rows).
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        """Build and return the ``(max_sequence_length, d_model)`` table."""
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i / self.d_model)
        position = torch.arange(self.max_sequence_length).reshape(self.max_sequence_length, 1)
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        # Interleave sine and cosine columns: (rows, pairs, 2) -> (rows, 2 * pairs).
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # With an odd d_model the interleaving yields d_model + 1 columns; drop the
        # surplus cosine column so the table always matches the embedding width.
        return PE[:, :self.d_model]

class SentenceEmbedding(nn.Module):
    """Turn a batch of raw sentences into position-aware token embeddings.

    Each sentence is split into its elements (characters, for a ``str``),
    mapped to indices through ``language_to_index``, optionally wrapped in
    START/END tokens, truncated or padded to ``max_sequence_length``, embedded,
    summed with the positional encoding and passed through dropout (p=0.1).

    Args:
        max_sequence_length: fixed length every tokenized sentence is cut or
            padded to.
        d_model: embedding width.
        language_to_index: token -> index mapping. If ``UNK_TOKEN`` is missing
            it is added to this mapping in place, with index
            ``len(language_to_index)``.
        START_TOKEN, END_TOKEN, PADDING_TOKEN: special tokens; they are looked
            up in ``language_to_index`` when needed.
        UNK_TOKEN: token whose index replaces any token not in the mapping.
    """

    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN, UNK_TOKEN='<UNK>'):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.language_to_index = language_to_index
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN
        self.UNK_TOKEN = UNK_TOKEN
        # Register UNK *before* sizing the embedding table; otherwise the newly
        # assigned index would point one row past the end of the table.
        if UNK_TOKEN not in language_to_index:
            self.language_to_index[UNK_TOKEN] = len(language_to_index)  # Assign a new index
        self.vocab_size = len(language_to_index)
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)

    def batch_tokenize(self, batch, start_token, end_token):
        """Convert sentences to a ``(len(batch), max_sequence_length)`` index tensor.

        Args:
            batch: sequence of sentences.
            start_token: when truthy, prepend the START token index.
            end_token: when truthy, append the END token index.

        Returns:
            Integer tensor placed on the same device as the embedding weights.
            Sentences longer than ``max_sequence_length`` (after adding the
            special tokens) are cut from the right; shorter ones are padded
            with the PADDING token index.
        """
        unk_index = self.language_to_index[self.UNK_TOKEN]

        def tokenize(sentence, start_token, end_token):
            token_ids = [self.language_to_index.get(token, unk_index) for token in list(sentence)]
            if start_token:
                token_ids.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                token_ids.append(self.language_to_index[self.END_TOKEN])

            # Truncate the sentence if it exceeds the maximum length
            token_ids = token_ids[:self.max_sequence_length]

            # Pad the sentence if it is shorter than the maximum length
            missing = self.max_sequence_length - len(token_ids)
            if missing > 0:
                token_ids.extend([self.language_to_index[self.PADDING_TOKEN]] * missing)

            return torch.tensor(token_ids)

        tokenized = [tokenize(sentence, start_token, end_token) for sentence in batch]
        # Follow the module's own device instead of a global CUDA check, so the
        # indices always live where the embedding table lives.
        return torch.stack(tokenized).to(self.embedding.weight.device)

    def forward(self, x, start_token, end_token): # sentence
        """Embed a batch of sentences; returns ``(batch, max_sequence_length, d_model)``."""
        x = self.batch_tokenize(x, start_token, end_token)
        x = self.embedding(x)
        pos = self.position_encoder().to(x.device)
        x = self.dropout(x + pos)
        return x



class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with one fused query/key/value projection.

    Args:
        d_model: input and output feature width.
        num_heads: number of attention heads; each head works on
            ``d_model // num_heads`` features.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask):
        """Attend ``x`` (batch, sequence, d_model) to itself; ``mask`` is forwarded to the attention kernel."""
        batch_size, sequence_length, _ = x.size()
        # (batch, seq, 3 * d_model) -> (batch, heads, seq, 3 * head_dim)
        fused = self.qkv_layer(x)
        fused = fused.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        fused = fused.permute(0, 2, 1, 3)
        q, k, v = fused.chunk(3, dim=-1)
        values, _attention = scaled_dot_product(q, k, v, mask)
        # Put the heads back next to the features and merge them.
        merged = values.permute(0, 2, 1, 3).reshape(
            batch_size, sequence_length, self.num_heads * self.head_dim
        )
        return self.linear_layer(merged)


class LayerNormalization(nn.Module):
    """Layer normalisation over the trailing ``len(parameters_shape)`` dimensions.

    Args:
        parameters_shape: shape of the learnable scale (``gamma``, initialised
            to ones) and shift (``beta``, initialised to zeros).
        eps: constant added to the variance before the square root.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """Normalise ``inputs`` to zero mean / unit variance, then scale and shift."""
        # Reduce over the last len(parameters_shape) dims: -1, -2, ...
        reduce_dims = [-offset for offset in range(1, len(self.parameters_shape) + 1)]
        mean = inputs.mean(dim=reduce_dims, keepdim=True)
        centered = inputs - mean
        variance = (centered ** 2).mean(dim=reduce_dims, keepdim=True)
        normalised = centered / (variance + self.eps).sqrt()
        return self.gamma * normalised + self.beta


class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output feature width.
        hidden: width of the intermediate layer.
        drop_prob: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Apply the block to the last dimension of ``x``."""
        hidden_activations = self.dropout(self.relu(self.linear1(x)))
        return self.linear2(hidden_activations)


class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by
    dropout, a residual connection and layer normalisation.

    Args:
        d_model: feature width.
        ffn_hidden: hidden width of the feed-forward sublayer.
        num_heads: number of attention heads.
        drop_prob: dropout probability used after both sublayers.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, self_attention_mask):
        """Run both sublayers on ``x``; the mask is passed to self-attention."""
        # Sublayer 1: self-attention + residual + norm.
        skip = x.clone()
        attended = self.dropout1(self.attention(x, mask=self_attention_mask))
        x = self.norm1(attended + skip)

        # Sublayer 2: feed-forward + residual + norm.
        skip = x.clone()
        transformed = self.dropout2(self.ffn(x))
        return self.norm2(transformed + skip)

class SequentialEncoder(nn.Sequential):
    """``nn.Sequential`` variant that hands the same attention mask to every layer."""

    def forward(self, *inputs):
        """Expect exactly ``(x, self_attention_mask)``; return ``x`` after all layers."""
        x, self_attention_mask = inputs
        for layer in self:
            x = layer(x, self_attention_mask)
        return x

class Encoder(nn.Module):
    """Sentence embedding followed by a stack of ``num_layers`` encoder layers.

    Args:
        d_model, ffn_hidden, num_heads, drop_prob: forwarded to every
            ``EncoderLayer``.
        num_layers: number of encoder layers in the stack.
        max_sequence_length, language_to_index, START_TOKEN, END_TOKEN,
        PADDING_TOKEN: forwarded to ``SentenceEmbedding``.
    """

    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN,
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(
            max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN
        )
        encoder_layers = [
            EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            for _ in range(num_layers)
        ]
        self.layers = SequentialEncoder(*encoder_layers)

    def forward(self, x, self_attention_mask, start_token, end_token):
        """Embed the raw sentences ``x`` and run them through the layer stack."""
        embedded = self.sentence_embedding(x, start_token, end_token)
        return self.layers(embedded, self_attention_mask)


class MultiHeadCrossAttention(nn.Module):
    """Multi-head cross-attention: queries come from ``y``, keys/values from ``x``.

    Args:
        d_model: input and output feature width.
        num_heads: number of attention heads; each head works on
            ``d_model // num_heads`` features.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model , 2 * d_model)
        self.q_layer = nn.Linear(d_model , d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, y, mask):
        """Attend ``y`` to ``x``.

        Args:
            x: key/value source, shape ``(batch, source_length, d_model)``.
            y: query source, shape ``(batch, target_length, d_model)``.
            mask: additive mask forwarded to ``scaled_dot_product`` (or None).

        Returns:
            Tensor of shape ``(batch, target_length, d_model)``.
        """
        batch_size, source_length, d_model = x.size()
        # The query keeps its own length; reusing the source length here would
        # break as soon as the two sequences are not the same size.
        target_length = y.size(1)
        kv = self.kv_layer(x)
        q = self.q_layer(y)
        kv = kv.reshape(batch_size, source_length, self.num_heads, 2 * self.head_dim)
        q = q.reshape(batch_size, target_length, self.num_heads, self.head_dim)
        kv = kv.permute(0, 2, 1, 3)
        q = q.permute(0, 2, 1, 3)
        k, v = kv.chunk(2, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, target_length, d_model)
        out = self.linear_layer(values)
        return out


class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder-decoder attention and
    feed-forward, each followed by dropout, a residual connection and layer
    normalisation.

    Args:
        d_model: feature width.
        ffn_hidden: hidden width of the feed-forward sublayer.
        num_heads: number of attention heads.
        drop_prob: dropout probability used after every sublayer.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.encoder_decoder_attention = MultiHeadCrossAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.layer_norm3 = LayerNormalization(parameters_shape=[d_model])
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        """Update the decoder state ``y`` using the encoder output ``x``."""
        # Sublayer 1: self-attention over the decoder state.
        skip = y.clone()
        attended = self.dropout1(self.self_attention(y, mask=self_attention_mask))
        y = self.layer_norm1(attended + skip)

        # Sublayer 2: attention from the decoder state to the encoder output.
        skip = y.clone()
        crossed = self.dropout2(self.encoder_decoder_attention(x, y, mask=cross_attention_mask))
        y = self.layer_norm2(crossed + skip)

        # Sublayer 3: position-wise feed-forward.
        skip = y.clone()
        transformed = self.dropout3(self.ffn(y))
        return self.layer_norm3(transformed + skip)


class SequentialDecoder(nn.Sequential):
    """``nn.Sequential`` variant that threads the decoder state through every
    layer while passing the same encoder output and masks to each of them."""

    def forward(self, *inputs):
        """Expect exactly ``(x, y, self_attention_mask, cross_attention_mask)``; return the final ``y``."""
        x, y, self_attention_mask, cross_attention_mask = inputs
        for layer in self:
            y = layer(x, y, self_attention_mask, cross_attention_mask)
        return y

class Decoder(nn.Module):
    """Target-sentence embedding followed by a stack of ``num_layers`` decoder layers.

    Args:
        d_model, ffn_hidden, num_heads, drop_prob: forwarded to every
            ``DecoderLayer``.
        num_layers: number of decoder layers in the stack.
        max_sequence_length, language_to_index, START_TOKEN, END_TOKEN,
        PADDING_TOKEN: forwarded to ``SentenceEmbedding``.
    """

    def __init__(self,
                 d_model,
                 ffn_hidden,
                 num_heads,
                 drop_prob,
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN,
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(
            max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN
        )
        decoder_layers = [
            DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            for _ in range(num_layers)
        ]
        self.layers = SequentialDecoder(*decoder_layers)

    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token, end_token):
        """Embed the raw target sentences ``y`` and decode them against the encoder output ``x``."""
        embedded = self.sentence_embedding(y, start_token, end_token)
        return self.layers(x, embedded, self_attention_mask, cross_attention_mask)


class Transformer(nn.Module):
    """Encoder-decoder transformer with a final linear projection to target-vocabulary logits.

    Args:
        d_model, ffn_hidden, num_heads, drop_prob, num_layers,
        max_sequence_length: shared by the encoder and the decoder.
        kn_vocab_size: output width of the final linear layer (size of the
            target vocabulary).
        english_to_index: token -> index mapping used by the encoder.
        kannada_to_index: token -> index mapping used by the decoder (the
            target language, whatever the parameter name suggests).
        START_TOKEN, END_TOKEN, PADDING_TOKEN: special tokens shared by both sides.
    """

    def __init__(self,
                d_model,
                ffn_hidden,
                num_heads,
                drop_prob,
                num_layers,
                max_sequence_length,
                kn_vocab_size,
                english_to_index,
                kannada_to_index,
                START_TOKEN,
                END_TOKEN,
                PADDING_TOKEN
                ):
        super().__init__()
        shared_args = (d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length)
        special_tokens = (START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.encoder = Encoder(*shared_args, english_to_index, *special_tokens)
        self.decoder = Decoder(*shared_args, kannada_to_index, *special_tokens)
        self.linear = nn.Linear(d_model, kn_vocab_size)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def forward(self,
                x,
                y,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=True,
                dec_end_token=False):
        """Translate a batch.

        Args:
            x: batch of source sentences.
            y: batch of target sentences.
            encoder_self_attention_mask, decoder_self_attention_mask,
            decoder_cross_attention_mask: optional additive attention masks.
            enc_start_token, enc_end_token: whether the encoder input is
                wrapped in START / END tokens.
            dec_start_token, dec_end_token: same for the decoder input; the
                decoder gets a START token by default.

        Returns:
            Logits over the target vocabulary for every decoder position.
        """
        encoded = self.encoder(
            x, encoder_self_attention_mask, start_token=enc_start_token, end_token=enc_end_token
        )
        decoded = self.decoder(
            encoded, y, decoder_self_attention_mask, decoder_cross_attention_mask,
            start_token=dec_start_token, end_token=dec_end_token
        )
        return self.linear(decoded)

In [2]:
# Runs only when this code executes as the `__main__` module, and then imports
# the `Transformer` name from `__main__` itself.
# NOTE(review): looks redundant when `Transformer` is already defined in the
# same `__main__` namespace by the cell above — confirm before removing.
if __name__ == "__main__":
    from __main__ import Transformer

In [3]:
import torch
import numpy as np
from pprint import pprint

In [None]:
# import re

# # Open the file and read the content
# with open('norm_english.txt', 'r', encoding='utf-8') as file:
#     lines = file.readlines()

# # Process the lines
# cleaned_lines = []
# for line in lines:
#     # Strip whitespace
#     line = line.strip()
#     # Skip empty or irrelevant lines
#     if not line or line.isspace():
#         continue
#     # Optional: Filter for English characters (A-Z, a-z, space, punctuation)
#     if any(char.isalpha() or char in [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
#                                       ':', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']
#            for char in line):
#         # Remove unnecessary double quotes within the text
#         line = re.sub(r'(?<!\w)"|"(?!\w)', '', line)
#         # Remove extra spaces (e.g., multiple spaces between words)
#         line = re.sub(r'\s+', ' ', line)
#         # Remove non-ASCII characters if any
#         line = re.sub(r'[^\x00-\x7F]+', '', line)
#         # Optionally, make the text lowercase if needed
#         line = line.lower()
#         cleaned_lines.append(line)

# # Write the cleaned content to a new file
# with open('cleaned_english.txt', 'w', encoding='utf-8') as file:
#     file.write("\n".join(cleaned_lines))

# print(f"File cleaned successfully. {len(cleaned_lines)} lines retained.")


In [None]:
# import re

# # Open the file and read the content
# with open('norm_tamil.txt', 'r', encoding='utf-8') as file:
#     lines = file.readlines()

# # Process the lines
# cleaned_lines = []
# skipped_lines = []

# for line in lines:
#     # Strip whitespace
#     original_line = line.strip()
#     if not original_line or original_line.isspace():
#         skipped_lines.append(line)
#         continue

#     # Filter for Tamil characters (Unicode range U+0B80–U+0BFF)
#     cleaned_line = re.sub(r'[^\u0B80-\u0BFF\s]', '', original_line)  # Keep only Tamil characters and spaces
#     if cleaned_line:
#         cleaned_lines.append(cleaned_line)
#     else:
#         skipped_lines.append(original_line)

# # Write the cleaned content to a new file
# with open('cleaned_tamil.txt', 'w', encoding='utf-8') as file:
#     file.write("\n".join(cleaned_lines))

# # Display skipped lines
# # print("Skipped lines:")
# # for line in skipped_lines:
# #     print(line.strip())

# print(f"File cleaned successfully. {len(cleaned_lines)} lines retained, {len(skipped_lines)} lines skipped.")




In [None]:
# import re

# # Open the file and read the content
# with open('norm_tamil.txt', 'r', encoding='utf-8') as file:
#     lines = file.readlines()

# # Process the lines
# cleaned_lines = []
# skipped_lines = []

# for line in lines:
#     # Strip whitespace
#     original_line = line.strip()
#     if not original_line or original_line.isspace():
#         skipped_lines.append(line)
#         continue

#     # Filter for Tamil characters (Unicode range U+0B80–U+0BFF), spaces, and digits
#     cleaned_line = re.sub(r'[^\u0B80-\u0BFF\s\d]', '', original_line)  # Keep Tamil characters, spaces, and numbers
#     if cleaned_line:
#         cleaned_lines.append(cleaned_line)
#     else:
#         skipped_lines.append(original_line)

# # Write the cleaned content to a new file
# with open('cleaned_tamil.txt', 'w', encoding='utf-8') as file:
#     file.write("\n".join(cleaned_lines))

# # # Display skipped lines
# # print("Skipped lines:")
# # for line in skipped_lines:
# #     print(line.strip())

# print(f"File cleaned successfully. {len(cleaned_lines)} lines retained, {len(skipped_lines)} lines skipped.")

In [None]:
# import re

# # Open the file and read the content
# with open('norm_tamil.txt', 'r', encoding='utf-8') as file:
#     lines = file.readlines()

# # Process the lines
# cleaned_lines = []
# skipped_lines = []
# skipped_indices = []  # List to keep track of the indices of skipped lines

# for index, line in enumerate(lines):
#     # Strip whitespace
#     original_line = line.strip()
#     if not original_line or original_line.isspace():
#         skipped_lines.append(original_line)
#         skipped_indices.append(index)
#         continue

#     # Filter for Tamil characters (Unicode range U+0B80–U+0BFF), spaces, and digits
#     cleaned_line = re.sub(r'[^\u0B80-\u0BFF\s\d]', '', original_line)
#     if cleaned_line:
#         cleaned_lines.append(cleaned_line)
#     else:
#         skipped_lines.append(original_line)
#         skipped_indices.append(index)

# # Write the cleaned content to a new file
# with open('cleaned_tamil.txt', 'w', encoding='utf-8') as file:
#     file.write("\n".join(cleaned_lines))

# # Print out skipped lines and their indices
# print("Skipped lines with indices:")
# for line, index in zip(skipped_lines, skipped_indices):
#     print(f"Index {index}: {line}")

# print(f"File cleaned successfully. {len(cleaned_lines)} lines retained, {len(skipped_lines)} lines skipped.")


In [4]:
import re

# Clean the parallel corpus line by line: the Tamil side is filtered down to an
# allowed character set, and a pair is dropped when nothing is left of the Tamil
# line. Both outputs are written with one sentence per line, so line i of
# cleaned_tamil.txt pairs with line i of cleaned_english.txt.

# Open both files and try a different encoding if UTF-8 fails
try:
    with open('norm_tamil.txt', 'r', encoding='utf-8') as tamil_file:
        tamil_lines = tamil_file.readlines()
except UnicodeDecodeError:
    # NOTE(review): ISO-8859-1 maps every byte to a character in U+0000-U+00FF,
    # so nothing read this way can fall in the Tamil range kept below — confirm
    # this fallback is really wanted.
    with open('norm_tamil.txt', 'r', encoding='ISO-8859-1') as tamil_file:
        tamil_lines = tamil_file.readlines()

with open('norm_english.txt', 'r', encoding='utf-8') as english_file:
    english_lines = english_file.readlines()

# Process the lines
cleaned_tamil_lines = []
cleaned_english_lines = []
skipped_indices = []

assert len(tamil_lines) == len(english_lines), "Files do not have the same number of lines."

for index, (tamil_line, english_line) in enumerate(zip(tamil_lines, english_lines)):
    original_tamil_line = tamil_line.strip()
    original_english_line = english_line.strip()
    # Keep Tamil-block characters (U+0B80-U+0BFF), whitespace, digits,
    # the punctuation ? . ! , ; and uppercase A-Z; drop everything else.
    cleaned_tamil_line = re.sub(r'[^\u0B80-\u0BFF\s\d\?\.!,;A-Z]', '', original_tamil_line)
    # Removing characters can leave only whitespace behind (e.g. a line made of
    # lowercase Latin words). Strip again so such lines count as empty and are
    # skipped instead of being kept as blank translation targets.
    cleaned_tamil_line = cleaned_tamil_line.strip()
    if cleaned_tamil_line:
        cleaned_tamil_lines.append(cleaned_tamil_line)
        cleaned_english_lines.append(original_english_line)
    else:
        skipped_indices.append(index)

with open('cleaned_tamil.txt', 'w', encoding='utf-8') as tamil_file:
    tamil_file.write("\n".join(cleaned_tamil_lines))
with open('cleaned_english.txt', 'w', encoding='utf-8') as english_file:
    english_file.write("\n".join(cleaned_english_lines))

print("Skipped indices and lines:")
for index in skipped_indices:
    print(f"Index {index}: Tamil -> '{tamil_lines[index].strip()}', English -> '{english_lines[index].strip()}'")

print(f"File cleaned successfully. {len(cleaned_tamil_lines)} Tamil lines retained, {len(cleaned_english_lines)} English lines retained, {len(skipped_indices)} lines skipped.")


Skipped indices and lines:
File cleaned successfully. 102979 Tamil lines retained, 102979 English lines retained, 0 lines skipped.


In [5]:
# Paths of the cleaned corpus files written by the cleaning cell above.
english_file = 'cleaned_english.txt'
tamil_file = 'cleaned_tamil.txt'

# Special tokens shared by both vocabularies.
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'
UNK_TOKEN = '<UNK>'  # Stand-in for any token that is not in a vocabulary

# Character-level Tamil vocabulary. The position of each entry in this list is
# the token id used by the index maps built from it, so the order must not change.
# NOTE(review): the cleaning cell keeps uppercase A-Z on the Tamil side, but no
# Latin letters are listed here, so such sentences fail the token-validity filter.
tamil_vocabulary = [
    UNK_TOKEN,  # First entry, i.e. id 0
    START_TOKEN, ' ', '!', '\"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    ':', ';', '<', '=', '>', '?', '@',
    '[', '\\', ']', '^', '_', '`', '’',
    'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ',
    'க', 'ங', 'ச', 'ஜ', 'ஞ', 'ட', 'ண', 'த', 'ந', 'ன', 'ப', 'ம', 'ய', 'ர', 'ற', 'ல', 'ள', 'ழ', 'வ', 'ஷ', 'ஸ', 'ஹ',
    'ா', 'ி', 'ீ', 'ு', 'ூ', 'ெ', 'ே', 'ை', 'ொ', 'ோ', 'ௌ', '்',
    '௦', '௧', '௨', '௩', '௪', '௫', '௬', '௭', '௮', '௯',
    '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN
]

# Character-level English vocabulary (lowercase letters only). As above, the
# list position of each entry is its token id.
english_vocabulary = [
    UNK_TOKEN,  # First entry, i.e. id 0
    START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    ':', '<', '=', '>', '?', '@', ';',
    '[', '\\', ']', '^', '_', '`', '’',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
    'y', 'z',
    '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN
]


In [6]:
# Lookup tables in both directions; a token's id is its position in the vocabulary list.
index_to_tamil = dict(enumerate(tamil_vocabulary))
tamil_to_index = {token: position for position, token in enumerate(tamil_vocabulary)}
index_to_english = dict(enumerate(english_vocabulary))
english_to_index = {token: position for position, token in enumerate(english_vocabulary)}


In [7]:
# Limit Number of sentences
TOTAL_SENTENCES = 200000

# Read both sides of the corpus, keeping at most TOTAL_SENTENCES lines of each.
with open(english_file, 'r', encoding='utf-8') as file:
    english_sentences = file.readlines()[:TOTAL_SENTENCES]
with open(tamil_file, 'r', encoding='utf-8') as file:
    tamil_sentences = file.readlines()[:TOTAL_SENTENCES]

# Drop the trailing newline from every line; English is additionally lowercased.
english_sentences = [line.rstrip('\n').lower() for line in english_sentences]
tamil_sentences = [line.rstrip('\n') for line in tamil_sentences]


In [None]:
# Spot-check one English sentence; compare with the Tamil sentence at the same index.
english_sentences[19990]

"very well , thanks . nice to see you again . i haven't seen you for a long time . what have you been doing lately ?"

In [None]:
# Spot-check the Tamil sentence at the same index as the English one above.
tamil_sentences[19990]

'மிகவும் நல்லது, நன்றி. உங்களை மீண்டும் சந்திப்பதில் மகிழ்ச்சி. நான் உன்னை நீண்ட நாட்களாக பார்க்கவில்லை. நீங்கள் சமீபத்தில் என்ன செய்து கொண்டிருந்தீர்கள்?'

In [8]:
import numpy as np
PERCENTILE = 97

def _percentile_length(sentences, percentile):
    """Return the given percentile of the lengths (``len``) of ``sentences``."""
    return np.percentile([len(x) for x in sentences], percentile)

# Report how long sentences get on each side of the corpus.
print( f"{PERCENTILE}th percentile length Tamil: {_percentile_length(tamil_sentences, PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {_percentile_length(english_sentences, PERCENTILE)}" )


97th percentile length Tamil: 212.0
97th percentile length English: 181.0


In [9]:
# Length limit used by the sentence-length filter below.
max_sequence_length = 200

def is_valid_tokens(sentence, vocab):
    """Return True when every distinct token of ``sentence`` is contained in ``vocab``."""
    return all(token in vocab for token in set(sentence))

def is_valid_length(sentence, max_sequence_length):
    """Return True when ``sentence`` has fewer than ``max_sequence_length - 1`` tokens."""
    token_count = len(list(sentence))
    limit = max_sequence_length - 1
    return token_count < limit

# Collect the indices of sentence pairs where both sides are short enough and
# the Tamil side uses only characters from the Tamil vocabulary.
valid_sentence_indicies = []
for index in range(len(tamil_sentences)):
    tamil_sentence = tamil_sentences[index]
    english_sentence = english_sentences[index]
    if not is_valid_length(tamil_sentence, max_sequence_length):
        continue
    if not is_valid_length(english_sentence, max_sequence_length):
        continue
    if not is_valid_tokens(tamil_sentence, tamil_vocabulary):
        continue
    valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(tamil_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")


Number of sentences: 102979
Number of valid sentences: 97827


In [10]:
# Keep only the sentence pairs whose indices passed the validity filter.
# NOTE: this rebinds both lists, so the cell is not idempotent — running it a
# second time applies the old indices to the already-filtered lists.
tamil_sentences = [tamil_sentences[i] for i in valid_sentence_indicies]
english_sentences = [english_sentences[i] for i in valid_sentence_indicies]

In [None]:
# Preview the first five Tamil sentences that survived filtering.
tamil_sentences[:5]

['ஏய் மனிதனே, நீங்கள் கொஞ்சம் களை வாங்க விரும்புகிறீர்களா?',
 'சில என்ன?',
 'களை ! உனக்கு தெரியுமா ? பானை, கஞ்சா, மேரி ஜேன் சில நாள்பட்டது!',
 'ஓ , ம்ம் , இல்லை நன்றி .',
 'நீங்கள் ஒரு சில வரிகளை செய்ய விரும்பினால் எனக்கும் அடி உள்ளது.']

In [11]:
import torch

# Model and training hyperparameters.
d_model = 512
batch_size = 30
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 2
max_sequence_length = 200
tm_vocab_size = len(tamil_vocabulary)

# English -> Tamil model. The `kn_vocab_size` / `kannada_to_index` parameters
# receive the Tamil vocabulary size and the Tamil token map.
transformer = Transformer(
    d_model=d_model,
    ffn_hidden=ffn_hidden,
    num_heads=num_heads,
    drop_prob=drop_prob,  # drop out rate to prevent overfitting
    num_layers=num_layers,
    max_sequence_length=max_sequence_length,
    kn_vocab_size=tm_vocab_size,
    english_to_index=english_to_index,
    kannada_to_index=tamil_to_index,
    START_TOKEN=START_TOKEN,
    END_TOKEN=END_TOKEN,
    PADDING_TOKEN=PADDING_TOKEN,
)


In [12]:
# Display the module tree of the freshly built model.
transformer

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(74, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (attention): MultiHeadAt

In [13]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Parallel-corpus dataset that yields ``(english, tamil)`` sentence pairs by index."""

    def __init__(self, english_sentences, tamil_sentences):
        self.english_sentences, self.tamil_sentences = english_sentences, tamil_sentences

    def __len__(self):
        # The dataset length is taken from the English side.
        return len(self.english_sentences)

    def __getitem__(self, idx):
        english = self.english_sentences[idx]
        tamil = self.tamil_sentences[idx]
        return english, tamil


In [14]:
# Wrap the filtered sentence lists so a DataLoader can batch them as (english, tamil) pairs.
dataset = TextDataset(english_sentences, tamil_sentences)

In [15]:
# `shuffle` is left at the DataLoader default, so batches follow dataset order.
# NOTE(review): consider shuffle=True for training — confirm the intent.
train_loader = DataLoader(dataset, batch_size)
# NOTE(review): the training cell creates its own iterator at the start of
# every epoch, so this one appears unused in the visible code.
iterator = iter(train_loader)


In [16]:
from torch import nn

# Per-token cross-entropy (reduction='none' keeps one loss value per position);
# positions whose label is the padding token are ignored.
criterian = nn.CrossEntropyLoss(ignore_index=tamil_to_index[PADDING_TOKEN],
                                reduction='none')

# Re-initialise every parameter that has more than one dimension with Xavier
# uniform; one-dimensional parameters are left as constructed.
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
# Device the model and the masks are moved to in the training cell.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [17]:
NEG_INFTY = -1e9

def create_masks(eng_batch, tm_batch):
    """Build the additive attention masks for one batch.

    A position ``p`` of a sentence counts as padding when ``p`` is greater than
    the sentence length (``len`` of the raw sentence). Masked entries hold
    ``NEG_INFTY``, all other entries hold 0. Reads the module-level
    ``max_sequence_length``.

    Args:
        eng_batch: sequence of source sentences.
        tm_batch: sequence of target sentences (indexed like ``eng_batch``).

    Returns:
        ``(encoder_self_attention_mask, decoder_self_attention_mask,
        decoder_cross_attention_mask)``, each of shape
        ``(len(eng_batch), max_sequence_length, max_sequence_length)``:
        * encoder: rows and columns at source padding positions are masked;
        * decoder self: look-ahead (column > row) plus rows and columns at
          target padding positions are masked;
        * decoder cross: columns at source padding positions and rows at
          target padding positions are masked.
    """
    num_sentences = len(eng_batch)
    positions = torch.arange(max_sequence_length)

    eng_lengths = torch.tensor([len(eng_batch[i]) for i in range(num_sentences)], dtype=torch.long)
    tm_lengths = torch.tensor([len(tm_batch[i]) for i in range(num_sentences)], dtype=torch.long)

    # (num_sentences, max_sequence_length): True where the position index
    # exceeds the sentence length, i.e. index >= length + 1.
    eng_is_padding = positions.unsqueeze(0) > eng_lengths.unsqueeze(1)
    tm_is_padding = positions.unsqueeze(0) > tm_lengths.unsqueeze(1)

    # Broadcast helpers: "columns" mark keys, "rows" mark queries.
    eng_columns, eng_rows = eng_is_padding.unsqueeze(1), eng_is_padding.unsqueeze(2)
    tm_columns, tm_rows = tm_is_padding.unsqueeze(1), tm_is_padding.unsqueeze(2)

    encoder_padding_mask = eng_columns | eng_rows
    decoder_padding_mask_self_attention = tm_columns | tm_rows
    decoder_padding_mask_cross_attention = eng_columns | tm_rows

    # Strict upper triangle: a query may not look at later positions.
    look_ahead_mask = torch.triu(
        torch.full([max_sequence_length, max_sequence_length], True), diagonal=1
    )

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask = torch.where(
        look_ahead_mask | decoder_padding_mask_self_attention, NEG_INFTY, 0
    )
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask


In [44]:
# Teacher-forced training loop. Every 100 batches it prints the loss, the
# model's prediction for the first sentence of the batch, and a greedy
# translation of a fixed probe sentence.
transformer.train()
transformer.to(device)
total_loss = 0  # NOTE(review): never updated in this cell
num_epochs = 10

for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    iterator = iter(train_loader)
    for batch_num, batch in enumerate(iterator):
        # The periodic evaluation below switches to eval mode, so training
        # mode is restored at the start of every step.
        transformer.train()
        eng_batch, tm_batch = batch
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, tm_batch)
        optim.zero_grad()
        tm_predictions = transformer(eng_batch,
                                     tm_batch,
                                     encoder_self_attention_mask.to(device),
                                     decoder_self_attention_mask.to(device),
                                     decoder_cross_attention_mask.to(device),
                                     enc_start_token=False,
                                     enc_end_token=False,
                                     dec_start_token=True,
                                     dec_end_token=True)
        # Labels carry no START token but do carry END, so position i of the
        # labels lines up with position i + 1 of the decoder input.
        labels = transformer.decoder.sentence_embedding.batch_tokenize(tm_batch, start_token=False, end_token=True)
        loss = criterian(
            tm_predictions.view(-1, tm_vocab_size).to(device),
            labels.view(-1).to(device)
        ).to(device)
        # Average the per-token loss over the non-padding positions only.
        valid_indicies = torch.where(labels.view(-1) == tamil_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / valid_indicies.sum()
        loss.backward()
        optim.step()
        #train_losses.append(loss.item())
        if batch_num % 100 == 0:
            print(f"Iteration {batch_num} : {loss.item()}")
            print(f"English: {eng_batch[0]}")
            print(f"Tamil Translation: {tm_batch[0]}")
            tm_sentence_predicted = torch.argmax(tm_predictions[0], axis=1)
            predicted_sentence = ""
            for idx in tm_sentence_predicted:
                if idx == tamil_to_index[END_TOKEN]:
                    break
                predicted_sentence += index_to_tamil[idx.item()]
            print(f"Tamil Prediction: {predicted_sentence}")

            # Greedy decoding of a fixed probe sentence, one token per forward
            # pass. Gradients are not needed here, so autograd is disabled to
            # avoid building a graph for each of the up to max_sequence_length
            # forward passes.
            transformer.eval()
            tm_sentence = ("",)
            eng_sentence = ("hello how are you?",)
            with torch.no_grad():
                for word_counter in range(max_sequence_length):
                    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, tm_sentence)
                    predictions = transformer(eng_sentence,
                                              tm_sentence,
                                              encoder_self_attention_mask.to(device),
                                              decoder_self_attention_mask.to(device),
                                              decoder_cross_attention_mask.to(device),
                                              enc_start_token=False,
                                              enc_end_token=False,
                                              dec_start_token=True,
                                              dec_end_token=False)
                    next_token_prob_distribution = predictions[0][word_counter] # not actual probs
                    next_token_index = torch.argmax(next_token_prob_distribution).item()
                    next_token = index_to_tamil[next_token_index]
                    tm_sentence = (tm_sentence[0] + next_token, )
                    if next_token == END_TOKEN:
                        break

            print(f"Evaluation translation (hello how are you?) : {tm_sentence}")
            print("-------------------------------------------")
            print("-------------------------------------------")


Epoch 0
Iteration 0 : 4.056656360626221
English: hey man , you wanna buy some weed ?
Tamil Translation: ஏய் மனிதனே, நீங்கள் கொஞ்சம் களை வாங்க விரும்புகிறீர்களா?
Tamil Prediction: ை ்் ்்்்்்்்  ்்      ்்்்்் ் ்      ்  ்்்்்் ் ் ்்்்்க் ்்்    ்க  ு  ்ு   ு்ு்் ் ்   ்் ் ்் ் ்்   ்்ு   ்்்்்்             ் ்       ் ்க ்கத ்ு ்             ் ்்்  ு்ு்்ு்்்்்் ்்்்்்்் ் ்
Evaluation translation (hello how are you?) : ('்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்கக்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்ு்்்்்்்்்்்்்்்்்்்்்்்்்்க்்்்்்்்்்்்்்்்்கக்்்்ககக்்்்்்்்்ுுுக்்்்்ககுுு்்்்்்்்்்்்்்்்்ு்்்்்்ுு்்்்்்்்்்்்்்்்்்்்்',)
-------------------------------------------


KeyboardInterrupt: 

In [45]:
# Switch the model to evaluation mode before translating.
transformer.eval()
def translate(eng_sentence):
  """Greedily translate one English sentence using the notebook's global model.

  Decoding is autoregressive: on every step the text generated so far is fed
  back through ``transformer`` and the highest-scoring token at the current
  position is appended, until END_TOKEN is produced or
  ``max_sequence_length`` steps have run.

  Depends on names defined in earlier cells: ``transformer``,
  ``create_masks``, ``device``, ``max_sequence_length``, ``index_to_tamil``
  and ``END_TOKEN``.

  Args:
    eng_sentence: English source sentence as a plain string.

  Returns:
    The generated string. If the model emits END_TOKEN, that token is kept
    as the last piece of the returned string.
  """
  eng_sentence = (eng_sentence,)  # the model is called with 1-tuples of strings
  tamil_sentence = ("",)
  # Inference only: without no_grad every decoding step would record an
  # autograd graph that is never used, wasting memory and time.
  with torch.no_grad():
    for word_counter in range(max_sequence_length):
      # Masks depend on the partial output, so they are rebuilt each step.
      (encoder_self_attention_mask,
       decoder_self_attention_mask,
       decoder_cross_attention_mask) = create_masks(eng_sentence, tamil_sentence)
      predictions = transformer(eng_sentence,
                                tamil_sentence,
                                encoder_self_attention_mask.to(device),
                                decoder_self_attention_mask.to(device),
                                decoder_cross_attention_mask.to(device),
                                enc_start_token=False,
                                enc_end_token=False,
                                dec_start_token=True,
                                dec_end_token=False)
      # Scores for the position being generated (first and only batch item).
      next_token_scores = predictions[0][word_counter]
      next_token_index = torch.argmax(next_token_scores).item()
      next_token = index_to_tamil[next_token_index]
      tamil_sentence = (tamil_sentence[0] + next_token,)
      if next_token == END_TOKEN:
        break
  return tamil_sentence[0]

In [None]:
# Smoke-test the greedy decoder on a single English sentence.
translation = translate("what are you going to do?")
print(translation)

என்ன செய்யப் போகிறாய் ?<END>


In [46]:
import sys
sys.path.append("C:/Users/nithe/Unity/AR_Translator/Translator/Deepspeech/mic_vad_streaming")

from mic_vad_streaming import VADAudio


In [54]:
def translate_to_tamil(english_text, transformer_model, _, tamil_index_to_word, config):
    """Greedily decode a translation of ``english_text`` with ``transformer_model``.

    Decoding is autoregressive: each step feeds the text generated so far back
    through the model and appends the highest-scoring token at the current
    position. It stops when the END token is produced or after
    ``config["max_sequence_length"]`` steps.

    Relies on ``create_masks`` being defined in an earlier cell.

    Args:
        english_text: Source sentence as a plain string.
        transformer_model: Callable model, invoked as
            ``model(src, tgt, enc_mask, dec_self_mask, dec_cross_mask, ...)``.
        _: Ignored; kept so existing positional callers keep working.
        tamil_index_to_word: Mapping from a predicted index to its token string.
        config: Mapping with the keys ``"max_sequence_length"``, ``"device"``
            and ``"END_TOKEN"``.

    Returns:
        The generated string. If the END token is produced it is kept as the
        last piece of the returned string.
    """
    max_steps = config["max_sequence_length"]
    device = config["device"]
    end_token = config["END_TOKEN"]

    source_batch = (english_text,)  # the model is called with 1-tuples of strings
    tamil_sentence = ("",)
    # Inference only: without no_grad every decoding step would record an
    # autograd graph that is never used, wasting memory and time.
    with torch.no_grad():
        for word_counter in range(max_steps):
            # Masks depend on the partial output, so they are rebuilt each step.
            enc_mask, dec_self_mask, dec_cross_mask = create_masks(source_batch, tamil_sentence)
            predictions = transformer_model(
                source_batch,
                tamil_sentence,
                enc_mask.to(device),
                dec_self_mask.to(device),
                dec_cross_mask.to(device),
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=True,
                dec_end_token=False,
            )
            # Scores for the position being generated (first and only batch item).
            next_token_index = torch.argmax(predictions[0][word_counter]).item()
            next_token = tamil_index_to_word[next_token_index]
            tamil_sentence = (tamil_sentence[0] + next_token,)
            if next_token == end_token:
                break
    return tamil_sentence[0]


In [55]:
def start_deepspeech_translating_from_notebook(model_path, scorer_path, transformer_model,
                                               english_to_index, tamil_index_to_word, config,
                                               device_index=None, rate=16000):
    """Transcribe microphone speech, translate it and send each result over UDP.

    Audio frames from ``VADAudio.vad_collector()`` are fed to a DeepSpeech
    stream. A ``None`` frame marks the end of an utterance: the stream is
    finished, non-empty text is stripped, lower-cased and translated with
    ``translate_to_tamil``, and the result is sent UTF-8 encoded to
    127.0.0.1:5065. A fresh stream is then opened for the next utterance.

    The loop ends when the frame source is exhausted or on KeyboardInterrupt.
    The UDP socket and the audio device are released on every exit path.

    Args:
        model_path: Path handed to ``deepspeech.Model``.
        scorer_path: External scorer path; skipped when falsy.
        transformer_model: Model forwarded to ``translate_to_tamil``.
        english_to_index: Forwarded unchanged to ``translate_to_tamil``.
        tamil_index_to_word: Index-to-token mapping forwarded to
            ``translate_to_tamil``.
        config: Settings mapping forwarded to ``translate_to_tamil``.
        device_index: Passed to ``VADAudio`` as ``device``.
        rate: Passed to ``VADAudio`` as ``input_rate``.
    """
    import socket

    import deepspeech
    import numpy as np
    from mic_vad_streaming import VADAudio

    UDP_IP = "127.0.0.1"  # or the IP of the machine running Unity
    UDP_PORT = 5065       # port number Unity will listen on

    print('Initializing DeepSpeech model...')
    model = deepspeech.Model(model_path)
    if scorer_path:
        model.enableExternalScorer(scorer_path)

    vad_audio = VADAudio(aggressiveness=3, device=device_index, input_rate=rate)
    try:
        # The context manager closes the socket however the loop ends.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            print("🎤 Listening... Speak into your mic (Ctrl+C to stop)")
            stream_context = model.createStream()

            for frame in vad_audio.vad_collector():
                if frame is not None:
                    stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
                    continue

                # A None frame ends the utterance: decode what was collected.
                english_text = stream_context.finishStream()
                if english_text.strip():
                    print(f"\n🔊 Recognized (EN): {english_text}")
                    tamil_text = translate_to_tamil(
                        english_text.strip().lower(),
                        transformer_model,
                        english_to_index,
                        tamil_index_to_word,
                        config
                    )
                    print(f"🌐 Translated (TA): {tamil_text}")
                    sock.sendto(tamil_text.encode('utf-8'), (UDP_IP, UDP_PORT))
                    print(f"📡 Sent to Unity: '{tamil_text}' → {UDP_IP}:{UDP_PORT}")

                # A finished stream cannot be reused; open one for the next utterance.
                stream_context = model.createStream()
    except KeyboardInterrupt:
        print("🛑 Stopped listening")
    finally:
        # Release the microphone on normal exit and on errors, not only on Ctrl+C.
        vad_audio.destroy()


In [56]:
# Settings bundle read by translate_to_tamil: the torch device the masks are
# moved to, the maximum number of decoding steps, and the token that ends decoding.
# max_sequence_length and END_TOKEN must already exist from earlier cells.
config = dict(
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    max_sequence_length=max_sequence_length,
    END_TOKEN=END_TOKEN,
)

In [41]:
# python mic_vad_streaming.py -m C:/Users/nithe/Unity/AR_Translator/Translator/Deepspeech/deepspeech-0.9.3-models.pbmm -s C:/Users/nithe/Unity/AR_Translator/Translator/Deepspeech/deepspeech-0.9.3-models.scorer

In [None]:
# Print the audio devices that sounddevice can see.
# NOTE(review): presumably used to pick the device_index passed to
# start_deepspeech_translating_from_notebook — confirm.
import sounddevice as sd
print(sd.query_devices())

In [57]:
# Start the microphone -> DeepSpeech -> transformer -> UDP loop.
# NOTE(review): the model/scorer paths are absolute paths on one machine and
# must be adjusted before running elsewhere.
start_deepspeech_translating_from_notebook(
    model_path="C:/Users/nithe/Unity/AR_Translator/Translator/Deepspeech/deepspeech-0.9.3-models.pbmm",
    scorer_path="C:/Users/nithe/Unity/AR_Translator/Translator/Deepspeech/deepspeech-0.9.3-models.scorer",
    transformer_model=transformer,
    english_to_index=None,  # translate_to_tamil ignores this argument
    tamil_index_to_word=index_to_tamil,
    config=config,
    device_index=3  # NOTE(review): machine-specific input device index — confirm
)


Initializing DeepSpeech model...
🎤 Listening... Speak into your mic (Ctrl+C to stop)

🔊 Recognized (EN): how are you doing
🌐 Translated (TA): ந்்்     ்   ்்்்்்்்்்்்்்்்<END>
📡 Sent to Unity: 'ந்்்     ்   ்்்்்்்்்்்்்்்்<END>' → 127.0.0.1:5065
🛑 Stopped listening
