In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable 
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import unicodedata
import string
import re
import random
import time
import math
import os
from tqdm import tqdm
%matplotlib inline 

# Device name, hard-coded to the CPU (no CUDA detection is performed here).
# NOTE(review): presumably passed to torch tensors/modules later - its consumers are not visible in this section.
device = "cpu"

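Define the SOS/EOS token indices, the maximum sentence length, and the text-normalization helper functions (the device is set in the imports cell above).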

In [3]:
# Reserved vocabulary indices; LanguageProcessor.itos maps 0 -> "SOS" and 1 -> "EOS".
SOS, EOS = 0, 1
# Sentence-length limit. NOTE(review): in this section it is only referenced by the
# commented-out filter_pair, where it bounds the word count - confirm other uses.
MAX_LENGTH = 10

def convert_unicode_ascii(unicode_str):
    """Return ``unicode_str`` with combining accent marks removed.

    The text is decomposed with Unicode NFD normalization and every character
    whose category is 'Mn' (nonspacing mark) is dropped. Characters that do
    not decompose are kept as they are.
    """
    decomposed = unicodedata.normalize('NFD', unicode_str)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue  # combining mark left over from the decomposition
        kept.append(ch)
    return ''.join(kept)

def normalize_string(s):
    """Normalize a sentence to lowercase letters plus the tokens '.', '!' and '?'.

    Steps:
      1. Lowercase, trim, and strip accents via ``convert_unicode_ascii``.
      2. Put a space in front of every '.', '!' or '?' so each one becomes a
         separate token when the sentence is split on spaces.
      3. Replace every run of remaining non-letter characters with one space.
      4. Trim again: steps 2 and 3 can leave a space at either end (e.g. for
         input starting with a digit or a punctuation mark), which would turn
         into an empty-string token under ``split(' ')``.

    Args:
        s: Raw sentence.

    Returns:
        The normalized sentence, without leading or trailing whitespace.
    """
    s = convert_unicode_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)  # detach sentence punctuation from the preceding word
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)  # collapse anything that is not a letter or . ! ? into one space
    return s.strip()

Language class: it maintains the word-to-index (string → integer) and index-to-word (integer → string) mappings for one language. We create one instance for each of the two languages.

In [4]:
class LanguageProcessor:
    """Vocabulary of one language: word <-> index mappings plus word counts.

    Indices 0 and 1 are reserved for the "SOS" and "EOS" tokens, so the first
    indexed word receives index 2.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.stoi = {}  # string -> integer index
        self.itos = {0: "SOS", 1: "EOS"}  # integer index -> string
        self.wordFrequency = {}  # word -> number of times it has been indexed
        self.num_words = 2  # next free index (0 and 1 are taken by SOS/EOS)

    def index_word(self, word):
        """Record one occurrence of ``word``.

        A word seen for the first time gets the next free index and a count
        of 1; a word seen before only has its count incremented.
        """
        if word not in self.stoi:
            # First occurrence: create both mappings under a fresh index.
            self.stoi[word] = self.num_words
            self.wordFrequency[word] = 1
            self.itos[self.num_words] = word
            self.num_words += 1
        else:
            # Repeat occurrence: bump the count only. The index stored in
            # stoi must stay fixed, otherwise stoi and itos stop agreeing.
            self.wordFrequency[word] += 1

    def index_words(self, sentence):
        """Index every token of ``sentence``, splitting on single spaces."""
        for word in sentence.split(' '):
            self.index_word(word)


Process the data: read the corpus, normalize each line into a sentence pair, and index the words of both languages.

In [6]:
def process_languages(lang1, lang2, reverse=False, file_path='fra.txt'):
    """Read a tab-separated parallel corpus and create empty vocabularies.

    Each line of the file is expected to hold at least two tab-separated
    columns. Only the first two are used, so any extra column cannot end up
    in a pair (and cannot become the input side when ``reverse`` is True).
    Lines with fewer than two columns are skipped.

    Args:
        lang1: Name for the language in the first column.
        lang2: Name for the language in the second column.
        reverse: If True, swap the columns of every pair and make ``lang2``
            the input language and ``lang1`` the output language.
        file_path: Path of the corpus file. Defaults to 'fra.txt'.

    Returns:
        ``(inputLang, outputLang, pairs)`` where the two languages are fresh
        ``LanguageProcessor`` objects and ``pairs`` is a list of
        ``[input_sentence, output_sentence]`` normalized strings.
        ``(None, None, None)`` if the file is missing, unreadable, empty, or
        contains no usable line; an error message is printed in that case.
    """
    if not os.path.exists(file_path):
        print(f"Error: File '{file_path}' not found.")
        return None, None, None

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read().strip()
    except IOError as e:
        print(f"Error reading file '{file_path}': {e}")
        return None, None, None
    except UnicodeDecodeError as e:
        print(f"Error decoding file '{file_path}': {e}")
        print("Try specifying a different encoding.")
        return None, None, None

    # Check the text itself: ''.split('\n') is [''], which is truthy, so a
    # check on the split result would never detect an empty file.
    if not content:
        print(f"Error: File '{file_path}' is empty.")
        return None, None, None

    pairs = []
    skipped = 0
    try:
        for line in content.split('\n'):
            columns = line.split('\t')
            if len(columns) < 2:
                skipped += 1  # not a sentence pair
                continue
            pairs.append([normalize_string(s) for s in columns[:2]])
    except Exception as e:
        # Kept broad so callers keep receiving the None triple on failure.
        print(f"Error processing lines: {e}")
        return None, None, None

    if skipped:
        print(f"Skipped {skipped} line(s) without two tab-separated columns.")
    if not pairs:
        print(f"Error: File '{file_path}' contains no sentence pairs.")
        return None, None, None

    if reverse:
        # Reverse translation direction: swap the columns and the language roles.
        pairs = [list(reversed(p)) for p in pairs]
        inputLang = LanguageProcessor(lang2)
        outputLang = LanguageProcessor(lang1)
    else:
        inputLang = LanguageProcessor(lang1)
        outputLang = LanguageProcessor(lang2)

    return inputLang, outputLang, pairs

# def filter_pair(p):
#     good_prefixes = (
#         "i am ", "i m ",
#         "he is ", "he s ",
#         "she is ", "she s ",
#         "you are ", "you re "
#     )
#     return (len(p[0].split(' ')) < MAX_LENGTH and 
#             len(p[1].split(' ')) < MAX_LENGTH and 
#             any(p[1].lower().startswith(prefix) for prefix in good_prefixes))

# def filter_pairs(pairs):
#     return [pair for pair in pairs if filter_pair(pair)]

def process_data(lang1, lang2, reverse=False, max_pairs=50000):
    """Load the sentence pairs, trim them, and index the words of both languages.

    Args:
        lang1: Name of the first language, forwarded to ``process_languages``.
        lang2: Name of the second language, forwarded to ``process_languages``.
        reverse: Forwarded to ``process_languages``.
        max_pairs: Keep only the first ``max_pairs`` pairs. Pass ``None`` to
            keep every pair. Defaults to 50000.

    Returns:
        ``(inputLang, outputLang, pairs)`` with both vocabularies populated
        from the kept pairs, or ``(None, None, None)`` if loading failed or
        no pairs remain after trimming.
    """
    inputLang, outputLang, pairs = process_languages(lang1, lang2, reverse)
    if pairs is None:
        return None, None, None

    print(f"Read {len(pairs)} sentence pairs")
    if max_pairs is not None:
        pairs = pairs[:max_pairs]
    print(f"Trimmed pair set to {len(pairs)} sentence pairs")

    if not pairs:
        print("Error: No pairs left after filtering.")
        return None, None, None

    for p in tqdm(pairs, desc="Indexing words"):
        inputLang.index_words(p[0])  # first sentence of the pair -> input vocabulary
        outputLang.index_words(p[1])  # second sentence of the pair -> output vocabulary
    print("Indexed words:")
    print(f"{inputLang.name}: {inputLang.num_words}")
    print(f"{outputLang.name}: {outputLang.num_words}")
    return inputLang, outputLang, pairs

# reverse=True: process_languages swaps each pair and makes 'fra' the input language.
inputLang, outputLang, pairs = process_data('eng', 'fra', True)
# Show one random pair as a quick sanity check, or report the failure.
print(random.choice(pairs) if pairs else "Data processing failed.")
 

Read 229803 sentence pairs
Trimmed pair set to 0 sentence pairs
Error: No pairs left after filtering.
Data processing failed.
