In [1]:
"""Clean the text by removing punctuation symbols and numbers, converting
characters to lowercase, and replacing Unicode characters with their ASCII
equivalents. For the French samples, insert [start] and [end] tokens at the
 beginning and end of each phrase"""
import pandas as pd
import re
from unicodedata import normalize

df = pd.read_csv('Data/en-fr.txt', names=['en', 'fr', 'attr'], usecols=['en', 'fr'], sep='\t')
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)
df.head()

def clean_text(text):
    text = normalize('NFD', text.lower())
    text = re.sub('[^A-Za-z ]+', '', text)
    return text

def clean_and_prepare_text(text):
    text = '[start] ' + clean_text(text) + ' [end]'
    return text

df['en'] = df['en'].apply(lambda row: clean_text(row))
df['fr'] = df['fr'].apply(lambda row: clean_and_prepare_text(row))
df.head()

Unnamed: 0,en,fr
0,youre very clever,[start] vous etes fort ingenieuse [end]
1,are there kids,[start] y atil des enfants [end]
2,come in,[start] entrez [end]
3,wheres boston,[start] ou est boston [end]
4,you see what i mean,[start] vous voyez ce que je veux dire [end]


In [2]:
"""The next step is to scan the phrases and determine the maximum length of the
English phrases and then of the French phrases. These lengths will determine
the lengths of the sequences input to and output from the model"""
en = df['en']
fr = df['fr']

en_max_len = max(len(line.split()) for line in en)
fr_max_len = max(len(line.split()) for line in fr)
sequence_len = max(en_max_len, fr_max_len)

print(f'Max phrase length (English): {en_max_len}')
print(f'Max phrase length (French): {fr_max_len}')
print(f'Sequence length: {sequence_len}')

Max phrase length (English): 7
Max phrase length (French): 16
Sequence length: 16


In [3]:
import torch
from torchnlp.encoders.text import TreebankEncoder
from torchnlp.encoders.text import pad_tensor, stack_and_pad_tensors
from torchnlp.encoders.text import StaticTokenizerEncoder

def french_tokenize(text):
    text = text.replace('!', ' ').replace('"', ' ').replace('#', ' ') \
            .replace('$', ' ').replace('%', ' ').replace('&', ' ') \
            .replace('(', ' ').replace(')', ' ').replace('*', ' ') \
            .replace('+', ' ').replace(',', ' ').replace('-', ' ') \
            .replace('.', ' ').replace('/', ' ').replace(':', ' ') \
            .replace(';', ' ').replace('<', ' ').replace('=', ' ') \
            .replace('>', ' ').replace('?', ' ').replace('@', ' ') \
            .replace('\\', ' ').replace('^', ' ').replace('_', ' ') \
            .replace('`', ' ').replace('{', ' ').replace('|', ' ') \
            .replace('}', ' ').replace('~', ' ').replace('\t', ' ') \
            .replace('\n', ' ')
    return text.split()


def custom_pad_sequences(sequences, max_len, padding_value=0):
    padded_seqs = []
    for seq in sequences:
        if len(seq) < max_len:
            # Pad the sequence
            padded = torch.cat([seq, torch.tensor([padding_value] * (max_len - len(seq)), dtype=torch.long)])
        else:
            # Truncate if longer than max_len
            padded = seq[:max_len]
        padded_seqs.append(padded)
    return torch.stack(padded_seqs)


In [4]:
entokenizer = TreebankEncoder(en)
frtokenizer = StaticTokenizerEncoder(fr, tokenize=french_tokenize, append_eos=False, reserved_tokens=['<pad>'])


[nltk_data] Downloading package perluniprops to
[nltk_data]     /hpc/home/ma618/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /hpc/home/ma618/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


In [8]:
en_sequences = [torch.tensor(entokenizer.encode(sentence)) for sentence in en]
fr_sequences = [torch.tensor(frtokenizer.encode(sentence)) for sentence in fr]

# Pad the sequences to the desired length
en_x = custom_pad_sequences(en_sequences, sequence_len, padding_value=0)
fr_y = custom_pad_sequences(fr_sequences, sequence_len + 1, padding_value=0)

  en_sequences = [torch.tensor(entokenizer.encode(sentence)) for sentence in en]
  fr_sequences = [torch.tensor(frtokenizer.encode(sentence)) for sentence in fr]


In [9]:
"""Compute the vocabulary sizes from the Tokenizer instances"""
en_vocab_size = len(entokenizer.vocab) + 1
fr_vocab_size = len(frtokenizer.vocab) + 1

print(f'Vocabulary size (English): {en_vocab_size}')
print(f'Vocabulary size (French): {fr_vocab_size}')

Vocabulary size (English): 6038
Vocabulary size (French): 12198


In [10]:
"""Finally, create the features and the labels the model will be trained with.
The features are the padded English sequences and the padded French sequences
minus the [end] tokens. The labels are the padded French sequences minus the
[start] tokens. Package the features in a dictionary so they can be input to a
model that accepts multiple inputs."""
inputs = { 'encoder_input': en_x, 'decoder_input': fr_y[:, :-1] }
outputs = fr_y[:, 1:]

In [21]:
import importlib
import transformer
import numpy as np