In [1]:
import random
import re
import unicodedata
from collections.abc import Sequence

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torchtext
from torchtext.data.metrics import bleu_score

In [2]:
# Download NLTK's 'punkt' resource; presumably required by the
# nltk.word_tokenize calls used later in this notebook (TODO confirm for the
# installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/shri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Pick the compute device in priority order: MPS first, then CUDA, and CPU
# when neither backend reports itself as available.
DEVICE = torch.device(
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)

In [5]:
# Load the train and test CSV files into data frames.
train_df = pd.read_csv(filepath_or_buffer='data/train_full.csv')
test_df = pd.read_csv(filepath_or_buffer='data/test_full.csv')

In [6]:
# Discard every row that has a missing value in any column.
train_df = train_df.dropna()
test_df = test_df.dropna()

### Preprocessing

In [7]:
def preprocessing_text(x: str) -> str:
  """Normalize a sentence to lowercase ASCII letters plus '!' and '?'.

  Steps, in order:
    1. Lowercase and strip surrounding whitespace.
    2. Strip diacritics (NFD-decompose, drop combining marks) so accented
       letters keep their base letter, e.g. 'é' -> 'e', instead of being
       erased by the ASCII-only filter in step 4.
    3. Insert a space before '.', '!' and '?' so they become separate tokens.
    4. Replace every run of characters other than a-z, A-Z, '!' and '?' with
       a single space. This also removes the '.' separated in step 3, as
       well as digits and any character with no ASCII base letter.

  Args:
      x (str): Input String

  Returns:
      str: Processed String
  """
  x = x.lower().strip()
  # Without this step the filter below would turn "été" into "t".
  x = ''.join(
      char for char in unicodedata.normalize('NFD', x)
      if unicodedata.category(char) != 'Mn'
  )
  x = re.sub(r"([.!?])", r" \1", x)
  x = re.sub(r"[^a-zA-Z!?]+", r" ", x)
  return x.strip()

In [8]:
# Normalize every sentence of both language columns, train split first.
train_df['English'] = train_df['English'].map(preprocessing_text)
train_df['French'] = train_df['French'].map(preprocessing_text)

# Same normalization for the test split.
test_df['English'] = test_df['English'].map(preprocessing_text)
test_df['French'] = test_df['French'].map(preprocessing_text)

In [9]:
MAX_SEQ_LENGTH = 25

def get_max_seq_length_data(data_frame: pd.DataFrame) -> pd.DataFrame:
  """Keep only sentence pairs whose token counts are within the length limit.

  A pair is kept when both its English and its French sentence tokenize
  (with nltk.word_tokenize) to between 1 and MAX_SEQ_LENGTH tokens inclusive.

  Args:
      data_frame (pd.DataFrame): Input data frame with 'English' and 'French'
          string columns.

  Returns:
      pd.DataFrame: New data frame with 'English' and 'French' columns holding
          only the pairs that pass the length check, in their original order
          and with a fresh default index.
  """
  def _within_limit(sentence: str) -> bool:
    # Tokenize once per sentence and check both bounds in one comparison.
    return 1 <= len(nltk.word_tokenize(sentence)) <= MAX_SEQ_LENGTH

  english = []
  french = []
  for english_sentence, french_sentence in zip(data_frame['English'], data_frame['French']):
    if _within_limit(english_sentence) and _within_limit(french_sentence):
      english.append(english_sentence)
      french.append(french_sentence)
  return pd.DataFrame({'English': english, 'French': french})

In [10]:
# Drop sentence pairs that fall outside the allowed token-count range.
train_df = get_max_seq_length_data(data_frame=train_df)
test_df = get_max_seq_length_data(data_frame=test_df)

In [11]:
class Language:
  """Vocabulary of one language: word/index lookups and word frequencies.

  Indices 0-3 are reserved for the special tokens <PAD>, <SOS>, <EOS> and
  <UNK>. Every other word receives the next free index the first time it is
  added.
  """
  def __init__(self, language_name: str) -> None:
    """Initializing the language

    Args:
        language_name (str): language name
    """
    self.language_name = language_name
    self.word2index = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
    self.word2count = {}
    # Derive the reverse map and the size from word2index so the three
    # cannot drift apart.
    self.index2word = {index: word for word, index in self.word2index.items()}
    self.n_words = len(self.word2index)

  def addSentence(self, sentence: str) -> None:
    """Tokenize a sentence with nltk.word_tokenize and add every token.

    Args:
        sentence (str): Sentence whose words are added to the corpus.
    """
    for word in nltk.word_tokenize(sentence):
      self.addWord(word)

  def addWord(self, word: str) -> None:
    """Register one occurrence of a word.

    A word seen for the first time is assigned the next free index. Its
    occurrence count is incremented on every call.

    Args:
        word (str): Input word that we want to add to corpus.
    """
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.index2word[self.n_words] = word
      self.n_words += 1

    # Reserved tokens are in word2index from the start but have no count
    # entry yet, so a plain `+= 1` would raise KeyError for them.
    self.word2count[word] = self.word2count.get(word, 0) + 1

In [12]:
# One vocabulary object per language of the translation pair.
english = Language(language_name='english')
french = Language(language_name='french')

In [13]:
# Build both vocabularies from the training sentences only.
for row in train_df.itertuples(index=False):
  english.addSentence(row.English)
  french.addSentence(row.French)

In [14]:
def create_data_from_vocab(sentence: str, language_corpus: 'Language') -> list[int]:
  """Encode a sentence as vocabulary indices terminated by <EOS>.

  The sentence is tokenized with nltk.word_tokenize. Words missing from the
  vocabulary are encoded with the index of '<UNK>'.

  Args:
      sentence (str): Input sentence that needs to be processed.
      language_corpus (Language): Reference language corpus; its word2index
          mapping must contain '<UNK>' and '<EOS>'.

  Returns:
      list[int]: Sentence vector: one index per token, followed by the
          '<EOS>' index.
  """
  word2index = language_corpus.word2index
  # Out-of-vocabulary words map to <UNK>, not to the hard-coded 1 (= <SOS>).
  unknown_index = word2index['<UNK>']
  sentence_vector = [
      word2index.get(word, unknown_index)
      for word in nltk.word_tokenize(sentence)
  ]
  sentence_vector.append(word2index['<EOS>'])

  return sentence_vector

In [15]:
# Replace each sentence with its list of vocabulary indices. Both splits are
# encoded against the vocabularies built from the training data.
train_df['English'] = train_df['English'].apply(create_data_from_vocab, args=(english,))
train_df['French'] = train_df['French'].apply(create_data_from_vocab, args=(french,))

test_df['English'] = test_df['English'].apply(create_data_from_vocab, args=(english,))
test_df['French'] = test_df['French'].apply(create_data_from_vocab, args=(french,))

In [16]:
def create_sentence_vectors(data_frame: pd.DataFrame) -> tuple:
  """Split a data frame into parallel source/target sequences sorted by length.

  Pairs are ordered by ascending length of the English entry. Pairs with
  equal English length keep their original relative order, because sorted()
  is stable.

  Args:
      data_frame (pd.DataFrame): Data frame with 'English' and 'French'
          columns.

  Returns:
      tuple: (english, french), two tuples of equal length in which position
          i of both belongs to the same pair. An empty data frame yields
          ((), ()) so the result can always be unpacked into two names.
  """
  sorted_pairs = sorted(
      zip(data_frame['English'], data_frame['French']),
      key=lambda pair: len(pair[0]),
  )
  if not sorted_pairs:
    # zip(*[]) yields nothing, which would break `x, y = ...` at the caller.
    return (), ()

  english, french = zip(*sorted_pairs)
  return english, french

In [17]:
# Separate each split into length-sorted source and target sequences.
train_x, train_y = create_sentence_vectors(data_frame=train_df)
test_x, test_y = create_sentence_vectors(data_frame=test_df)

#### Create Datasets and DataLoaders

In [18]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset of parallel (source, target) sequences accessed by index.
    """
    def __init__(self, src_text: Sequence, tgt_text: Sequence):
        """Initiate Custom dataset.

        Args:
            src_text (Sequence): source sequences, one entry per example
            tgt_text (Sequence): target sequences, aligned with src_text by
                position

        Raises:
            ValueError: if src_text and tgt_text differ in length
        """
        # Misaligned inputs would otherwise surface later as an IndexError
        # in the middle of an epoch, or silently hide trailing targets.
        if len(src_text) != len(tgt_text):
            raise ValueError(
                f"src_text and tgt_text must have the same length, "
                f"got {len(src_text)} and {len(tgt_text)}"
            )
        self.src_text = src_text
        self.tgt_text = tgt_text

    def __getitem__(self, index: int) -> tuple:
        """get an item

        Args:
            index (int): index number

        Returns:
            tuple: tuple of src text and target text
        """
        return self.src_text[index], self.tgt_text[index]

    def __len__(self) -> int:
        """Returns length of the dataset.

        Returns:
            int: length of the data set
        """
        return len(self.src_text)

In [19]:
# Wrap the parallel sequences of each split in a dataset object.
train_dataset = CustomDataset(src_text=train_x, tgt_text=train_y)
test_dataset = CustomDataset(src_text=test_x, tgt_text=test_y)

In [20]:
def custom_collate_fn(batch):
    """Collate (source, target) index sequences into two padded tensors.

    Source and target are padded independently: each side is padded to the
    longest sequence on that side of the batch, using the '<PAD>' index of
    the module-level `english` (source) and `french` (target) vocabularies.

    Args:
        batch: Iterable of (source_indices, target_indices) pairs, each a
            sequence of ints.

    Returns:
        tuple: (padded_src_sequences, padded_tgt_sequences), two tensors with
            the batch dimension first.
    """
    # Separate the source and target texts
    src_texts, tgt_texts = zip(*batch)

    # Explicit integer dtype so the tensors do not depend on type inference.
    src_tensors = [torch.tensor(seq, dtype=torch.long) for seq in src_texts]
    tgt_tensors = [torch.tensor(seq, dtype=torch.long) for seq in tgt_texts]

    # pad_sequence pads to the longest sequence itself, so no maximum length
    # needs to be computed beforehand.
    padded_src_sequences = torch.nn.utils.rnn.pad_sequence(src_tensors, batch_first=True, padding_value=english.word2index['<PAD>'])
    padded_tgt_sequences = torch.nn.utils.rnn.pad_sequence(tgt_tensors, batch_first=True, padding_value=french.word2index['<PAD>'])

    return padded_src_sequences, padded_tgt_sequences


In [21]:
# Number of sentence pairs per batch.
batch_size = 64
# Batches are drawn in dataset order (no shuffling is requested) and padded
# by custom_collate_fn.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, collate_fn=custom_collate_fn
)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=batch_size, collate_fn=custom_collate_fn
)