<a href="https://colab.research.google.com/github/Jorgecardetegit/NLP/blob/main/English_French_Translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import cv2

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (Dense,Flatten,SimpleRNN,InputLayer,Conv1D,Bidirectional,GRU,LSTM,BatchNormalization,Dropout,Input, Embedding,TextVectorization)
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam

import tensorflow_datasets as tfds
import tensorflow_probability as tfp

from tensorboard.plugins import projector

In [3]:
import sklearn
from sklearn.metrics import confusion_matrix, roc_curve

In [4]:
import io
import os
import re
import string
import time
import datetime
import pathlib

In [5]:
from google.colab import drive
from google.colab import files

# Dataset

### Manythings dataset

In [6]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2023-10-21 13:28:41--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7757635 (7.4M) [application/zip]
Saving to: ‘fra-eng.zip.1’


2023-10-21 13:28:42 (11.4 MB/s) - ‘fra-eng.zip.1’ saved [7757635/7757635]



In [7]:
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

Archive:  /content/fra-eng.zip
replace /content/dataset/_about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [8]:
text_dataset=tf.data.TextLineDataset("/content/dataset/fra.txt")

### Kaggle dataset

In [9]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d dhruvildave/en-fr-translation-dataset

mkdir: cannot create directory ‘/root/.kaggle’: File exists
cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.10/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 403, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [10]:
!unzip "/content/en-fr-translation-dataset.zip" -d "/content/dataset/"

unzip:  cannot find or open /content/en-fr-translation-dataset.zip, /content/en-fr-translation-dataset.zip.zip or /content/en-fr-translation-dataset.zip.ZIP.


In [11]:
dataset = tf.data.experimental.CsvDataset("/content/dataset/en-fr.csv",
 [tf.string,tf.string],) # List that specifies the data types of the columns in the CSV file


# Basic EDA

In [8]:
text_dataset=tf.data.TextLineDataset("/content/dataset/fra.txt")

The tf.data.TextLineDataset reads lines from the text file lazily, which means it doesn't load the entire file into memory. As a result, there's no direct method on the TextLineDataset object to get the total number of lines (or length) immediately.

In [9]:
count_text_dataset = 0
for _ in text_dataset:
    count_text_dataset += 1

In [11]:
print(f"The dataset has a total of {count_text_dataset} terms")

The dataset has a total of 227815 terms


Now lets find the number of words in each element and get some descriptive statistics.

In [None]:
word_counts = [len(tf.strings.split(i, " ")) for i in text_dataset]

In [None]:
import numpy as np

max_len = np.max(word_counts)
mean_len = np.mean(word_counts)
median_len = np.median(word_counts)
percentile_95 = np.percentile(word_counts, 95)
percentile_99 = np.percentile(word_counts, 99)

In [None]:
import matplotlib.pyplot as plt

plt.hist(word_counts, bins=50, edgecolor='k')
plt.axvline(mean_len, color='r', linestyle='dashed', linewidth=1, label='Mean')
plt.axvline(median_len, color='g', linestyle='dashed', linewidth=1, label='Median')
plt.axvline(percentile_95, color='b', linestyle='dashed', linewidth=1, label='95th Percentile')
plt.legend()
plt.title("Distribution of Sequence Lengths")
plt.xlabel("Sequence Length")
plt.ylabel("Frequency")
plt.show()

# Data processing

## Preprocessing class

In [12]:
VOCAB_SIZE = 20000
ENGLISH_SEQUENCE_LENGTH = 64
FRENCH_SEQUENCE_LENGTH = 64
EMBEDDING_DIM = 300
BATCH_SIZE = 64

english_vectorize_layer = TextVectorization(
  standardize='lower_and_strip_punctuation',                     # Convert to lowercase and remove punctuation
  max_tokens= VOCAB_SIZE,                                        # Vocab contains 20000 terms and additional tokens will be treated as out-of-vocabulary (OOV)
  output_mode='int',                                             # Tokens will be represented as an integer corresponding to the token's index in the vocabulary
  output_sequence_length = ENGLISH_SEQUENCE_LENGTH)              # Sequence length of 64 (same as the french dataset sequence length)

french_vectorize_layer = TextVectorization(                      # It´s important to include a layer for each category even it seems both layers are doing the same. Later on we will have to adapt each layer to its class
  standardize='lower_and_strip_punctuation',
  max_tokens= VOCAB_SIZE,
  output_mode='int',
  output_sequence_length = FRENCH_SEQUENCE_LENGTH)

In [13]:
class preprocessing:

  def __init__(self):
    self.VOCAB_SIZE = VOCAB_SIZE
    self.FRENCH_SEQUENCE_LENGTH = FRENCH_SEQUENCE_LENGTH
    self.ENGLISH_SEQUENCE_LENGTH = ENGLISH_SEQUENCE_LENGTH
    self.EMBEDDING_DIM = EMBEDDING_DIM
    self.BATCH_SIZE = BATCH_SIZE

    self.english_vectorize_layer = english_vectorize_layer
    self.french_vectorize_layer = french_vectorize_layer

  def selector(self, input_text):
    split_text=tf.strings.split(input_text,'\t')
    return {'input_1':split_text[0:1],'input_2':'starttoken '+split_text[1:2]},split_text[1:2]+' endtoken'

  def separator(self, input_text):
    split_text=tf.strings.split(input_text,'\t')
    return split_text[0:1],'starttoken '+split_text[1:2]+' endtoken'

  def vectorizer(self, inputs, output):
    return {'input_1':english_vectorize_layer(inputs['input_1']),
            'input_2':french_vectorize_layer(inputs['input_2'])},french_vectorize_layer(output)

prep_object = preprocessing()

## 1. Split dataset

In [14]:
split_dataset = text_dataset.map(prep_object.selector)
init_dataset = text_dataset.map(prep_object.separator)    # Separate french and english label.

In [15]:
for i in split_dataset.take(1): print(i)
print(" ")
for i in text_dataset.take(1): print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
 
tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)


## 2. Divide the dataset

In [16]:
english_training_data=init_dataset.map(lambda x,y:x)
french_training_data=init_dataset.map(lambda x,y:y)

## 3. Adapt the vectorize layers

In [17]:
english_vectorize_layer.adapt(english_training_data)
french_vectorize_layer.adapt(french_training_data)

## 4. Vectorize the dataset

In [18]:
dataset = split_dataset.map(prep_object.vectorizer)

## 5. Shuffle and Prefetch

In [19]:
dataset=dataset.shuffle(2048).unbatch().batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

NUM_BATCHES=int(200000/BATCH_SIZE)

## 6. Split in training and validation

In [20]:
train_dataset=dataset.take(int(0.9*NUM_BATCHES))
val_dataset=dataset.skip(int(0.9*NUM_BATCHES))

# Model architecture

## Seq2Seq (GRU)

In [21]:
NUM_UNITS = 256

### Encoder
input = Input(shape=(ENGLISH_SEQUENCE_LENGTH,), dtype="int64", name="input_1")
x=Embedding(VOCAB_SIZE, EMBEDDING_DIM, )(input)
encoded_input=Bidirectional(GRU(NUM_UNITS), )(x)

### DECODER
shifted_target=Input(shape=(FRENCH_SEQUENCE_LENGTH,), dtype="int64", name="input_2")
x=Embedding(VOCAB_SIZE,EMBEDDING_DIM,)(shifted_target)
x = GRU(NUM_UNITS*2, return_sequences=True)(x, initial_state=encoded_input)

### OUTPUT
x = Dropout(0.5)(x)
target=Dense(VOCAB_SIZE,activation="softmax")(x)
seq2seq_gru=Model([input,shifted_target],target)

In [22]:
seq2seq_gru.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 64, 300)              6000000   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 64, 300)              6000000   ['input_2[0][0]']             
                                                                                              

# Model training

## Bleu scoring class

In [23]:
class BLEU(tf.keras.metrics.Metric):
    def __init__(self,name='bleu_score'):
        super(BLEU,self).__init__()
        self.bleu_score=0

    def update_state(self,y_true,y_pred,sample_weight=None):
      y_pred=tf.argmax(y_pred,-1)
      self.bleu_score=0
      for i,j in zip(y_pred,y_true):
        tf.autograph.experimental.set_loop_options()

        total_words=tf.math.count_nonzero(i)
        total_matches=0
        for word in i:
          if word==0:
            break
          for q in range(len(j)):
            if j[q]==0:
              break
            if word==j[q]:
              total_matches+=1
              j=tf.boolean_mask(j,[False if y==q else True for y in range(len(j))])
              break

        self.bleu_score+=total_matches/total_words

    def result(self):
        return self.bleu_score/BATCH_SIZE

In [24]:
seq2seq_gru.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),     # Used because outputs are not one hot representations
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=[BLEU(), "accuracy"],
    run_eagerly=True)                                         # Method has to be set to eager excecution.

In [None]:
history=seq2seq_gru.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5)

Epoch 1/5


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: not enough values to unpack (expected 2, got 0)


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: not enough values to unpack (expected 2, got 0)
    588/Unknown - 986s 2s/step - loss: 1.2424 - bleu: 0.2365 - accuracy: 0.9221

# Model evaluation

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model_loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])

plt.title('model_accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

## Testing

In [None]:
index_to_word={x:y for x, y in zip(range(len(french_vectorize_layer.get_vocabulary())),
                                   french_vectorize_layer.get_vocabulary())}

In [None]:
def translator(english_sentence):
  tokenized_english_sentence=english_vectorize_layer([english_sentence])
  shifted_target='starttoken'

  for i in range(FRENCH_SEQUENCE_LENGTH):
    tokenized_shifted_target=french_vectorize_layer([shifted_target])
    output=seq2seq_gru.predict([tokenized_english_sentence,tokenized_shifted_target])
    french_word_index=tf.argmax(output,axis=-1)[0][i].numpy()
    current_word=index_to_word[french_word_index]
    if current_word=='endtoken':
      break
    shifted_target+=' '+current_word
  return shifted_target[11:]