<a href="https://colab.research.google.com/github/MariamAtefMah/Colab-ML-Project/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Sequence Modeling**


*NLP (Natural Language Processing)*
  The main layers used in most sequence-modeling projects:
  _SimpleRNN(): Recurrent neural network.
  _Embedding(): maps integer token ids to dense vectors.
  _LSTM(): Long Short-Term Memory.
  _GRU(): Gated Recurrent Unit.
  _Bidirectional(): wrapper that processes the sequence in both directions, from left to right and vice versa.


Main Architecture:
  _One to many, e.g. image captioning.
  _Many to one, e.g. sentiment classification.
  _Many to many has two cases:
    _input length equals output length, e.g. named entity recognition.
    _input length does not equal output length, e.g. machine translation.

In [8]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection
import re # For regular expression, use it when you search about particular pattern like phone numbers.
import tqdm #provides a simple and convenient way to add progress bars to loops and iterable objects.

In [3]:
# Upload kaggle.json (the Kaggle API token) from the local machine into the Colab session.
from google.colab import files

# Assign the result instead of leaving the call as the cell's last expression:
# the returned dict maps each file name to its raw contents, so echoing it
# would print the API key into the saved notebook output.
uploaded = files.upload()
print('Uploaded:', list(uploaded))

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"mariamatefmah","key":"<REDACTED>"}'}

**Machine Translation**

In [4]:
#Those are the basic steps to download data from kaggle.
! mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [5]:
# Kaggle API command that downloads the English-Spanish translation dataset archive.
!kaggle datasets download -d lonnieqin/englishspanish-translation-dataset
# The zip file is saved in the /content folder.

Downloading englishspanish-translation-dataset.zip to /content
  0% 0.00/2.72M [00:00<?, ?B/s]
100% 2.72M/2.72M [00:00<00:00, 134MB/s]


In [6]:
! unzip /content/englishspanish-translation-dataset.zip

Archive:  /content/englishspanish-translation-dataset.zip
  inflating: data.csv                


In [9]:
# Load the extracted CSV into a DataFrame and display it.
data_df = pd.read_csv('/content/data.csv')
data_df

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
118959,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
118960,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
118961,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
118962,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


In [10]:
# Normalise a raw sentence before tokenisation.
def clean_text(text):
  """Lower-case ``text`` and strip everything that is not a plain word.

  Steps, in order:
    1. lower-case the text;
    2. drop anything inside square brackets;
    3. drop URLs (``http(s)://...`` and ``www....``);
    4. drop anything inside angle brackets (HTML-like tags);
    5. replace every punctuation character with a space;
    6. drop tokens that contain a digit;
    7. collapse runs of whitespace (including newlines) to one space and
       trim the ends.

  Args:
    text: the sentence to clean (a ``str``).

  Returns:
    The cleaned sentence as a ``str``; may be empty.
  """
  text = text.lower()
  # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
  # invalid Python string escapes.
  text = re.sub(r'\[.*?\]', '', text)
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  text = re.sub(r'<.*?>', '', text)
  text = re.sub(r'[^\w\s]', ' ', text)
  text = re.sub(r'\w*\d\w*', '', text)
  # Newlines are whitespace, so they become a single space here instead of
  # being deleted (which would glue the neighbouring words together). This
  # also removes the stray spaces left behind by the substitutions above.
  return re.sub(r'\s+', ' ', text).strip()

In [11]:
#function to pass on every element in the table of english word to clean its text, using clean_text function we created.
'''
data_df['english'] = data_df['english'].apply(clean_text)
data_df['spanish'] = data_df['spanish'].apply(clean_text)
'''
# Clean both language columns, overwriting the raw sentences in data_df.
data_df['english'] = data_df['english'].map(clean_text)
data_df['spanish'] = data_df['spanish'].map(clean_text)


In [12]:
# Display the DataFrame after cleaning.
data_df


Unnamed: 0,english,spanish
0,go,ve
1,go,vete
2,go,vaya
3,go,váyase
4,hi,hola
...,...,...
118959,there are four main causes of alcohol related ...,hay cuatro causas principales de muertes relac...
118960,there are mothers and fathers who will lie awa...,hay madres y padres que se quedan despiertos d...
118961,a carbon footprint is the amount of carbon dio...,una huella de carbono es la cantidad de contam...
118962,since there are usually multiple websites on a...,como suele haber varias páginas web sobre cual...


In [13]:
# Mark where a sentence begins and ends.
def add_start_end(text):
  """Return ``text`` wrapped as '<start> {text} <end>'."""
  return ' '.join(('<start>', f'{text}', '<end>'))

# Wrap every sentence of both languages in the <start>/<end> markers.
# NOTE: not idempotent - re-running this cell wraps the sentences a second time.
data_df['english'] = data_df['english'].map(add_start_end)
data_df['spanish'] = data_df['spanish'].map(add_start_end)

In [None]:
# Display the DataFrame after the <start>/<end> markers were added.
data_df

Unnamed: 0,english,spanish
0,<start> go <end>,<start> ve <end>
1,<start> go <end>,<start> vete <end>
2,<start> go <end>,<start> vaya <end>
3,<start> go <end>,<start> váyase <end>
4,<start> hi <end>,<start> hola <end>
...,...,...
118959,<start> there are four main causes of alcohol ...,<start> hay cuatro causas principales de muert...
118960,<start> there are mothers and fathers who will...,<start> hay madres y padres que se quedan desp...
118961,<start> a carbon footprint is the amount of ca...,<start> una huella de carbono es la cantidad d...
118962,<start> since there are usually multiple websi...,<start> como suele haber varias páginas web so...


In [14]:
# Fit a Keras tokenizer on one language and encode its sentences as padded id sequences.
def Tokenize(lang):
  """Encode a collection of sentences as a padded integer matrix.

  Args:
    lang: the sentences of one language (e.g. a DataFrame column of strings).

  Returns:
    A ``(tensor, tokenizer)`` pair: the post-padded id sequences produced by
    ``pad_sequences`` and the fitted ``Tokenizer``. The padded sequences are
    also stored on the tokenizer as ``tokenizer.sequences``.
  """
  # '<' and '>' are left out of the filter characters so that the
  # <start>/<end> markers are not stripped from the text.
  # Words missing from the vocabulary map to the '<OOV>' token.
  tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
      oov_token='<OOV>',
  )
  tokenizer.fit_on_texts(lang)

  # Turn each sentence into a list of ids, then pad at the end ('post') so
  # that all rows have the same length.
  id_sequences = tokenizer.texts_to_sequences(lang)
  padded = tf.keras.preprocessing.sequence.pad_sequences(id_sequences, padding='post')

  # Keep the padded sequences on the tokenizer object as well.
  tokenizer.sequences = padded
  return padded, tokenizer

In [15]:
# Encode each language column as a padded matrix of token ids and keep the
# fitted tokenizer for each language.
eng_sequence, eng_tokenizer = Tokenize(data_df.english)
sp_sequence, sp_tokenizer = Tokenize(data_df.spanish)


In [None]:
# Display the padded English id sequences.
eng_sequence



**Store and Split data to train it**

In [16]:
# Hold out 20% of the sentence pairs for testing.
# x_* holds the English id sequences (inputs), y_* the Spanish ones (targets).
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    eng_sequence,
    sp_sequence,
    test_size=0.2,
    random_state=42,
)
x_train.shape, y_train.shape

((95171, 49), (95171, 51))

In [17]:
#Another way to write convert func.
'''
def convert(lang, tensor):
  converted_data = [] #to store each word as an element in a list
  for t in tensor:
    if t !=0:
      converted_data.append(lang.index_word[t])
      #this line to create spaces before printing words.
  return converted_data

print('English')
english_data = convert(eng_tokenizer, x_train[0])
print(english_data)
print() #to print line between two languages.
print('Spanish')
spanish_data = convert(sp_tokenizer, y_train[0])
print(spanish_data)
'''

# Print the id ---> word mapping of one encoded sentence.
def convert(lang, tensor):
  """Print one 'id---> word' line for every non-zero id in ``tensor``.

  Args:
    lang: a fitted tokenizer exposing an ``index_word`` mapping (id -> word).
    tensor: an iterable of token ids for a single sentence.
  """
  for token_id in tensor:
    if token_id == 0:
      # Zero entries (the padding value in the encoded sequences) are skipped.
      continue
    print('%d---> %s' % (token_id, lang.index_word[token_id]))

# Show the first training pair as id ---> word lines, English first.
print('English')
convert(eng_tokenizer, x_train[0])
print() # blank line between the two languages.
print('Spanish')
convert(sp_tokenizer, y_train[0])




English
2---> <start>
4---> i
20---> have
73---> no
739---> choice
42---> at
59---> all
3---> <end>

Spanish
2---> <start>
7---> no
46---> tengo
177---> otra
1350---> opción
11---> en
1694---> absoluto
3---> <end>


In [18]:
# Prepare the parameters of the model.
vocab_inp_size = len(eng_tokenizer.word_index)+1 # English (input) vocabulary size; +1 presumably covers id 0, used for padding
vocab_tar_size = len(sp_tokenizer.word_index)+1 # Spanish (output/target) vocabulary size
embidding_dim = 256 # size of each embedding vector (passed to the Embedding layers)
units = 1024 # number of GRU units in the encoder and the decoder
batch_size = 32 # sentence pairs per batch

In [19]:
# Build a batched tf.data pipeline from paired input/target arrays.
def create_dataset(x, y, batch_size=32, shuffle_buffer=1028, drop_remainder=True):
  """Create a shuffled, batched and prefetched ``tf.data.Dataset`` of (x, y) pairs.

  Args:
    x: input sequences, one row per example.
    y: target sequences, aligned row-by-row with ``x``.
    batch_size: number of examples per batch.
    shuffle_buffer: size of the shuffle buffer; examples are drawn at random
      from a sliding buffer of this many elements. Pass 0 or None to keep the
      original order.
    drop_remainder: when True, a final batch smaller than ``batch_size`` is
      dropped so that every batch has exactly ``batch_size`` rows.

  Returns:
    A ``tf.data.Dataset`` yielding ``(x_batch, y_batch)`` tuples.
  """
  data = tf.data.Dataset.from_tensor_slices((x, y))

  if shuffle_buffer:
    data = data.shuffle(shuffle_buffer)

  # The encoder's initial hidden state is created with a fixed batch_size, so
  # by default a smaller final batch is dropped rather than yielded.
  data = data.batch(batch_size, drop_remainder=drop_remainder)

  # Prepare upcoming batches in the background while the current one is used.
  return data.prefetch(tf.data.AUTOTUNE)

# Build the batched training and test pipelines from the split arrays.
train_dataset = create_dataset(x_train, y_train)
test_dataset = create_dataset(x_test, y_test)

In [20]:
# Print the shape and contents of the first training batch.
# NOTE: `eng` and `sp` stay defined after the loop; a later cell passes `eng`
# to the encoder as a sample batch.
for eng, sp in train_dataset.take(1):
  print(f'english:{eng.shape}\n {eng}')
  print(f'spanish:{sp.shape}\n {sp}')


english:(32, 49)
 [[   2    8   49 ...    0    0    0]
 [   2    6  136 ...    0    0    0]
 [   2    4   37 ...    0    0    0]
 ...
 [   2    4  115 ...    0    0    0]
 [   2   46 6400 ...    0    0    0]
 [   2    4   89 ...    0    0    0]]
spanish:(32, 51)
 [[    2     8    15 ...     0     0     0]
 [    2    29  4018 ...     0     0     0]
 [    2    45     5 ...     0     0     0]
 ...
 [    2  1163    19 ...     0     0     0]
 [    2 18148    32 ...     0     0     0]
 [    2   150     9 ...     0     0     0]]


*Subclassing API*

In [21]:
class Endcoder(tf.keras.Model):
  """Encoder: an Embedding layer followed by a single GRU layer."""

  def __init__(self, vocab_size, embidding_dim, encoder_units, batch_size):
    """Create the encoder layers.

    Args:
      vocab_size: number of rows of the embedding table.
      embidding_dim: size of each embedding vector.
      encoder_units: number of GRU units.
      batch_size: batch size used when building the zero initial state.
    """
    super().__init__()

    self.batch_size = batch_size
    self.encoder_units = encoder_units
    self.embidding = tf.keras.layers.Embedding(
        vocab_size, embidding_dim, mask_zero=True)
    # return_sequences gives the output of every time step, return_state
    # additionally gives the final hidden state.
    self.gru = tf.keras.layers.GRU(
        self.encoder_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

  def call(self, x, hidden):
    """Embed the token ids ``x`` and run them through the GRU.

    Args:
      x: batch of token-id sequences.
      hidden: initial hidden state for the GRU.

    Returns:
      ``(output, next_state)``: the per-step GRU outputs and the final state.
    """
    embedded = self.embidding(x)
    output, next_state = self.gru(embedded, initial_state=hidden)
    return output, next_state

  def initialize_hidden_state(self):
    """Return an all-zero hidden state of shape (batch_size, encoder_units)."""
    state_shape = (self.batch_size, self.encoder_units)
    return tf.zeros(state_shape)


In [22]:
class Decoder(tf.keras.Model):
  """Decoder: Embedding -> GRU -> Dense projection onto the vocabulary."""

  def __init__(self, vocab_size, embidding_dim, decoder_units, batch_size):
    """Create the decoder layers.

    Args:
      vocab_size: number of rows of the embedding table and width of the
        final Dense layer.
      embidding_dim: size of each embedding vector.
      decoder_units: number of GRU units.
      batch_size: stored on the instance; not used by ``call``.
    """
    super().__init__()

    self.batch_size = batch_size
    self.decoder_units = decoder_units
    self.embidding = tf.keras.layers.Embedding(
        vocab_size, embidding_dim, mask_zero=True)
    self.gru = tf.keras.layers.GRU(
        self.decoder_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )
    # No activation is given, so this layer outputs raw scores, one per
    # vocabulary entry.
    self.final = tf.keras.layers.Dense(vocab_size)

  def call(self, x, hidden):
    """Run one decoding pass.

    Args:
      x: batch of token-id sequences fed to the decoder.
      hidden: initial hidden state for the GRU.

    Returns:
      ``(output, hidden)``: the Dense-layer scores for every (example, step)
      row and the final GRU state.
    """
    embedded = self.embidding(x)
    seq_out, new_hidden = self.gru(embedded, initial_state=hidden)
    # Merge the batch and time axes: the result has one row per
    # (example, step) and seq_out.shape[2] columns.
    flat = tf.reshape(seq_out, (-1, seq_out.shape[2]))
    scores = self.final(flat)
    return scores, new_hidden


In [23]:
# vocab_inp_size, embidding_dim, units and batch_size are defined in the
# model-parameters cell above.

# Create the encoder object.
encoder = Endcoder(vocab_inp_size, embidding_dim, units, batch_size)
# Start from an all-zero hidden state.
sample_hidden = encoder.initialize_hidden_state()
# Run the sample batch `eng` (left over from the first-batch cell) through the
# encoder. The model is invoked as encoder(...) rather than encoder.call(...)
# so that Keras' own __call__ wrapper runs around the forward pass.
sample_output, sample_hidden = encoder(eng, sample_hidden)

# Print the shapes of the encoder outputs.
print('Encoder output shape: (batch_size, sequence_k=length, units) {}'.format(sample_output.shape))
print('Encoder hidden state shape: (batch_size, units) {}'.format(sample_hidden.shape))



Encoder output shape: (batch_size, sequence_k=length, units) (32, 49, 1024)
Encoder hidden state shape: (batch_size, units) (32, 1024)


In [24]:
# Create the decoder object.
decoder = Decoder(vocab_tar_size, embidding_dim, units, batch_size)
# Feed one random, valid target-token id per example. Integer ids in
# [1, vocab_tar_size) are used because the Embedding layer expects integer
# indices; float samples in [0, 1) would not be meaningful token ids.
sample_dec_input = tf.random.uniform(
    (batch_size, 1), minval=1, maxval=vocab_tar_size, dtype=tf.int32)
# Invoke the model as decoder(...) rather than decoder.call(...) so that Keras'
# own __call__ wrapper runs; the returned state is not needed here, hence `_`.
sample_decoder_output, _ = decoder(sample_dec_input, sample_hidden)

# Print the shape of the decoder output.
print('Decoder output shape: (batch_size, vocab_size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab_size) (32, 25769)


Create the optimizer and loss function:

In [26]:
# Create the optimizer.
optimizer = tf.keras.optimizers.Adam()
# Create the loss object. from_logits=True because the decoder's final Dense
# layer has no softmax activation and therefore outputs raw scores, not
# probabilities. reduction='none' keeps one loss value per example so that
# padding positions can be masked out below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

# Loss function used for training and evaluation.
def loss_fun(real, pred):
  """Cross-entropy between ``real`` ids and ``pred`` scores, ignoring padding.

  Args:
    real: true token ids for one time step; id 0 marks padding.
    pred: decoder scores (logits) for the same positions, one row per example.

  Returns:
    A scalar tensor: the mean over the batch of the per-example losses, where
    padded positions contribute 0.
  """
  # 1.0 where the target is a real token, 0.0 where it is padding.
  is_real_token = tf.math.not_equal(real, 0)
  per_example_loss = loss_object(real, pred)
  mask = tf.cast(is_real_token, dtype=per_example_loss.dtype)
  per_example_loss *= mask

  return tf.reduce_mean(per_example_loss)




In [None]:
# Metric that averages the training batch losses it is called with.
train_loss = tf.metrics.Mean(name='train loss')
# Metric that averages the test batch losses it is called with.
test_loss = tf.metrics.Mean(name='test loss')


**create training steps**

Computing gradients using autodiff

In [27]:
# tf.function compiles the step into a graph to speed it up.
@tf.function
def train_steps(inputs, target, enc_hidden):
  """Run one optimisation step on a single batch.

  Args:
    inputs: batch of source (English) token-id sequences.
    target: batch of target (Spanish) token-id sequences.
    enc_hidden: initial hidden state of the encoder.

  Returns:
    The batch loss: the summed per-step loss divided by the target length.
    The same value is also added to the ``train_loss`` metric.
  """
  loss = 0
  # Record the forward pass so the loss can be differentiated afterwards.
  with tf.GradientTape() as tape:
    _, enc_hidden = encoder(inputs, enc_hidden)
    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden

    # First decoder input: the <start> token for every example in the batch.
    dec_inputs = tf.expand_dims([sp_tokenizer.word_index['<start>']] * inputs.shape[0], 1)

    # Position 0 of the target is <start>, so prediction begins at position 1.
    for t in range(1, target.shape[1]):
      predictions, dec_hidden = decoder(dec_inputs, dec_hidden)
      loss += loss_fun(target[:, t], predictions)
      # Teacher forcing: feed the true token, not the prediction, as next input.
      dec_inputs = tf.expand_dims(target[:, t], 1)

  batch_loss = loss / int(target.shape[1])

  # Differentiate and update outside the tape context so these operations are
  # not recorded themselves.
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  train_loss(batch_loss)
  return batch_loss


create the test step function

In [28]:
# tf.function compiles the step into a graph to speed it up.
@tf.function
def test_steps(inputs, target, enc_hidden):
  """Compute the loss on a single evaluation batch without updating the model.

  No gradients are computed and the optimizer is not called, so the encoder
  and decoder weights are left unchanged.

  Args:
    inputs: batch of source (English) token-id sequences.
    target: batch of target (Spanish) token-id sequences.
    enc_hidden: initial hidden state of the encoder.

  Returns:
    The batch loss: the summed per-step loss divided by the target length.
    The same value is also added to the ``test_loss`` metric.
  """
  loss = 0
  _, enc_hidden = encoder(inputs, enc_hidden)
  # The decoder starts from the encoder's final state.
  dec_hidden = enc_hidden

  # First decoder input: the <start> token for every example in the batch.
  dec_inputs = tf.expand_dims([sp_tokenizer.word_index['<start>']] * inputs.shape[0], 1)

  # Position 0 of the target is <start>, so prediction begins at position 1.
  for t in range(1, target.shape[1]):
    predictions, dec_hidden = decoder(dec_inputs, dec_hidden)
    loss += loss_fun(target[:, t], predictions)
    # Feed the true token as the next input, mirroring the training step so
    # the two losses are comparable.
    dec_inputs = tf.expand_dims(target[:, t], 1)

  batch_loss = loss / int(target.shape[1])

  # Record in the test metric, not the training one.
  test_loss(batch_loss)
  return batch_loss


create the training loop

In [None]:
EPOCHS = 3

old_test_loss = 1000000 #1 million

#create the training loop:
for epoch in range(EPOCHS):
  train_loss.reset_states()
  test_loss.reset_states()

  enc_hidden = encoder.initialize_hidden_state()
  steps_per_epoch = eng_sequence.shape[0] // batch_size
  bar = tf.keras.utils.Progbar(target=steps_per_epoch)

  count = 0
  #iterate over the training set:
  for (batch, (inputs, target)) in enumerate(train_dataset):
    count += 1

    #run the training step:
    batch_loss = train_steps(inputs, target, enc_hidden)
    bar.update(count)

  #iterate over the testing set:
  for (batch, (inputs, target)) in enumerate(test_dataset):
