<a href="https://colab.research.google.com/github/Mohadese-ghayoomi/Assignments-of-DL/blob/main/Assignment_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing the required Libraries**

In [22]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

# **Loading the Dataset**

In [23]:
os.getcwd()

'/content/drive/My Drive/Colab Notebooks'

In [24]:
os.chdir('/content/drive/My Drive/Colab Notebooks')

In [25]:
from google.colab import drive


In [26]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
  w = unicode_to_ascii(w.lower().strip())

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z,اب پ ت س ج چ ح خ د ذ ر ز ژ س ش ص ض ط ظ ع غ ف ق ک گ ل م ن و ه ی, ".", "?", "!", ",")
  #w = re.sub(r"[^a-zA-Z?.!,¿ا ب پ ت س ج چ ح خ د ذ ر ز ژ س ش ص ض ط ظ ع غ ف ق ک گ ل م ن و ه ی]+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [28]:
en_sentence = u"Get out!"
pes_sentence = u"برو بیرون!"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(pes_sentence).encode('utf-8'))

<start> get out ! <end>
b'<start> \xd8\xa8\xd8\xb1\xd9\x88 \xd8\xa8\xdb\x8c\xd8\xb1\xd9\x88\xd9\x86 ! <end>'


In [30]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, persian]
def create_dataset(path, num_examples):
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:-1] ]  for l in lines[:num_examples]]

  return zip(*word_pairs)

In [31]:
path_to_file = "pes.txt"
en, pes = create_dataset("pes.txt", 50)
print(en)
print(pes)

('<start> who ? <end>', '<start> go on . <end>', '<start> smile . <end>', '<start> attack ! <end>', '<start> got it ! <end>', '<start> i know . <end>', '<start> listen . <end>', '<start> really ? <end>', '<start> really ? <end>', '<start> why me ? <end>', '<start> be cool . <end>', '<start> be cool . <end>', '<start> be cool . <end>', '<start> come in . <end>', '<start> come on ! <end>', '<start> get out ! <end>', '<start> get out ! <end>', '<start> go away ! <end>', '<start> go away ! <end>', '<start> help me ! <end>', '<start> help me ! <end>', '<start> hold it ! <end>', '<start> see you . <end>', '<start> see you . <end>', '<start> shut up ! <end>', '<start> so long . <end>', '<start> take it . <end>', '<start> tell me . <end>', '<start> welcome . <end>', '<start> get away ! <end>', '<start> grab him . <end>', '<start> hurry up . <end>', '<start> keep out ! <end>', '<start> speak up ! <end>', '<start> terrific ! <end>', '<start> we agree . <end>', '<start> what for ? <end>', '<start

In [10]:
#This class allows to vectorize a text corpus, by turning each text into either a sequence of integers
#(each integer being the index of a token in a dictionary) or into a vector where the coefficient for each token could be binary
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
#Trains the model for a fixed number of epochs (iterations on a dataset)
  lang_tokenizer.fit_on_texts(lang)
#Transforms each text in texts to a sequence of integers.
#Each item in texts can also be a list, in which case we assume each item of that list to be a token.
  tensor = lang_tokenizer.texts_to_sequences(lang)
#Pads sequences to the same length
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post') #pad after each sequence

  return tensor, lang_tokenizer

In [11]:
def load_dataset(path, num_examples=None):
  # creating cleaned input, output pairs
  targ_lang, inp_lang = create_dataset(path, num_examples)

  input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
  target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

#**Limitation the size of the dataset to experiment faster**

In [14]:
# Try experimenting with the size of that dataset
#num_examples of persian words = 2275
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, 2275)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [16]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

1820 1820 455 455


In [19]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

In [21]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[2])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[2])

Input Language; index to word mapping
1 ----> <start>
9 ----> او
2897 ----> چیزهایی
6 ----> را
16 ----> با
7 ----> من
13 ----> در
869 ----> میان
5 ----> می
759 ----> گذارد
11 ----> که
4 ----> به
335 ----> هیچکس
101 ----> دیگر
17 ----> نمی
460 ----> گوید
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
14 ----> he
2215 ----> confided
13 ----> in
25 ----> me
231 ----> things
14 ----> he
94 ----> would
100 ----> tell
76 ----> no
65 ----> one
537 ----> else
3 ----> .
2 ----> <end>


# **Create a tf.data dataset**