<a href="https://colab.research.google.com/github/HaozheTian/ColabProjects/blob/main/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import unicodedata,re

# 1. Dataset

## Read from Storage

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
!ls "/content/gdrive/MyDrive"

Mounted at /content/gdrive
 AdditiveAttention.ipynb   'Colab Notebooks'   TranslateData
 AttentionMechanism.ipynb   LeavesData	       translate_dataset.zip
 ClassifyLeaves.ipynb	    seq2seq.ipynb
 classify-leaves.zip	    torchmodels


In [None]:
!unzip "/content/gdrive/MyDrive/translate_dataset.zip" -d "/content/gdrive/MyDrive/TranslateData";

Archive:  /content/gdrive/MyDrive/translate_dataset.zip
   creating: /content/gdrive/MyDrive/TranslateData/data/
  inflating: /content/gdrive/MyDrive/TranslateData/data/eng-fra.txt  
   creating: /content/gdrive/MyDrive/TranslateData/data/names/
  inflating: /content/gdrive/MyDrive/TranslateData/data/names/Arabic.txt  
  inflating: /content/gdrive/MyDrive/TranslateData/data/names/Chinese.txt  
  inflating: /content/gdrive/MyDrive/TranslateData/data/names/Czech.txt  
  inflating: /content/gdrive/MyDrive/TranslateData/data/names/Dutch.txt  
  inflating: /content/gdrive/MyDrive/TranslateData/data/names/English.txt  
  inflating: /content/gdrive/MyDrive/TranslateData/data/names/French.txt  
  inflating: /content/gdrive/MyDrive/TranslateData/data/names/German.txt  
  inflating: /content/gdrive/MyDrive/TranslateData/data/names/Greek.txt  
  inflating: /content/gdrive/MyDrive/TranslateData/data/names/Irish.txt  
  inflating: /content/gdrive/MyDrive/TranslateData/data/names/Italian.txt  
  inf

In [None]:
lines = open('/content/gdrive/MyDrive/TranslateData/data/eng-fra.txt', encoding='utf-8').read().strip().split('\n')
print(lines[:5])
for i in range(5): print(lines[i])

['Go.\tVa !', 'Run!\tCours\u202f!', 'Run!\tCourez\u202f!', 'Wow!\tÇa alors\u202f!', 'Fire!\tAu feu !']
Go.	Va !
Run!	Cours !
Run!	Courez !
Wow!	Ça alors !
Fire!	Au feu !


## Define Language Class

In [17]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name              #Name of the language
        self.word2index = {}            #Keys are Strings, values are Ints
        self.word2count = {}            #Count occurance of each word
        self.index2word = {0: "SOS", 1: "EOS"}  #Keys are Ints, values are Strings
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [18]:
my_eng = Lang("my_eng")
my_eng.addSentence("How nice is that")
print(my_eng.word2index["nice"])
print(my_eng.index2word[3])

3
nice


## Converting Unicode

In [19]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

## Prepare Data

In [20]:
fileName = "eng-fra.txt"
print("Reading File=>"+fileName)
lines = open('/content/gdrive/MyDrive/TranslateData/data/'+fileName, encoding='utf-8').read().strip().split('\n')
pairs = [l.split('\t') for l in lines]
print(pairs[0:3])
for pair in pairs:
  pair = [normalizeString(pair[0]),normalizeString(pair[1])]
print(pairs[0:3])

Reading File=>eng-fra.txt
[['Go.', 'Va !'], ['Run!', 'Cours\u202f!'], ['Run!', 'Courez\u202f!']]
[['Go.', 'Va !'], ['Run!', 'Cours\u202f!'], ['Run!', 'Courez\u202f!']]


In [35]:
input_lang = Lang("eng")
output_lang = Lang("fra")
for pair in pairs:
  input_lang.addSentence(pair[0])
  output_lang.addSentence(pair[1])

pairs = [pair for pair in pairs if len(pair[0].split(' ')) < 10 and len(pair[1].split(' ')) < 10]

In [36]:
print("For "+input_lang.name+":\n")
print(input_lang.n_words)
for i in range(2,5):
  print(i,input_lang.index2word[i])

import random
print(random.choice(pairs))
print(input_lang.word2index["ball"])

For eng:

25167
2 Go.
3 Run!
4 Wow!
['It looks like an apple.', 'Ça ressemble à une pomme.']
7927


# 2. Seq2Seq Model