[View in Colaboratory](https://colab.research.google.com/github/LunaLuan/mist/blob/master/RNN_for_NLP.ipynb)

**1. Utils:**

1 class to save dictionary...

In [0]:
from collections import defaultdict


class Vocab(object):
    def __init__(self):
        self.word_to_index = {}
        self.index_to_word = {}
        self.word_freq = defaultdict(int)
        self.total_words = 0
        self.unknown = '<unk>'
        self.add_word(self.unknown, count=0)

    def add_word(self, word, count=1):
        if word not in self.word_to_index:
            index = len(self.word_to_index)
            self.word_to_index[word] = index
            self.index_to_word[index] = word
        self.word_freq[word] += count

    def construct(self, words):
        for word in words:
            self.add_word(word)
        self.total_words = float(sum(self.word_freq.values()))
        print ('{} total words with {} uniques' \
                .format(self.total_words, len(self.word_freq)))

    def encode(self, word):
        if word not in self.word_to_index:
            word = self.unknown
        return self.word_to_index[word]

    def decode(self, index):
        return self.index_to_word[index]

    def __len__(self):
        return len(self.word_freq)


**2. Data evaluate and analysis: **

In [0]:
# # Import enviroments:
# # !pip install Faker
# !pip install keras
# !pip install tensorflow

In [0]:
from faker import Faker
fake = Faker("ja_JP")

print (fake.address())

愛媛県調布市皇居外苑15丁目22番12号 北青山クレスト039


In [0]:
vocab = Vocab()

for i in range(500):
    address = fake.address()
#     print address
    
    for c in address:
        vocab.add_word(c)
    
print (vocab.word_to_index)
print (len(vocab))

print (address[0])


{'<unk>': 0, '山': 1, '梨': 2, '県': 3, '長': 4, '生': 5, '郡': 6, '柄': 7, '町': 8, '台': 9, '場': 10, '6': 11, '丁': 12, '目': 13, '2': 14, '7': 15, '番': 16, '1': 17, '号': 18, ' ': 19, 'ク': 20, 'レ': 21, 'ス': 22, 'ト': 23, '無': 24, '栗': 25, '屋': 26, '8': 27, '熊': 28, '本': 29, '利': 30, '島': 31, '村': 32, '虎': 33, 'ノ': 34, '門': 35, 'ヒ': 36, 'ル': 37, 'ズ': 38, '森': 39, 'タ': 40, 'ワ': 41, 'ー': 42, '3': 43, '4': 44, '丹': 45, '勢': 46, 'ア': 47, 'バ': 48, 'ン': 49, '9': 50, '大': 51, '阪': 52, '府': 53, '網': 54, '白': 55, '里': 56, '市': 57, '竜': 58, '泉': 59, '0': 60, '5': 61, '福': 62, '岡': 63, '足': 64, '立': 65, '区': 66, '戸': 67, 'コ': 68, '太': 69, '田': 70, 'ヶ': 71, '谷': 72, '千': 73, '葉': 74, '代': 75, '上': 76, '広': 77, '方': 78, '京': 79, 'パ': 80, '愛': 81, '知': 82, '夷': 83, '隅': 84, '御': 85, '宿': 86, '高': 87, '輪': 88, '東': 89, '都': 90, '北': 91, '柿': 92, '木': 93, '沢': 94, '媛': 95, '日': 96, '野': 97, '鳥': 98, '越': 99, '滋': 100, '賀': 101, '横': 102, '浜': 103, '西': 104, '睦': 105, '手': 106, '岐': 107, '阜': 108, '調': 109, '布': 

In [0]:
"""
  Length of sentences
"""

import statistics


some_address = [fake.address() for i in range(5000)]
len_of_sentences = list(map(len, some_address))

print(min(len_of_sentences))
print(max(len_of_sentences)) 
print(statistics.mean(len_of_sentences)) 
print(statistics.median(len_of_sentences)) 

14
46
26.2924
28.0


**3. Data processing:**

- Create tokenizer function:

In [0]:
class Tokenizer:
  
  def __init__(self):
    self.vocab = Vocab()
  
  def add_sentence(self, sentence):
    for c in sentence:
      self.vocab.add_word(c)
      
  def text_to_sequence(self, text):
    return list(map(self.vocab.encode, text))
  
  def sequence_to_text(self, indices):
    return list(map(self.vocab.decode, indices))

tokenizer = Tokenizer()

# For testing
# for i in range(5000):
#   address = fake.address()
#   tokenizer.add_sentence(address)

# print (len(tokenizer.vocab))

# address = fake.address()
# print (address)

# sequence = (tokenizer.text_to_sequence(address))
# print (sequence)


# # print (tokenizer.vocab.word_to_index)
# text = (tokenizer.sequence_to_text(sequence))
# print (text)
  

- Add data to tokenizer:

In [0]:
fake = Faker("ja_JP")
for i in range(100000):
  address = fake.address()
  tokenizer.add_sentence(address)
  
print (tokenizer.vocab.word_to_index)
print (len(tokenizer.vocab))

{'<unk>': 0, '山': 1, '口': 2, '県': 3, '狛': 4, '江': 5, '市': 6, '平': 7, '須': 8, '賀': 9, '2': 10, '丁': 11, '目': 12, '番': 13, '号': 14, ' ': 15, '戸': 16, '塚': 17, '町': 18, 'ア': 19, 'ー': 20, 'バ': 21, 'ン': 22, '4': 23, '0': 24, '3': 25, '長': 26, '野': 27, '八': 28, '丈': 29, '島': 30, '高': 31, '輪': 32, '6': 33, '1': 34, 'シ': 35, 'テ': 36, 'ィ': 37, '北': 38, '上': 39, '9': 40, '大': 41, '阪': 42, '府': 43, '羽': 44, '村': 45, '四': 46, '区': 47, '5': 48, '香': 49, '川': 50, '横': 51, '浜': 52, '磯': 53, '子': 54, '松': 55, '浦': 56, '西': 57, '8': 58, '形': 59, '東': 60, '日': 61, '光': 62, '群': 63, '馬': 64, '墨': 65, '田': 66, '南': 67, '赤': 68, '神': 69, '奈': 70, '調': 71, '布': 72, '芝': 73, '中': 74, '鉢': 75, '石': 76, 'ャ': 77, 'ル': 78, 'ム': 79, '7': 80, '郷': 81, '屋': 82, '福': 83, '井': 84, '世': 85, '谷': 86, '橋': 87, '場': 88, 'パ': 89, 'レ': 90, 'ス': 91, '脚': 92, '折': 93, '梨': 94, '板': 95, '無': 96, '栗': 97, '三': 98, 'コ': 99, 'ポ': 100, '岐': 101, '阜': 102, '袖': 103, 'ケ': 104, '浅': 105, '草': 106, '静': 107, '岡': 108, '文': 109, '京': 

- Create data train, validation and test:

In [0]:
raw_data = [fake.address() for i in range(2)]
print (raw_data)

['香川県三鷹市箪笥町23丁目13番18号 太田ヶ谷パレス052', '長野県横浜市緑区湯本塩原26丁目15番11号 松石シティ706']


In [0]:
# train = list(map(tokenizer.text_to_sequence, [fake.address() for i in range(8000)]))
# validation = list(map(tokenizer.text_to_sequence, [fake.address() for i in range(1000)]))
# test = list(map(tokenizer.text_to_sequence, [fake.address() for i in range(1000)]))
data = list(map(tokenizer.text_to_sequence, [fake.address() for i in range(10000)]))
print (data[0])
# print (train[0: 2])

[76, 50, 3, 51, 52, 6, 163, 304, 47, 1, 74, 193, 66, 10, 34, 11, 12, 34, 34, 13, 23, 14]


In [0]:
# len_of_sentences = list(map(len, data))
# print (len_of_sentences)

# print (min(len_of_sentences))
# print (max(len_of_sentences))
# print (len(len_of_sentences))


- Preprocessing:

In [0]:
from keras.preprocessing.sequence import pad_sequences

padded_data = pad_sequences(data, padding='post')

print (list(map(len, data[0:10])))
print (list(map(len, padded_data[0:10])))

print (padded_data[0:2])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


[22, 16, 18, 31, 20, 29, 28, 30, 27, 20]
[44, 44, 44, 44, 44, 44, 44, 44, 44, 44]
[[ 76  50   3  51  52   6 163 304  47   1  74 193  66  10  34  11  12  34
   34  13  23  14   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0]
 [187  30   3 302  30  47 252 253  23  11  12  10  25  13  23  14   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0]]


- One-hot vector generate:

In [0]:
from keras.utils import to_categorical

data_input = padded_data
data_output = to_categorical(padded_data)

print (data_input[0])
print (data_output[0])

[ 76  50   3  51  52   6 163 304  47   1  74 193  66  10  34  11  12  34
  34  13  23  14   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


**4. Build models:**

- Create model:

In [0]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense

import numpy as np


model = Sequential()
model.add(Embedding(
    input_dim=len(tokenizer.vocab),
    output_dim=200
))

model.add(Bidirectional(
    LSTM(512, activation="sigmoid", return_sequences=True)
))

model.add(Dense(len(tokenizer.vocab)))

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)


print (model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 200)         71800     
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 1024)        2920448   
_________________________________________________________________
dense_3 (Dense)              (None, None, 359)         367975    
Total params: 3,360,223
Trainable params: 3,360,223
Non-trainable params: 0
_________________________________________________________________
None


- Train model:

In [0]:
model.fit(
    x=data_input[0:9900],
    y=data_output[0:9900],
    batch_size=4,
    epochs=5
)


Epoch 1/5


In [0]:
model.evaluate(
    x=data_input[9900:],
    y=data_output[9900:]
)



[nan, 0.4998039293289185]