In [None]:
!pip install --upgrade gensim
from gensim.models.fasttext import FastText
import numpy as np
%tensorflow_version 1.x
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 1.3MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3
TensorFlow 1.x selected.


Using TensorFlow backend.


In [None]:
import pandas as pd
def load_dataset(filename):
  """Read a two-column CSV of (sentence, intent) rows.

  Args:
    filename: path to a UTF-8 encoded CSV file. The first column is read as
      "Sentence" and the second as "Intent".

  Returns:
    A tuple (intent, unique_intent, sentences):
      intent: the "Intent" column as a pandas Series, one label per row.
      unique_intent: list of the distinct labels, in order of first appearance.
      sentences: the "Sentence" column as a plain list.
  """
  df = pd.read_csv(filename, encoding = "utf-8",
       names = ["Sentence", "Intent"])
  intent = df["Intent"]
  # dict.fromkeys de-duplicates while keeping first-appearance order.
  # list(set(...)) ordered string labels by their per-process hash, so the
  # label order (and anything derived from it) changed between kernel restarts.
  unique_intent = list(dict.fromkeys(intent))
  sentences = list(df["Sentence"])

  return (intent, unique_intent, sentences)

In [None]:
# Load the corpus: per-row intent labels, the distinct labels, and the raw sentences.
# NOTE(review): absolute path — the CSV must exist at /content/bank.csv for this cell to run.
intent, unique_intent, sentences = load_dataset("/content/bank.csv")

In [None]:
# Fetch NLTK's "punkt" tokenizer data — presumably what word_tokenize relies on; confirm.
nltk.download("punkt")
def cleaning(sentences):
  """Tokenize every sentence into a list of lower-cased word tokens.

  Tokens are lower-cased because the Keras Tokenizer built in this notebook is
  created without a `lower` argument and lower-cases tokens when it builds
  word_index, while FastText is trained directly on this function's output.
  Without lower-casing here, a mixed-case token would be trained under one
  spelling and looked up in the embeddings under another.

  Args:
    sentences: iterable of sentence strings.

  Returns:
    A list with one entry per sentence, each a list of token strings as
    produced by word_tokenize and then lower-cased.
  """
  return [[tok.lower() for tok in word_tokenize(s)] for s in sentences]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Tokenize the whole corpus, then show: number of tokenized sentences, the
# first two token lists, and the number of input sentences (should match the first).
cleaned_words = cleaning(sentences)
print(len(cleaned_words), cleaned_words[:2], len(sentences), sep="\n")

1567
[['நான்', '2in1', 'கணக்கில்', 'சேமிப்பது', 'எப்படி', '?'], ['2in1', 'கணக்கில்', 'நான்', 'சேமிப்பது', 'எப்படி', '?']]
1567


In [None]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'):
  """Build a Keras Tokenizer and fit it on `words`.

  Args:
    words: the texts to fit on, passed straight to Tokenizer.fit_on_texts.
    filters: string of characters handed to the Tokenizer as its `filters`
      argument. The backslash is written as an explicit escape (`\\\\`); the
      previous `\\]` spelling was an invalid escape sequence that only produced
      the same characters by accident and triggers a warning on newer Pythons.

  Returns:
    The fitted Tokenizer.
  """
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [None]:
def max_length(words):
  """Return the length of the longest item in `words`.

  Raises ValueError when `words` is empty.
  """
  return max(map(len, words))

In [None]:
word_tokenizer = create_tokenizer(cleaned_words)
# +1 leaves room for index 0, which no word in word_index is assigned to.
vocab_size = len(word_tokenizer.word_index) + 1
# Computed inline instead of calling max_length(cleaned_words): this assignment
# rebinds the name `max_length` from the helper function to an int, so calling
# the helper here made the cell raise TypeError whenever it was run a second time.
max_length = max(len(tokens) for tokens in cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 1141 and Maximum length = 22


In [None]:
# Spot-check: tokens of the first sentence.
print(cleaned_words[0])

['நான்', '2in1', 'கணக்கில்', 'சேமிப்பது', 'எப்படி', '?']


In [None]:
# FastText hyperparameters, consumed by the FastText(...) call in the next cell.
embedding_size = 300   # vector dimensionality (passed as size=); also the width of embedding_matrix
window_size = 40       # passed as window=
min_word = 5           # passed as min_count=
down_sampling = 1e-2   # passed as sample=

In [None]:
# Train FastText embeddings on the tokenized corpus itself (no pre-trained vectors).
# NOTE(review): size= and iter= are gensim 3.x keyword names; this call depends on
# the gensim version installed in the first cell.
# NOTE(review): no seed is passed, so the learned vectors may differ between runs.
ft_model = FastText(cleaned_words,
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=1,
                      iter=100)

In [None]:
# Inspect one learned vector, then the tokenizer's full (word, index) mapping.
# NOTE(review): the second line dumps the entire vocabulary into the cell output.
print(ft_model.wv['2in1'])
word_tokenizer.word_index.items()

[ 3.39925736e-01  7.49238655e-02 -6.95590079e-02 -5.32058120e-01
  2.64677424e-02 -5.34267426e-01 -2.58713096e-01  1.98973551e-01
  2.62593955e-01  3.81210208e-01 -1.23464279e-02 -2.41077200e-01
 -2.73910284e-01  1.77634373e-01  2.27735177e-01 -1.18934445e-01
 -6.08478040e-02 -2.74485737e-01 -1.63209215e-02  1.55432090e-01
  3.08730572e-01  3.34933877e-01 -3.13027054e-01 -2.59852141e-01
  4.22765724e-02 -4.59798127e-02 -6.45569190e-02  3.23049366e-01
  8.52603018e-02  4.60073352e-02 -3.56734484e-01  7.69669563e-02
 -1.14356972e-01  3.02679557e-02 -2.16333643e-01 -1.37601599e-01
 -1.62518919e-01  3.85744460e-02 -1.29629776e-01 -1.19700972e-02
 -3.91772568e-01  3.22847545e-01 -3.75775039e-01  2.55409926e-01
 -3.20515156e-01 -4.84329402e-01  1.73566174e-02  1.29060671e-01
  2.20555678e-01 -4.11303900e-02 -2.29317486e-01  3.82773995e-01
 -3.45704481e-02 -4.25408870e-01  3.46728176e-01 -3.74924213e-01
 -2.39186436e-01 -5.54286838e-02  3.40457737e-01 -3.21905822e-01
  4.63030897e-02  1.61250

dict_items([('?', 1), ('என்ன', 2), ('நான்', 3), ('கடன்', 4), ('எனது', 5), ('boc', 6), ('டிக்கிரி', 7), ('கணக்கை', 8), ('”', 9), ('எவ்வாறு', 10), ('எப்படி', 11), ('அட்டை', 12), ('அன்பளிப்புச்', 13), ('வெளிநாட்டு', 14), ('செலான்', 15), ('முடியும்', 16), ('அட்டையை', 17), ('நாணய', 18), ('“', 19), ('சீட்டு', 20), ('வேண்டும்', 21), ('எவை', 22), ('ஆவணங்கள்', 23), ('பெற', 24), ('.', 25), ('முடியுமா', 26), ('செய்ய', 27), ('தேவையான', 28), ('பெற்றுக்', 29), ('ஆரம்பிக்க', 30), ('fcaispe', 31), ('திறைசேரி', 32), ('சேமிப்பு', 33), ('ஆயுள்', 34), ('எந்த', 35), ('கொள்ள', 36), ('பற்றி', 37), ('எவ்வளவு', 38), ('தொழில்', 39), ('இலங்கை', 40), ('கீழ்', 41), ('நிலையான', 42), ('வழங்குனர்', 43), ('எனக்கு', 44), ('2in1', 45), ('வேண்டிய', 46), ('ஒன்றை', 47), ('புதிய', 48), ('தேவைப்படும்', 49), ('மாதாந்த', 50), ('கணக்கொன்றை', 51), ('கணக்கிற்கான', 52), ('கடனை', 53), ('குறைந்தபட்ச', 54), ('மற்றும்', 55), ('வகையின்', 56), ('வயது', 57), ('தொகை', 58), ('விற்பனை', 59), ('பெறுவது', 60), ('வேண்டுமா', 61), ('கணக்கில்', 6

In [None]:
# Alias for the tokenizer's word -> integer index mapping, used to build the embedding matrix.
word_index =word_tokenizer.word_index

In [None]:
#embedding matrix

print('preparing embedding matrix...')
words_not_found = []
# One row per tokenizer index plus row 0 (tokenizer indices start at 1, so row
# 0 is never assigned and stays all-zero). Derived from the vocabulary instead
# of the previous hard-coded 1141 so the matrix keeps matching the tokenizer
# when the dataset changes.
nb_words = len(word_index) + 1

embedding_matrix = np.zeros((nb_words, embedding_size))
for word, i in word_index.items():
    # Every index in word_index is < nb_words by construction, so no bounds
    # check is needed here.
    try:
        embedding_vector = ft_model.wv[word]
    except KeyError:
        # gensim signals "no vector available" by raising KeyError, never by
        # returning None, so the old `is not None` test could not record a
        # miss. Rows of missing words stay all-zero.
        words_not_found.append(word)
        continue
    embedding_matrix[i] = embedding_vector
# Count rows whose entries are all zero; testing the row *sum* against 0 would
# also count a non-zero row whose components happen to cancel out.
print('number of null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

preparing embedding matrix...
number of null word embeddings: 1


In [None]:
# List the vocabulary words that were recorded as having no embedding vector.
print("sample words not found: ", words_not_found)

sample words not found:  []


In [None]:
def encoding_doc(token, words):
  """Convert `words` to integer sequences using the fitted tokenizer `token`.

  Thin wrapper: returns whatever token.texts_to_sequences(words) returns.
  """
  sequences = token.texts_to_sequences(words)
  return sequences

In [None]:
#tokenizer with filter changed
# Same character set as create_tokenizer's default minus "." and "_", so intent
# labels containing those characters are not split. The backslash is written as
# an explicit `\\` escape; the previous `\]` was an invalid escape sequence
# (same runtime characters, but it warns on newer Pythons).
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\\]^`{|}~')
encoded_output = encoding_doc(output_tokenizer, intent)
# Reshape to a column vector (n_samples, 1), the 2-D layout handed to one_hot().
# NOTE(review): this assumes every intent label encodes to exactly one token.
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)
encoded_output.shape


(1567, 1)

In [None]:
def one_hot(encode):
  """One-hot encode `encode` with a freshly fitted OneHotEncoder.

  The encoder is created with sparse = False and both fitted and applied to
  `encode` in a single fit_transform call; that call's result is returned.
  """
  encoder = OneHotEncoder(sparse = False)
  encoded = encoder.fit_transform(encode)
  return encoded

In [None]:
# One-hot targets for the classifier, one row per sentence.
output_one_hot = one_hot(encoded_output)

In [None]:
# Map each tokenized sentence to its sequence of word_tokenizer indices.
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [None]:
def padding_doc(encoded_doc, max_length):
  """Pad the integer sequences in `encoded_doc` to `max_length`.

  Delegates to pad_sequences with maxlen = max_length and padding = "post",
  and returns its result unchanged.
  """
  padded = pad_sequences(encoded_doc, maxlen = max_length, padding = "post")
  return padded

In [None]:
# Pad every encoded sentence to the corpus-wide maximum length computed earlier.
padded_doc = padding_doc(encoded_doc, max_length)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation. random_state pins the shuffle:
# without it every run produced a different split, so results were not
# reproducible and a checkpoint reloaded after re-running this cell could be
# evaluated on rows it had been trained on.
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2, random_state = 42)
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (1253, 22) and train_Y = (1253, 56)
Shape of val_X = (314, 22) and val_Y = (314, 56)


In [None]:
def create_model(vocab_size, max_length, num_classes = 56):
  """Build the (uncompiled) bidirectional-LSTM intent classifier.

  Layers, in order: a frozen Embedding initialised from the module-level
  `embedding_matrix`, Bidirectional(LSTM(128)), Dense(32, relu), Dropout(0.5),
  and a softmax Dense output layer.

  Args:
    vocab_size: kept for interface compatibility; not used. The Embedding
      dimensions are read from `embedding_matrix` itself.
    max_length: length of the padded input sequences (Embedding input_length).
    num_classes: width of the softmax output layer. Defaults to 56, the value
      that was previously hard-coded.

  Returns:
    The Sequential model.
  """
  # Read both dimensions from the weight matrix so the layer always agrees with
  # the weights it is initialised with, instead of relying on the separate
  # globals nb_words and embedding_size staying in sync with it.
  input_dim, output_dim = embedding_matrix.shape

  model = Sequential()
  model.add(Embedding(input_dim, output_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
  model.add(Bidirectional(LSTM(128)))
  model.add(Dense(32, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(num_classes, activation = "softmax"))

  return model

In [None]:
# Build the classifier and compile it for multi-class classification:
# categorical cross-entropy against the one-hot targets, Adam optimizer,
# accuracy reported alongside the loss.
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 22, 300)           342300    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               439296    
_________________________________________________________________
dense_1 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 56)                1848      
Total params: 791,668
Trainable params: 449,368
Non-trainable params: 342,300
_________________________________________________________________


In [None]:
# Checkpoint file: with save_best_only=True and mode='min' on val_loss, it is
# rewritten only when validation loss reaches a new minimum.
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Train for 100 epochs in batches of 32, validating on the held-out split after each epoch.
hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])


Train on 1253 samples, validate on 314 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 2.73180, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 2.73180 to 1.73894, saving model to model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 1.73894 to 1.21313, saving model to model.h5
Epoch 4/100

Epoch 00004: val_loss improved from 1.21313 to 0.87712, saving model to model.h5
Epoch 5/100

Epoch 00005: val_loss improved from 0.87712 to 0.64664, saving model to model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 0.64664 to 0.48141, saving model to model.h5
Epoch 7/100

Epoch 00007: val_loss improved from 0.48141 to 0.42844, saving model to model.h5
Epoch 8/100

Epoch 00008: val_loss improved from 0.42844 to 0.40199, saving model to model.h5
Epoch 9/100

Epoch 00009: val_loss improved from 0.40199 to 0.25245, saving model to model.h5
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.25245
Epoch 11/100

Epoch 00011: val_loss improved 

In [None]:
# Replace the in-memory model with the checkpoint written during training
# (the lowest-val_loss save), rather than keeping the last epoch's weights.
model = load_model("model.h5")

In [None]:
# Score the reloaded model on the whole validation split as a single batch.
# The printed values line up, in order, with the names shown by model.metrics_names.
print(model.test_on_batch(val_X,val_Y))
model.metrics_names

[0.0515158, 0.98407644]


['loss', 'accuracy']