In [None]:
!pip install --upgrade gensim
from gensim.models.fasttext import FastText
import numpy as np
%tensorflow_version 1.x
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 1.3MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3
TensorFlow 1.x selected.


Using TensorFlow backend.


In [None]:
import pandas as pd
def load_dataset(filename):
  df = pd.read_csv(filename, encoding = "utf-8",
       names = ["Sentence", "Intent"])
  intent = df["Intent"]
  unique_intent = list(set(intent))
  sentences = list(df["Sentence"])
  
  return (intent, unique_intent, sentences)

In [None]:
intent, unique_intent, sentences = load_dataset("/content/CatagoricalIntents.csv")

In [None]:
import tensorflow as tf
print ("TensorFlow version: " + tf.__version__)

TensorFlow version: 1.15.2


In [None]:
nltk.download("punkt")
def cleaning(sentences):
  words = [] 
  for s in sentences:
    w = word_tokenize(s)
    words.append([i for i in w])     
  return words  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])  
print(len(sentences))

1567
[['நான்', '2in1', 'கணக்கில்', 'சேமிப்பது', 'எப்படி', '?'], ['2in1', 'கணக்கில்', 'நான்', 'சேமிப்பது', 'எப்படி', '?']]
1567


In [None]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [None]:
def max_length(words):
  return(len(max(words, key = len)))

In [None]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 1141 and Maximum length = 22


In [None]:
print(cleaned_words[0])

['நான்', '2in1', 'கணக்கில்', 'சேமிப்பது', 'எப்படி', '?']


In [None]:
embedding_size = 300
window_size = 40
min_word = 5
down_sampling = 1e-2

In [None]:
ft_model = FastText(cleaned_words,
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=1,
                      iter=100)

In [None]:
print(ft_model.wv['2in1'])
word_tokenizer.word_index.items()

[ 0.45624673  0.07033522 -0.23322707 -0.48546085 -0.01372106 -0.6082727
 -0.3131593   0.09224973  0.26917297  0.31518993 -0.14061175 -0.09439113
 -0.339316    0.2949583   0.23177084 -0.259394   -0.03287625 -0.33913615
 -0.08235746  0.13398245  0.3013094   0.3271468  -0.39975283 -0.3236494
  0.03857259 -0.1209999  -0.12897602  0.27902725 -0.00608854  0.09225049
 -0.30006635  0.04100168 -0.06102358 -0.15100484 -0.21715383 -0.18460332
 -0.16859087  0.03054993 -0.12335136  0.03482084 -0.30180815  0.30412665
 -0.3495495   0.28329828 -0.2954486  -0.51078266  0.04822594  0.22527137
  0.32289088 -0.09882434 -0.2943859   0.41832945 -0.01083656 -0.4120836
  0.3174567  -0.22432175 -0.3438079   0.03579382  0.3109133  -0.34825537
  0.04651307  0.07134812 -0.3262328  -0.01765268 -0.28858018  0.03841924
  0.34882674 -0.11676639 -0.00837123 -0.5219775  -0.17520565 -0.5401822
 -0.28187165  0.12310848  0.14714393 -0.407563   -0.23612057 -0.01540107
 -0.26487246 -0.19444713  0.27797553  0.40320683  0.104

dict_items([('?', 1), ('என்ன', 2), ('நான்', 3), ('கடன்', 4), ('எனது', 5), ('boc', 6), ('டிக்கிரி', 7), ('கணக்கை', 8), ('”', 9), ('எவ்வாறு', 10), ('எப்படி', 11), ('அட்டை', 12), ('அன்பளிப்புச்', 13), ('வெளிநாட்டு', 14), ('செலான்', 15), ('முடியும்', 16), ('அட்டையை', 17), ('நாணய', 18), ('“', 19), ('சீட்டு', 20), ('வேண்டும்', 21), ('எவை', 22), ('ஆவணங்கள்', 23), ('பெற', 24), ('.', 25), ('முடியுமா', 26), ('செய்ய', 27), ('தேவையான', 28), ('பெற்றுக்', 29), ('ஆரம்பிக்க', 30), ('fcaispe', 31), ('திறைசேரி', 32), ('சேமிப்பு', 33), ('ஆயுள்', 34), ('எந்த', 35), ('கொள்ள', 36), ('பற்றி', 37), ('எவ்வளவு', 38), ('தொழில்', 39), ('இலங்கை', 40), ('கீழ்', 41), ('நிலையான', 42), ('வழங்குனர்', 43), ('எனக்கு', 44), ('2in1', 45), ('வேண்டிய', 46), ('ஒன்றை', 47), ('புதிய', 48), ('தேவைப்படும்', 49), ('மாதாந்த', 50), ('கணக்கொன்றை', 51), ('கணக்கிற்கான', 52), ('கடனை', 53), ('குறைந்தபட்ச', 54), ('மற்றும்', 55), ('வகையின்', 56), ('வயது', 57), ('தொகை', 58), ('விற்பனை', 59), ('பெறுவது', 60), ('வேண்டுமா', 61), ('கணக்கில்', 6

In [None]:
word_index =word_tokenizer.word_index

In [None]:
#embedding matrix

print('preparing embedding matrix...')
words_not_found = []
nb_words = 1141

embedding_matrix = np.zeros((nb_words, embedding_size ))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = ft_model.wv[word]
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

preparing embedding matrix...
number of null word embeddings: 1


In [None]:
print("sample words not found: ", words_not_found)

sample words not found:  []


In [None]:
def encoding_doc(token, words):
  return(token.texts_to_sequences(words))

In [None]:
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')
encoded_output = encoding_doc(output_tokenizer, intent)
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)
encoded_output.shape


(1567, 1)

In [None]:
def one_hot(encode):
  o = OneHotEncoder(sparse = False)
  return(o.fit_transform(encode))

In [None]:
output_one_hot = one_hot(encoded_output)

In [None]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [None]:
def padding_doc(encoded_doc, max_length):
  return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [None]:
padded_doc = padding_doc(encoded_doc, max_length)

In [None]:
from sklearn.model_selection import train_test_split
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.3)
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (1096, 22) and train_Y = (1096, 10)
Shape of val_X = (471, 22) and val_Y = (471, 10)


In [None]:
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras import regularizers
from keras import optimizers

batch_size = 256 
num_epochs = 8 

#model parameters
num_filters = 64 
#embed_dim = 300 
weight_decay = 1e-4

num_classes = 10
print("training CNN ...")
model = Sequential()
model.add(Embedding(nb_words, embedding_size, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Conv1D(num_filters, 7, activation='linear', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='linear', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='linear', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(num_classes, activation='sigmoid'))  #multi-label (k-hot encoding)

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()


training CNN ...
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 22, 300)           342300    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 22, 64)            134464    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 11, 64)            0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 11, 64)            28736     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 3

In [None]:
from keras.callbacks import EarlyStopping
#define callbacks
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

In [None]:
hist = model.fit(train_X, train_Y, batch_size=batch_size, epochs=num_epochs, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)


Train on 986 samples, validate on 110 samples
Epoch 1/8
 - 1s - loss: 0.6434 - accuracy: 0.6287 - val_loss: 0.4008 - val_accuracy: 0.9127
Epoch 2/8
 - 1s - loss: 0.4084 - accuracy: 0.8409 - val_loss: 0.2525 - val_accuracy: 0.9255
Epoch 3/8
 - 1s - loss: 0.3161 - accuracy: 0.8912 - val_loss: 0.2054 - val_accuracy: 0.9309
Epoch 4/8
 - 1s - loss: 0.2741 - accuracy: 0.9059 - val_loss: 0.1764 - val_accuracy: 0.9364
Epoch 5/8
 - 1s - loss: 0.2333 - accuracy: 0.9224 - val_loss: 0.1556 - val_accuracy: 0.9400
Epoch 6/8
 - 1s - loss: 0.1997 - accuracy: 0.9341 - val_loss: 0.1373 - val_accuracy: 0.9491
Epoch 7/8
 - 1s - loss: 0.1724 - accuracy: 0.9427 - val_loss: 0.1233 - val_accuracy: 0.9527
Epoch 8/8
 - 1s - loss: 0.1549 - accuracy: 0.9516 - val_loss: 0.1108 - val_accuracy: 0.9564


In [None]:
print(model.test_on_batch(val_X,val_Y))
model.metrics_names

[0.10818511, 0.96265066]


['loss', 'accuracy']

In [None]:
from imblearn.over_sampling import SMOTE



In [None]:
train_X.shape,train_Y.shape


((1096, 22), (1096, 10))

In [None]:
smote = SMOTE('minority')
X_sm, y_sm = smote.fit_sample(train_X,train_Y)



In [None]:
print(X_sm.shape,y_sm.shape)

(1409, 22) (1409, 10)


In [None]:
hist = model.fit(X_sm, y_sm, batch_size=batch_size, epochs=num_epochs, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)

Train on 1268 samples, validate on 141 samples
Epoch 1/8
 - 1s - loss: 0.1726 - accuracy: 0.9442 - val_loss: 0.2106 - val_accuracy: 0.9262
Epoch 2/8
 - 1s - loss: 0.1447 - accuracy: 0.9491 - val_loss: 0.1130 - val_accuracy: 0.9582
Epoch 3/8
 - 1s - loss: 0.1271 - accuracy: 0.9582 - val_loss: 0.1085 - val_accuracy: 0.9603
Epoch 4/8
 - 1s - loss: 0.1158 - accuracy: 0.9590 - val_loss: 0.1025 - val_accuracy: 0.9681
Epoch 5/8
 - 1s - loss: 0.0986 - accuracy: 0.9667 - val_loss: 0.0778 - val_accuracy: 0.9780
Epoch 6/8
 - 1s - loss: 0.0864 - accuracy: 0.9722 - val_loss: 0.0645 - val_accuracy: 0.9809
Epoch 7/8
 - 1s - loss: 0.0794 - accuracy: 0.9743 - val_loss: 0.0602 - val_accuracy: 0.9823
Epoch 8/8
 - 1s - loss: 0.0728 - accuracy: 0.9775 - val_loss: 0.0537 - val_accuracy: 0.9830


In [None]:
print(model.test_on_batch(val_X,val_Y))
model.metrics_names

[0.044076752, 0.9864379]


['loss', 'accuracy']

In [None]:
from sklearn.utils import class_weight
y_train_labels = np.argmax(train_Y, axis =1)
class_weight = class_weight.compute_class_weight('balanced',np.unique(y_train_labels),y_train_labels)

In [None]:
hist = model.fit(X_sm, y_sm, batch_size=batch_size, epochs=num_epochs, class_weight=class_weight, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)

Train on 1268 samples, validate on 141 samples
Epoch 1/8
 - 1s - loss: 0.0669 - accuracy: 0.9778 - val_loss: 0.0518 - val_accuracy: 0.9830
Epoch 2/8
 - 1s - loss: 0.0604 - accuracy: 0.9822 - val_loss: 0.0500 - val_accuracy: 0.9830
Epoch 3/8
 - 1s - loss: 0.0572 - accuracy: 0.9827 - val_loss: 0.0471 - val_accuracy: 0.9858
Epoch 4/8
 - 1s - loss: 0.0501 - accuracy: 0.9860 - val_loss: 0.0396 - val_accuracy: 0.9865
Epoch 5/8
 - 1s - loss: 0.0479 - accuracy: 0.9861 - val_loss: 0.0385 - val_accuracy: 0.9887
Epoch 6/8
 - 1s - loss: 0.0433 - accuracy: 0.9876 - val_loss: 0.0372 - val_accuracy: 0.9894
Epoch 7/8
 - 1s - loss: 0.0372 - accuracy: 0.9904 - val_loss: 0.0366 - val_accuracy: 0.9894
Epoch 8/8
 - 1s - loss: 0.0373 - accuracy: 0.9906 - val_loss: 0.0360 - val_accuracy: 0.9894
Epoch 00008: early stopping


In [None]:
print(model.test_on_batch(val_X,val_Y))
model.metrics_names

[0.024780676, 0.9928104]


['loss', 'accuracy']

In [None]:
hist = model.fit(train_X, train_Y, batch_size=batch_size, epochs=num_epochs, class_weight=class_weight, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)

Train on 986 samples, validate on 110 samples
Epoch 1/8
 - 1s - loss: 0.0352 - accuracy: 0.9911 - val_loss: 0.0158 - val_accuracy: 0.9973
Epoch 2/8
 - 1s - loss: 0.0318 - accuracy: 0.9921 - val_loss: 0.0152 - val_accuracy: 0.9973
Epoch 3/8
 - 1s - loss: 0.0303 - accuracy: 0.9925 - val_loss: 0.0152 - val_accuracy: 0.9964
Epoch 4/8
 - 1s - loss: 0.0284 - accuracy: 0.9922 - val_loss: 0.0156 - val_accuracy: 0.9964
Epoch 5/8
 - 1s - loss: 0.0267 - accuracy: 0.9937 - val_loss: 0.0156 - val_accuracy: 0.9964
Epoch 00005: early stopping


In [None]:
print(model.test_on_batch(val_X,val_Y))
model.metrics_names

[0.020564195, 0.99432015]


['loss', 'accuracy']

In [None]:
model.evaluate(val_X,val_Y)



[0.020564194278289306, 0.9938428997993469]