In [1]:
#importing necessary dependencies

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense,LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
from google.colab import files
train_uploaded = files.upload()

Saving train.csv to train (2).csv


In [3]:
import io
data = pd.read_csv(io.BytesIO(train_uploaded['train.csv']))

In [4]:
dummy_data = data[:7000]

In [5]:
dummy_data.shape

(7000, 2)

In [6]:
dummy_data.head()

Unnamed: 0,text,intent
0,listen to westbam alumb allergic on google music,PlayMusic
1,add step to me to the 50 clásicos playlist,AddToPlaylist
2,i give this current textbook a rating value of...,RateBook
3,play the song little robin redbreast,PlayMusic
4,please add iris dement to my playlist this is ...,AddToPlaylist


In [7]:
sentences = list(dummy_data["text"])
unique_intent = list(set(dummy_data.intent))

In [8]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
#cleaning the text and tokenizing 

def cleaning(sentences):
  words = []
  for s in sentences :
    clean = re.sub(r'[^a-z A-Z 0-9]', " ",s)
    w = word_tokenize(clean)

    words.append([i.lower() for i in w])
  return words


In [10]:
cleaned_text = cleaning(sentences)
print(len(cleaned_text))
print(cleaned_text[:2])

7000
[['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music'], ['add', 'step', 'to', 'me', 'to', 'the', '50', 'cl', 'sicos', 'playlist']]


In [11]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [12]:
def max_length(words):
  return(len(max(words, key = len)))

In [13]:
word_tokenizer = create_tokenizer(cleaned_text)
vocab_size = len(word_tokenizer.word_index) + 1
max_len = max_length(cleaned_text)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_len))

Vocab Size = 7563 and Maximum length = 33


In [14]:
def encoding_doc(token, words):
  return(token.texts_to_sequences(words))

In [15]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_text)

In [16]:
def padding_doc(encoded_doc, max_length):
  return(pad_sequences(encoded_doc , maxlen = max_len,padding="post"))

In [17]:
padded_doc = padding_doc(encoded_doc, max_len)
print(padded_doc[:5])

[[ 188    4 2841 2842 2843   17  184   35    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]
 [  11  361    4   19    4    1  456  870 1049   15    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]
 [   7   29   18   36   90    2   64  242    6   51   24    2  112   64
     6   20    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]
 [   9    1   46  426 1050 2844    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]
 [  55   11 1789 1790    4   13   15   18   10 2845    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]]


In [18]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (7000, 33)


In [19]:
unique_intent = list(set(dummy_data.intent))
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

In [20]:
output_tokenizer.word_index

{'addtoplaylist': 3,
 'bookrestaurant': 6,
 'getweather': 2,
 'playmusic': 4,
 'ratebook': 7,
 'searchcreativework': 1,
 'searchscreeningevent': 5}

In [21]:
encoded_output = encoding_doc(output_tokenizer, dummy_data.intent)

In [22]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)
encoded_output.shape

(7000, 1)

In [23]:
def one_hot(encode):
  o = OneHotEncoder(sparse = False)
  return(o.fit_transform(encode))

In [24]:
output_one_hot = one_hot(encoded_output)

In [25]:
output_one_hot.shape

(7000, 7)

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)

In [27]:
print("Shape of train_X = %s and train_Y = %s" % (X_train.shape, Y_train.shape))
print("Shape of val_X = %s and val_Y = %s" % (X_test.shape, Y_test.shape))

Shape of train_X = (5600, 33) and train_Y = (5600, 7)
Shape of val_X = (1400, 33) and val_Y = (1400, 7)


In [29]:
def create_model(vocab_size , max_len):
  model = Sequential()
  model.add(Embedding(vocab_size , 128,input_length = max_len, trainable = False))
  model.add(Bidirectional(LSTM(128)))
  model.add(Dense(32,activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(7, activation = "softmax"))
  return model


In [30]:
model = create_model(vocab_size , max_len)

In [31]:
model.compile(loss = "categorical_crossentropy", optimizer = "adam" ,metrics = ["accuracy"])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 33, 128)           968064    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_1 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 231       
Total params: 1,239,687
Trainable params: 271,623
Non-trainable params: 968,064
_________________________________________________________________


In [32]:
filename = "intent_model.h5"
checkpoint = ModelCheckpoint(filename, monitor='val_loss',verbose=1,save_best_only=True, mode='min')

model_fit = model.fit(X_train, Y_train, epochs = 50, batch_size = 32, validation_data = (X_test, Y_test), callbacks = [checkpoint])

Train on 5600 samples, validate on 1400 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.91229, saving model to intent_model.h5
Epoch 2/50

Epoch 00002: val_loss improved from 0.91229 to 0.52463, saving model to intent_model.h5
Epoch 3/50

Epoch 00003: val_loss improved from 0.52463 to 0.41783, saving model to intent_model.h5
Epoch 4/50

Epoch 00004: val_loss improved from 0.41783 to 0.29331, saving model to intent_model.h5
Epoch 5/50

Epoch 00005: val_loss did not improve from 0.29331
Epoch 6/50

Epoch 00006: val_loss did not improve from 0.29331
Epoch 7/50

Epoch 00007: val_loss improved from 0.29331 to 0.24464, saving model to intent_model.h5
Epoch 8/50

Epoch 00008: val_loss improved from 0.24464 to 0.22935, saving model to intent_model.h5
Epoch 9/50

Epoch 00009: val_loss improved from 0.22935 to 0.21837, saving model to intent_model.h5
Epoch 10/50

Epoch 00010: val_loss improved from 0.21837 to 0.21800, saving model to intent_model.h5
Epoch 11/50

Epoch 00011: val

In [33]:
def predictions(text):
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean)
  test_word = [w.lower() for w in test_word]
  test_ls = word_tokenizer.texts_to_sequences(test_word)
  print(test_word)
  
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_len)
  
  pred = model.predict_proba(x)
  return pred

In [34]:
def get_final_output(pred, classes):
  predictions = pred[0]
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
 
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))

In [35]:
text = "add eye of the tiger to the list"
pred = predictions(text)
get_final_output(pred, unique_intent)

['add', 'eye', 'of', 'the', 'tiger', 'to', 'the', 'list']
AddToPlaylist has confidence = 1.0
GetWeather has confidence = 1.754686e-08
SearchScreeningEvent has confidence = 2.8798948e-09
RateBook has confidence = 1.5733876e-09
SearchCreativeWork has confidence = 1.2066022e-09
PlayMusic has confidence = 4.5615337e-10
BookRestaurant has confidence = 2.7473517e-11


In [36]:
text = "find me a table for 4"
pred = predictions(text)
get_final_output(pred, unique_intent)

['find', 'me', 'a', 'table', 'for', '4']
SearchCreativeWork has confidence = 0.9382009
BookRestaurant has confidence = 0.06174491
SearchScreeningEvent has confidence = 4.681304e-05
GetWeather has confidence = 6.978286e-06
RateBook has confidence = 2.9055872e-07
PlayMusic has confidence = 1.2627959e-07
AddToPlaylist has confidence = 4.451483e-10


In [37]:
text = "book me a table for 4"
pred = predictions(text)
get_final_output(pred, unique_intent)

['book', 'me', 'a', 'table', 'for', '4']
BookRestaurant has confidence = 0.99999857
SearchCreativeWork has confidence = 8.642264e-07
GetWeather has confidence = 5.855059e-07
PlayMusic has confidence = 9.900663e-10
SearchScreeningEvent has confidence = 2.7872996e-10
RateBook has confidence = 3.235957e-11
AddToPlaylist has confidence = 1.0417532e-12
