In [2]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential,Model, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding,Input
from keras.layers import Dropout,GlobalAveragePooling1D,GlobalMaxPooling1D,concatenate,LeakyReLU
from keras.layers import SpatialDropout1D,Conv1D,MaxPooling1D,GRU,BatchNormalization
from keras.callbacks import ModelCheckpoint
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K


In [4]:
def load_data(filename):
    df=pd.read_csv(filename,encoding='latin1',names=['Intent','Sentences'])
    print(df.head())
    intent=df['Intent']
    unique_intent=list(set(intent))
    sentences=list(df['Sentences'])
    
    return (intent,unique_intent,sentences)

In [5]:
intent,unique_intent,sentences=load_data('/home/administrator/Downloads/intent_classification/atis_intents_train.csv')

             Intent                                          Sentences
0       atis_flight   i want to fly from boston at 838 am and arriv...
1       atis_flight   what flights are available from pittsburgh to...
2  atis_flight_time   what is the arrival time in san francisco for...
3      atis_airfare            cheapest airfare from tacoma to orlando
4      atis_airfare   round trip fares from pittsburgh to philadelp...


In [6]:
intent

0            atis_flight
1            atis_flight
2       atis_flight_time
3           atis_airfare
4           atis_airfare
              ...       
4829        atis_airfare
4830         atis_flight
4831        atis_airline
4832         atis_flight
4833         atis_flight
Name: Intent, Length: 4834, dtype: object

In [7]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/administrator/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/administrator/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
stemmer=LancasterStemmer()
stop_words = set(stopwords.words('english'))

In [9]:
def cleaning(sentences):
    words=[]
    for s in sentences:
        clean = re.sub(r'[^a-zA-Z0-9]', " ",s)
        w=word_tokenize(clean)
        tokens_without_sw = [word for word in w if not word in stop_words]
        words.append([stemmer.stem(i.lower()) for i in tokens_without_sw])

    return words           

In [10]:
cleaned_word=cleaning(sentences)
print(len(cleaned_word))
print(cleaned_word[:2])

4834
[['want', 'fly', 'boston', '838', 'ar', 'denv', '1110', 'morn'], ['flight', 'avail', 'pittsburgh', 'baltim', 'thursday', 'morn']]


In [11]:
def creat_tokenizer(words,filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    tocken=Tokenizer(filters=filters)
    tocken.fit_on_texts(words)
    return tocken

In [12]:
def max_len(words):
    return (len(max(words,key=len)))

In [13]:
word_tockenizer=creat_tokenizer(cleaned_word)
vocab_size=len(word_tockenizer.word_index)+1
max_len=max_len(cleaned_word)

print("Vocab Size= %d and Max length= %d" % (vocab_size,max_len))

Vocab Size= 630 and Max length= 30


In [14]:
def encoding_doc(tocken,words):
    return (tocken.texts_to_sequences(words))

In [15]:
encoded_doc=encoding_doc(word_tockenizer,cleaned_word)

In [16]:
encoded_doc[:5]

[[40, 20, 2, 290, 17, 5, 364, 21],
 [1, 35, 8, 10, 44, 21],
 [17, 101, 4, 6, 365, 1, 12, 18],
 [31, 212, 139, 89],
 [34, 29, 16, 8, 11, 176, 131]]

In [17]:
def padding_doc(encoded_doc,max_len):
    return (pad_sequences(encoded_doc,maxlen=max_len,padding="post"))

In [18]:
padded_doc=padding_doc(encoded_doc,max_len)

In [19]:
padded_doc[:5]

array([[ 40,  20,   2, 290,  17,   5, 364,  21,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  1,  35,   8,  10,  44,  21,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [ 17, 101,   4,   6, 365,   1,  12,  18,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [ 31, 212, 139,  89,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [ 34,  29,  16,   8,  11, 176, 131,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]], dtype=int32)

In [20]:
output_tokenizer = creat_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

In [21]:
output_tokenizer.word_index

{'atis_flight_time': 1,
 'atis_aircraft': 2,
 'atis_abbreviation': 3,
 'atis_airfare': 4,
 'atis_ground_service': 5,
 'atis_flight': 6,
 'atis_airline': 7,
 'atis_quantity': 8}

In [22]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [23]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [24]:
encoded_output.shape

(4834, 1)

In [25]:
def one_hot(encode):
    o = OneHotEncoder(sparse = False)
    return(o.fit_transform(encode))

In [26]:
output_one_hot=one_hot(encoded_output)

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)

In [29]:
print("shape of train_X = %s and train_Y = %s" % (train_X.shape,train_Y.shape))
print("shape of val_X=%s and val_Y=%s"%(val_X.shape,val_Y.shape))

shape of train_X = (3867, 30) and train_Y = (3867, 8)
shape of val_X=(967, 30) and val_Y=(967, 8)


In [27]:
def create_model(vocab_size, max_length):
      model = Sequential()
      model.add(Embedding(vocab_size,128, input_length = max_length, trainable = False))
      model.add(SpatialDropout1D(0.5))
      model.add(Conv1D(filters=32, kernel_size=8,kernel_regularizer=regularizers.l2(0.00001), padding='same'))
      model.add(LeakyReLU(alpha=0.2))
      model.add(MaxPooling1D(pool_size=2))
      model.add(Bidirectional(LSTM(128,dropout=0.5, recurrent_dropout=0.5,return_sequences=True)))
      model.add(SpatialDropout1D(0.5))
      model.add(Conv1D(filters=32, kernel_size=8,kernel_regularizer=regularizers.l2(0.00001), padding='same'))
      model.add(LeakyReLU(alpha=0.2))
      model.add(MaxPooling1D(pool_size=2))
      model.add(Bidirectional(LSTM(128,dropout=0.5, recurrent_dropout=0.5)))
      model.add(Dense(8,activation='softmax'))
      

      return model

In [28]:
model = create_model(vocab_size, max_len)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 128)           80640     
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 30, 128)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 30, 32)            32800     
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 30, 32)            0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 15, 32)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 15, 256)           164864    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 15, 256)           0

In [29]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.93633, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 0.93633 to 0.73015, saving model to model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 0.73015 to 0.58720, saving model to model.h5
Epoch 4/100

Epoch 00004: val_loss improved from 0.58720 to 0.57440, saving model to model.h5
Epoch 5/100

Epoch 00005: val_loss improved from 0.57440 to 0.47804, saving model to model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 0.47804 to 0.44151, saving model to model.h5
Epoch 7/100

Epoch 00007: val_loss improved from 0.44151 to 0.38544, saving model to model.h5
Epoch 8/100

Epoch 00008: val_loss improved from 0.38544 to 0.35903, saving model to model.h5
Epoch 9/100

Epoch 00009: val_loss improved from 0.35903 to 0.35671, saving model to model.h5
Epoch 10/100

Epoch 00010: val_loss improved from 0.35671 to 0.34855, saving model to model.h5
Epoch 11/100

Epoch 00011: val_loss improved from 0.34855 to 0.

In [34]:
model=load_model("model.h5")

In [35]:
def predictions(text):
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean)
  test_word = [w.lower() for w in test_word]
  test_ls = word_tockenizer.texts_to_sequences(test_word)
  print(test_word)
  #Check for unknown words
  if [] in test_ls:
    test_ls = list(filter(None, test_ls))
    
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_len)
  
  pred = model.predict_proba(x)
  
  
  return pred

In [36]:
def get_final_output(pred, classes):
  predictions = pred[0]
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
 
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))

In [40]:
Text="what flights travel from las vegas to los angeles "
pred=predictions(Text)
get_final_output(pred,unique_intent)

['what', 'flights', 'travel', 'from', 'las', 'vegas', 'to', 'los', 'angeles']
atis_flight has confidence = 0.9436016
atis_abbreviation has confidence = 0.033948895
atis_ground_service has confidence = 0.007946702
atis_airline has confidence = 0.005920276
atis_airfare has confidence = 0.005878211
atis_aircraft has confidence = 0.0023915332
atis_flight_time has confidence = 0.00029803955
atis_quantity has confidence = 1.474762e-05
