# Intent Classification using Keras

In [1]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load the
# 'punkt' package, while NLTK 3.9+ loads 'punkt_tab' instead, so fetch both to
# keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to C:\Users\Leonardo
[nltk_data]     W\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Data Processing - Main
import numpy as np
import pandas as pd
import re

# Data Processing - Tokenizer and Encoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk import word_tokenize
from sklearn.preprocessing import OneHotEncoder

# Data Processing - Splitting
from sklearn.model_selection import train_test_split

# Visualization
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

# Machine Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, LSTM, Dropout
from tensorflow.compat.v1.keras.layers import CuDNNLSTM

# Misc.
from datetime import datetime

In [3]:
# Check GPU availability: list the GPU devices visible to TensorFlow
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# Show the NVIDIA driver / GPU status report
!nvidia-smi

Mon Nov 02 18:51:21 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 452.06       Driver Version: 452.06       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 166... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   44C    P8     7W /  N/A |    153MiB /  6144MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|       

In [5]:
RANDOM_SEED = 13517048
# Seed every random number generator the notebook relies on, not just NumPy:
# NumPy's global state is what train_test_split falls back to, while
# TensorFlow has its own generator for weight initialisation and dropout.
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

## Data Cleaning

In [6]:
# Load the labelled utterances; the cells below use the 'text' and 'intent' columns
data = pd.read_csv("nlu.csv")
data.head(10)

Unnamed: 0,text,intent
0,mau absen 11-14 februari 2020,absence
1,mau absen 11 - 14 februari 2020,absence
2,sick leave besok,absence
3,aku mau absence besok,absence
4,change working hours besok,absence
5,change working hour besok,absence
6,absen besok,absence
7,change working hours,absence
8,sick leave,absence
9,mau absen tanggal 10-11 januari 2012,absence


In [7]:
# Number of examples per intent label
data['intent'].value_counts()

approval_status                           80
approval                                  57
leave_entry                               54
approval_ask_parameter                    52
absence                                   46
                                          ..
approval_ask_confirmation                  2
reject_notification_ask_reason             2
approve_overtime_notification_ask_paid     2
leave_entry_half_day_ask_start_time        1
default_fallback_intent                    1
Name: intent, Length: 103, dtype: int64

### USE ONLY SOME DATA INTENTS

<b>Intents to use:</b>
- absence
- thank_you
- cancel
- help
- default_fallback_intent

In [8]:
# Keep only the rows whose intent is one of the selected classes
selected_intents = ['absence', 'thank_you', 'cancel', 'help', 'default_fallback_intent']
is_selected = data['intent'].isin(selected_intents)
data = data[is_selected]
data.head()

Unnamed: 0,text,intent
0,mau absen 11-14 februari 2020,absence
1,mau absen 11 - 14 februari 2020,absence
2,sick leave besok,absence
3,aku mau absence besok,absence
4,change working hours besok,absence


In [9]:
# Number of examples per intent label in the current `data` frame
data['intent'].value_counts()

absence                    46
cancel                     33
thank_you                  22
help                       20
default_fallback_intent     1
Name: intent, dtype: int64

In [10]:
# Clean data, per sentence:
#   1. Replace every character that is not a letter, digit or space with a space
#   2. Split the result into word tokens
#   3. Lowercase each token
def clean_data(text_data):
  """Return one list of lowercase word tokens for each sentence in ``text_data``.

  Args:
    text_data: Iterable of raw sentence strings.

  Returns:
    A list with one entry per sentence, each entry being the list of that
    sentence's lowercase tokens.
  """
  return [
    [token.lower() for token in word_tokenize(re.sub(r'[^ a-z A-Z 0-9]', " ", sentence))]
    for sentence in text_data
  ]

In [11]:
# Length of the longest sequence (e.g. token list) in `words`
def get_max_length(words):
  """Return the number of items in the longest element of ``words``.

  Args:
    words: Iterable of sized sequences (token lists, index lists, ...).

  Returns:
    The largest ``len()`` found among the elements, or 0 when ``words`` is
    empty (instead of failing with an opaque ``max()`` error).
  """
  return max((len(sequence) for sequence in words), default=0)

In [12]:
# Punctuation stripped by the tokenizer. Written as a raw string so the
# backslash is a literal character rather than the invalid escape sequence
# `\]`, which triggers a warning on current Python versions.
filters = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'

def create_tokenizer(words):
  """Build a Keras ``Tokenizer`` and fit its vocabulary on ``words``.

  Args:
    words: The texts (or token lists) to fit the vocabulary on.

  Returns:
    The fitted ``Tokenizer`` instance.
  """
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [13]:
def create_train_data(text_data):
  """Turn tokenised documents into a post-padded matrix of word indices.

  A tokenizer is fitted on ``text_data`` itself, every document is converted
  to its index sequence, and all sequences are padded at the end up to the
  length of the longest one.
  """
  fitted_tokenizer = create_tokenizer(text_data)
  encoded_docs = fitted_tokenizer.texts_to_sequences(text_data)

  # Pad everything up to the longest encoded document
  longest = get_max_length(encoded_docs)
  padded_docs = pad_sequences(encoded_docs, maxlen=longest, padding="post")
  return padded_docs

In [14]:
def onehot_encode(data):
  """One-hot encode a 2-D column of categorical values.

  Args:
    data: Array-like of shape (n_samples, n_features) holding the categories.

  Returns:
    A dense array with one indicator column per category.
  """
  # The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn
  # 1.2 and removed in 1.4. Using the default (sparse) output and densifying
  # it works on both old and new releases.
  encoder = OneHotEncoder()
  return encoder.fit_transform(data).toarray()

In [15]:
# One list of lowercase word tokens per utterance
cleaned_data = clean_data(data['text'])

In [16]:
# Fit the vocabulary on the cleaned utterances
tokenizer = create_tokenizer(cleaned_data)
# Number of tokens in the longest utterance
MAX_LENGTH = get_max_length(cleaned_data)
# +1: presumably because Keras word indices start at 1, leaving index 0 for padding — confirm
VOCAB_SIZE = len(tokenizer.word_index) + 1

In [17]:
# Model inputs: padded integer sequences, one row per utterance
train_data = create_train_data(cleaned_data)
# One-hot intent labels. Despite the name this is NOT a held-out test set;
# the name is kept because later cells reference it.
test_data = onehot_encode(data['intent'].values.reshape(-1, 1))

# Pin random_state so re-running this cell always yields the same 80/20 split
# instead of depending on how much of NumPy's global RNG was consumed before it.
x_train, x_val, y_train, y_val = train_test_split(
  train_data, test_data, test_size = 0.2, random_state = RANDOM_SEED
)

## Machine Learning Modeling

In [31]:
# Embedding -> bidirectional LSTM -> two dense layers with dropout -> softmax
# with one output unit per one-hot label column.
# NOTE(review): the embedding is created with trainable=False but no
# pre-trained weights are loaded anywhere in this notebook, so it appears to
# stay at its random initialisation — confirm this is intended.
model = Sequential([
  Embedding(VOCAB_SIZE, 512, input_length=MAX_LENGTH, trainable=False),
  Bidirectional(LSTM(512)),
  Dense(64, activation = "relu"),
  Dropout(0.3),
  Dense(32, activation = "relu"),
  Dropout(0.3),
  Dense(y_train.shape[1], activation = "softmax"),
])

In [32]:
# Multi-class setup: categorical cross-entropy against the one-hot labels,
# optimised with Adam and tracked with accuracy
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy']
)

# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 512)           61440     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1024)              4198400   
_________________________________________________________________
dense_3 (Dense)              (None, 64)                65600     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 5)                

### Model Train

In [33]:
# Fit constants
# One entry per training phase: EPOCHS[i] is paired with BASIZE[i] by the
# zip() in the training loop, i.e. 100 epochs at batch size 32, then 10 epochs
# at batch size 8.

EPOCHS = [100, 10]
BASIZE = [32, 8]

In [None]:
# Train the same model in consecutive phases (one per EPOCHS/BASIZE pair) and
# stitch the per-phase histories into one continuous curve. Plotting each
# phase separately on the same axes restarts every phase at x=0 and yields
# four overlapping lines for a two-entry legend.
accuracy_curve = []
val_accuracy_curve = []
for ep, bs in zip(EPOCHS, BASIZE):
  history = model.fit(x_train, y_train, epochs=ep, batch_size=bs, validation_data=(x_val, y_val))
  accuracy_curve.extend(history.history['accuracy'])
  val_accuracy_curve.extend(history.history['val_accuracy'])

plt.plot(accuracy_curve)
plt.plot(val_accuracy_curve)
plt.xlabel('epoch (all phases)')
plt.ylabel('accuracy')
plt.legend(['accuracy', 'val_accuracy'])
plt.show()

Epoch 1/100


### Model Predict

In [22]:
def padding_doc(encoded_doc, max_length):
  """Return ``encoded_doc`` passed through ``pad_sequences`` with ``maxlen=max_length`` and post-padding."""
  padded = pad_sequences(encoded_doc, maxlen = max_length, padding = "post")
  return padded

In [23]:
# Map each intent name to the position of the 1 in its one-hot label row.
# setdefault keeps the first occurrence, so every intent is recorded once.
intent_map = {}
for intent, onehot_intent in zip(data['intent'].values, test_data):
  hot_index = list(onehot_intent).index(1)
  intent_map.setdefault(intent, hot_index)

In [24]:
# Display the intent -> one-hot column index mapping
intent_map

{'absence': 0,
 'cancel': 1,
 'help': 3,
 'default_fallback_intent': 2,
 'thank_you': 4}

In [25]:
def predict(text):
  """Classify ``text`` and return every intent ranked by predicted probability.

  The text is cleaned with ``clean_data`` (the same routine used on the
  training data), encoded with the fitted ``tokenizer``, padded to
  ``MAX_LENGTH`` and passed to ``model``. The ranking is printed and returned.

  Args:
    text: A single raw utterance (string).

  Returns:
    List of ``(intent, probability)`` tuples sorted from most to least likely.
  """
  # Reuse the training-time cleaning so inference cannot drift away from it
  words = clean_data([text])[0]

  # Encode the whole utterance as ONE sequence, exactly like the training
  # documents. The tokenizer is built without an OOV token, so words it has
  # never seen are dropped by texts_to_sequences; no manual filtering or
  # reshaping is needed.
  encoded = tokenizer.texts_to_sequences([words])
  x = padding_doc(encoded, MAX_LENGTH)
  prediction_result = model.predict(x)[0]

  # Look up each intent's probability at its one-hot column index
  intent_ranking = {
    each_intent: prediction_result[column]
    for each_intent, column in intent_map.items()
  }

  # Most probable intent first
  sorted_intent_ranking = sorted(intent_ranking.items(), key=lambda kv: kv[1], reverse=True)
  print(dict(sorted_intent_ranking))
  print("Intent Ranking:")
  for each_intent in sorted_intent_ranking:
    print(each_intent[0] + "= " + str(each_intent[1]) + " confidence")

  return sorted_intent_ranking

In [26]:
# Try the classifier on a sample utterance
predict("gajadi deh, mau makan")

{'cancel': 0.9450395, 'absence': 0.054714642, 'thank_you': 0.00015589596, 'help': 8.0344544e-05, 'default_fallback_intent': 9.680959e-06}
Intent Ranking:
cancel= 0.9450395 confidence
absence= 0.054714642 confidence
thank_you= 0.00015589596 confidence
help= 8.0344544e-05 confidence
default_fallback_intent= 9.680959e-06 confidence


[('cancel', 0.9450395),
 ('absence', 0.054714642),
 ('thank_you', 0.00015589596),
 ('help', 8.0344544e-05),
 ('default_fallback_intent', 9.680959e-06)]

## Save ML Model

The saved model is meant to be loaded by the Django intent app.

In [27]:
# Write the trained model to disk under saved_model/
model.save("saved_model/intent_model_best")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: saved_model/intent_model_best\assets
