In [None]:
import numpy as np
%tensorflow_version 1.x
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

TensorFlow is already loaded. Please restart the runtime to change versions.


In [None]:
import pandas as pd
def load_dataset(filename):
  """Read a two-column CSV of (sentence, intent) rows.

  The column names "Sentence" and "Intent" are supplied explicitly, so the
  file is read without using one of its rows as a header.

  Args:
    filename: Path to the UTF-8 encoded CSV file.

  Returns:
    A tuple ``(intent, unique_intent, sentences)``:
      intent        -- the "Intent" column as a pandas Series,
      unique_intent -- list of distinct intents in order of first appearance,
      sentences     -- the "Sentence" column as a plain list.
  """
  df = pd.read_csv(filename, encoding = "utf-8",
       names = ["Sentence", "Intent"])
  intent = df["Intent"]
  # A set has hash-dependent ordering, which changes between interpreter
  # sessions for strings.  The position in this list is later used to map a
  # model output column back to its label, so the order must be reproducible:
  # dict.fromkeys de-duplicates while keeping first-appearance order.
  unique_intent = list(dict.fromkeys(intent))
  sentences = list(df["Sentence"])

  return (intent, unique_intent, sentences)

In [None]:
# Load the labelled utterances.  NOTE(review): "/content/bank.csv" is an absolute
# path (it looks like a Colab location) -- adjust it when running elsewhere.
intent, unique_intent, sentences = load_dataset("/content/bank.csv")

In [None]:
# Peek at the first five intent labels.
print(intent[:5])

0    2in1_acoount_info
1    2in1_acoount_info
2    2in1_acoount_info
3    2in1_acoount_info
4    2in1_acoount_info
Name: Intent, dtype: object


In [None]:
# Download NLTK's "punkt" package; presumably this is the data word_tokenize needs.
nltk.download("punkt")
def cleaning(sentences):
  """Split every sentence into a list of tokens using ``word_tokenize``.

  Args:
    sentences: Iterable of raw sentence strings.

  Returns:
    A list with one token list per input sentence, in the same order.
  """
  return [list(word_tokenize(sentence)) for sentence in sentences]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Tokenise all sentences, then print both counts and a two-sentence sample
# so the result can be compared with the raw input.
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])  
print(len(sentences))

1567
[['நான்', '2in1', 'கணக்கில்', 'சேமிப்பது', 'எப்படி', '?'], ['2in1', 'கணக்கில்', 'நான்', 'சேமிப்பது', 'எப்படி', '?']]
1567


In [None]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'):
  """Create a Keras ``Tokenizer`` and fit it on ``words``.

  Args:
    words: The texts (or token lists) passed to ``fit_on_texts``.
    filters: Characters handed to the ``Tokenizer`` as its ``filters``
      argument.  The backslash in the default is written as an escaped
      backslash so the literal has no invalid escape sequence; the resulting
      string is the same as before.

  Returns:
    The fitted ``Tokenizer``.
  """
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [None]:
def max_length(words):
  """Return the length of the longest element of ``words``.

  Raises ``ValueError`` when ``words`` is empty.
  """
  return max(map(len, words))

In [None]:
# Fit the input vocabulary on the tokenised sentences.
word_tokenizer = create_tokenizer(cleaned_words)
# One more than the number of known words: the padded sequences use 0 as the
# padding value, so it needs a slot of its own.
vocab_size = len(word_tokenizer.word_index) + 1
# Compute the longest sentence length inline rather than calling the
# `max_length` helper.  Binding the result to the name `max_length` replaces
# that helper with an int, so calling it here would make a second run of this
# cell fail with "TypeError: 'int' object is not callable".
max_length = max(len(words) for words in cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 1141 and Maximum length = 22


In [None]:
def encoding_doc(token, words):
  """Encode ``words`` with the fitted tokenizer ``token``.

  Args:
    token: Object exposing ``texts_to_sequences`` (a fitted Tokenizer).
    words: The texts or token lists to encode.

  Returns:
    Whatever ``token.texts_to_sequences(words)`` returns.
  """
  sequences = token.texts_to_sequences(words)
  return sequences

In [None]:
# Turn every tokenised sentence into a sequence of vocabulary ids.
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [None]:
def padding_doc(encoded_doc, max_length):
  """Pad the encoded sequences to ``max_length`` using post-padding.

  Args:
    encoded_doc: Sequences of integer ids.
    max_length: Target length passed to ``pad_sequences`` as ``maxlen``.

  Returns:
    The result of ``pad_sequences`` with ``padding="post"``.
  """
  padded = pad_sequences(encoded_doc, maxlen = max_length, padding = "post")
  return padded

In [None]:
# Pad the encoded sentences; `max_length` here is the integer computed above,
# not the helper function of the same name.
padded_doc = padding_doc(encoded_doc, max_length)

In [None]:
# Show the first five padded sequences.
padded_doc[:5]

array([[  3,  45,  62, 348,  11,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 45,  62,   3, 348,  11,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 11,   3,  45,  62, 348,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  3,  11,  45,  62, 348,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0],
       [ 45,  62,   3, 587, 287,   2,   1,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [None]:
# Print the shape of the padded input array.
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (1567, 22)


In [None]:
# Tokenizer for the intent labels.  Compared with create_tokenizer's default,
# this filter string leaves out "_" and ".".  The backslash is written escaped
# to avoid an invalid escape sequence; the string value is unchanged.
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\\]^`{|}~')

In [None]:
# Inspect the label -> integer id mapping learned by the label tokenizer.
output_tokenizer.word_index

{'2in1_acoount_info': 11,
 '2in1_atm_card': 33,
 '2in1_interest_receiving': 16,
 '2in1_min_balance': 23,
 '2in1_pass_book': 56,
 'account_currency': 24,
 'bank_statement_online': 46,
 'cancel_card': 4,
 'card_automatic_renewal': 47,
 'card_foreign_use': 5,
 'card_machine_repair': 27,
 'card_usage': 51,
 'change_details': 3,
 'cvv_use': 10,
 'debit_card_requirement': 12,
 'fcaispe_required_docs': 26,
 'foreign_account_lkr_withdrawal': 20,
 'foreign_currency_withdrawal': 1,
 'foreign_currency_withdrawal_currency': 25,
 'foreign_deposit_loan': 35,
 'get_lc_form': 31,
 'get_monthly_report': 13,
 'housing_loan_documents': 32,
 'housing_loan_purpose': 54,
 'interest_credit_info': 38,
 'internet_bank_loan_amount': 42,
 'joint_account_details': 6,
 'life_insurance_limit': 29,
 'loan_requirement': 15,
 'marriage_claim': 30,
 'new_card_reader_cost': 50,
 'new_saving_book': 52,
 'nrfc_account_opening': 53,
 'nrfc_info': 55,
 'precashing_foreign_fixed_deposit': 34,
 'repos_benefits': 36,
 'residen

In [None]:
# Encode every row's intent label with the label tokenizer.
encoded_output = encoding_doc(output_tokenizer, intent)

In [None]:
# Reshape into a single column with one row per sample.
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [None]:
# Display the shape of the label column.
encoded_output.shape

(1567, 1)

In [None]:
def one_hot(encode):
  """One-hot encode ``encode`` and return a dense array.

  Args:
    encode: 2-D array-like passed to ``OneHotEncoder.fit_transform``.

  Returns:
    The dense one-hot encoded array produced by ``fit_transform``.
  """
  try:
    # Newer scikit-learn releases name the dense-output switch `sparse_output`.
    encoder = OneHotEncoder(sparse_output = False)
  except TypeError:
    # Older releases only accept the original `sparse` keyword.
    encoder = OneHotEncoder(sparse = False)
  return encoder.fit_transform(encode)

In [None]:
# One-hot encode the integer intent ids.
output_one_hot = one_hot(encoded_output)

In [None]:
# Display the shape of the one-hot target matrix.
output_one_hot.shape

(1567, 56)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the data for validation.  `stratify` needs the class labels;
# the previous `stratify=y` referenced an undefined name.  argmax over the
# one-hot rows recovers one class id per sample.
rest_X, val_X, rest_Y, val_Y = train_test_split(
    padded_doc, output_one_hot, shuffle = True, test_size = 0.3,
    random_state = 1, stratify = output_one_hot.argmax(axis = 1))
# Carve the test set out of the remaining rows, stratifying on the labels of
# those rows only (the label array must match the data being split).
# NOTE(review): stratified splitting needs at least two samples per class.
train_X, test_X, train_Y, test_Y = train_test_split(
    rest_X, rest_Y, shuffle = True, test_size = 0.3,
    random_state = 1, stratify = rest_Y.argmax(axis = 1))

In [None]:
# Report the size of each split.  The test line previously printed the
# validation shapes by mistake.
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))
print("Shape of test_X = %s and test_Y = %s" % (test_X.shape, test_Y.shape))

Shape of train_X = (1002, 22) and train_Y = (1002, 56)
Shape of val_X = (314, 22) and val_Y = (314, 56)
Shape of test_X = (314, 22) and test_Y = (314, 56)


In [None]:
def create_model(vocab_size, max_length, num_classes = 56, embedding_dim = 300, trainable_embedding = False):
  """Build the (uncompiled) bidirectional-LSTM intent classifier.

  Layers, in order: Embedding -> Bidirectional(LSTM(32)) -> Dense(32, relu)
  -> Dropout(0.5) -> Dense(num_classes, softmax).

  Args:
    vocab_size: Size of the embedding's input vocabulary.
    max_length: Length of each input sequence (``input_length``).
    num_classes: Width of the softmax output layer.  Defaults to 56, the
      value that was previously hard-coded.
    embedding_dim: Size of each embedding vector.  Defaults to 300.
    trainable_embedding: Whether the embedding weights are updated during
      training.  Defaults to False to keep the existing behaviour.
      NOTE(review): no pretrained weights are loaded here, so with False the
      embedding stays at its initial values -- confirm this is intended.

  Returns:
    The assembled ``Sequential`` model.
  """
  model = Sequential()
  model.add(Embedding(vocab_size, embedding_dim, input_length = max_length, trainable = trainable_embedding))
  model.add(Bidirectional(LSTM(32)))
  model.add(Dense(32, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(num_classes, activation = "softmax"))

  return model

In [None]:
# Build the network for the vocabulary size and sequence length computed above.
model = create_model(vocab_size, max_length)

# Compile with categorical cross-entropy (the targets are one-hot rows) and
# track accuracy, then print the layer summary.
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 22, 300)           342300    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 64)                85248     
_________________________________________________________________
dense_9 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 56)                1848      
Total params: 431,476
Trainable params: 89,176
Non-trainable params: 342,300
_________________________________________________________________


In [None]:
# Checkpoint to model.h5 whenever the validation loss reaches a new minimum
# (save_best_only=True, mode='min').
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Train for 100 epochs; the validation split feeds the checkpoint's val_loss.
hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Train on 1002 samples, validate on 314 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 3.97945, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 3.97945 to 3.87727, saving model to model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 3.87727 to 3.77871, saving model to model.h5
Epoch 4/100

Epoch 00004: val_loss improved from 3.77871 to 3.65853, saving model to model.h5
Epoch 5/100

Epoch 00005: val_loss improved from 3.65853 to 3.53526, saving model to model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 3.53526 to 3.36978, saving model to model.h5
Epoch 7/100

Epoch 00007: val_loss improved from 3.36978 to 3.23792, saving model to model.h5
Epoch 8/100

Epoch 00008: val_loss improved from 3.23792 to 3.11514, saving model to model.h5
Epoch 9/100

Epoch 00009: val_loss improved from 3.11514 to 2.93102, saving model to model.h5
Epoch 10/100

Epoch 00010: val_loss improved from 2.93102 to 2.84849, saving model to model.h5
Epoch 11/100

Ep

In [None]:
 model = load_model("model.h5")

In [None]:
def predictions(text):
  """Return the model's class probabilities for one raw sentence.

  The sentence is tokenised with ``word_tokenize``, encoded with the
  module-level ``word_tokenizer``, padded with ``padding_doc`` to the
  module-level ``max_length`` and passed to the module-level ``model``.
  The token list is printed as a side effect.

  Args:
    text: The raw sentence to classify.

  Returns:
    The array returned by ``model.predict`` for the single padded sentence.
  """
  test_word = word_tokenize(text)
  print(test_word)

  # Encode the sentence as ONE list of tokens, the same form the training
  # data had (lists of tokens from `cleaning`).  Passing the tokens as
  # separate strings, as before, ran each through the tokenizer's character
  # filters, so tokens such as "?" were dropped at inference although they
  # have an id in the training sequences.  Unknown words are still skipped
  # by texts_to_sequences, so no extra filtering of empty entries is needed.
  test_ls = word_tokenizer.texts_to_sequences([test_word])

  x = padding_doc(test_ls, max_length)

  # The softmax output of predict() already is the probability vector;
  # predict_proba is not available on newer Keras models.
  pred = model.predict(x)

  return pred

In [None]:
def get_final_output(pred, classes):
  """Print every class with its confidence, highest confidence first.

  Args:
    pred: 2-D array of scores; only the first row is reported.
    classes: Sequence of class names, indexed like the columns of ``pred``.

  Returns:
    None.  One line per class is written to stdout.
  """
  scores = pred[0]
  n_classes = pred.shape[1]

  # Negating the scores makes the ascending argsort yield a descending ranking.
  order = np.argsort(-scores)
  ranked_labels = np.array(classes)[order]
  ranked_scores = scores[order]

  for rank in range(n_classes):
    print("%s has confidence = %s" % (ranked_labels[rank], ranked_scores[rank]))



In [None]:
# Classify one example sentence and print all classes ranked by confidence.
# The label order in unique_intent is used to name the model's output columns.
text = "எனது கடன்அட்டையை நான் எவ்வகையில் ரத்து செய்வது?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['எனது', 'கடன்அட்டையை', 'நான்', 'எவ்வகையில்', 'ரத்து', 'செய்வது', '?']
cancel_card has confidence = 0.41703138
change_details has confidence = 0.28217405
update_personal_details has confidence = 0.238206
treasury_bond_information has confidence = 0.020806456
SLBFE_info has confidence = 0.010435545
2in1_acoount_info has confidence = 0.006394624
NRFC_info has confidence = 0.0053531164
treasury_bond_important_features has confidence = 0.00503632
new_saving_book has confidence = 0.0022377733
2in1_atm_card has confidence = 0.002153035
2in1_interest_receiving has confidence = 0.0015600394
resident_foreign_account_info has confidence = 0.0012103756
treasury_bond_advantage has confidence = 0.0011654552
debit_card_requirement has confidence = 0.00096731103
marriage_claim has confidence = 0.0009551597
card_usage has confidence = 0.0009390282
suspious_activity has confidence = 0.00088435656
bank_statement_online has confidence = 0.0006678072
interest_credit_info has confidence = 0.00027162704
car

In [None]:

# Evaluate on the held-out test split as a single batch; the printed values
# line up with the names listed by model.metrics_names below.
print(model.test_on_batch(test_X ,test_Y))
model.metrics_names

[0.4184269, 0.89641434]


['loss', 'acc']