In [None]:
!pip install --upgrade gensim
from gensim.models.fasttext import FastText
import numpy as np
%tensorflow_version 1.x
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

Requirement already up-to-date: gensim in /usr/local/lib/python3.6/dist-packages (3.8.3)


In [None]:
import pandas as pd
# Intent labels the classifier is trained on; rows carrying any other label
# are dropped by load_dataset().
intents = [
  'update_personal_details',
  'repos_benefits',
  'get_lc_form',
  'sl_development_bond_benefits',
  'life_insurance_limit',
  'marriage_claim',
  'account_currency',
  'foreign_account_lkr_withdrawal',
  'joint_account_details',
  'SLBFE_info',
  'selan_sure_info',
  'FCAISPE_required_docs',
  'resident_foreign_account_info',
]


def load_dataset(filename):
  """Read a two-column (sentence, intent) CSV and keep rows with a known intent.

  The file is read without a header row: the first column is named
  "Sentence" and the second "Intent". Rows whose intent is not listed in the
  module-level ``intents`` are discarded.

  Args:
    filename: path of the UTF-8 encoded CSV file.

  Returns:
    A tuple ``(intent, unique_intent, sentences)`` where ``intent`` is the
    pandas Series of intent labels of the kept rows, ``unique_intent`` is the
    list of distinct labels in sorted order, and ``sentences`` is the list of
    sentence strings of the kept rows.
  """
  df = pd.read_csv(filename, encoding="utf-8", names=["Sentence", "Intent"])
  df = df.loc[df["Intent"].isin(intents)]
  intent = df["Intent"]
  # sorted() instead of list(set(...)): iteration order of a set of strings
  # changes between interpreter runs, so the label order (and every index
  # derived from it later in the notebook) was not reproducible.
  unique_intent = sorted(set(intent))
  sentences = list(df["Sentence"])

  return (intent, unique_intent, sentences)

In [None]:
# Load the labelled sentences; the path is kept in a named variable so it is
# easy to spot and change.
dataset_path = "/content/bank.csv"
intent, unique_intent, sentences = load_dataset(dataset_path)

In [None]:
# Download NLTK's "punkt" package (presumably the data word_tokenize needs
# in the cleaning() helper below -- confirm).
nltk.download("punkt")
def cleaning(sentences):
  """Split every sentence into a list of word tokens.

  Despite the name, nothing is filtered or normalised here: each sentence is
  passed to ``word_tokenize`` and all resulting tokens are kept unchanged.

  Args:
    sentences: iterable of sentence strings.

  Returns:
    A list containing one token list per input sentence, in input order.
  """
  return [list(word_tokenize(sentence)) for sentence in sentences]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Tokenize all sentences, then print the number of token lists, the first two
# token lists, and the number of raw sentences (the two counts should match).
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])  
print(len(sentences))

345
[['செலான்', 'வங்கியில்', 'கணக்கை', 'ஆரம்பிக்க', 'பயன்படுத்தக்கூடிய', 'நாணயங்கள்', 'எவை', '?'], ['செலான்', 'வங்கியில்', 'கணக்கை', 'ஆரம்பிக்க', 'பயன்படுத்தக்கூடிய', 'நாணயங்கள்', '.']]
345


In [None]:
def create_tokenizer(words, filters = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  """Create a Keras ``Tokenizer`` and fit it on ``words``.

  Args:
    words: the texts (or token lists) handed to ``Tokenizer.fit_on_texts``.
    filters: string of characters passed as the tokenizer's ``filters``
      argument. The default is written as a raw string so that the ``\\]``
      pair is a literal backslash followed by ``]`` without relying on an
      invalid escape sequence; the value is the same as before.

  Returns:
    The fitted ``Tokenizer`` instance.
  """
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [None]:
def max_length(words):
  """Return the length of the longest item in ``words``.

  Args:
    words: iterable of sized items (for example token lists).

  Returns:
    The largest ``len(item)`` found, or 0 when ``words`` is empty (an empty
    input previously raised ValueError from ``max()``).
  """
  return max((len(item) for item in words), default=0)

In [None]:
# Fit the word-level tokenizer on the tokenized sentences.
word_tokenizer = create_tokenizer(cleaned_words)
# One more than the number of distinct words (presumably because tokenizer
# indices start at 1 and index 0 is left for padding -- confirm).
vocab_size = len(word_tokenizer.word_index) + 1
# Longest sentence in tokens. Computed inline rather than with
# `max_length = max_length(cleaned_words)`: that statement replaced the helper
# function with an int, so running this cell a second time failed with
# "'int' object is not callable". The variable name is kept because later
# cells read `max_length` as the padded sequence length.
max_length = max(len(tokens) for tokens in cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 326 and Maximum length = 14


In [None]:
# Show the token list of the first sentence.
print(cleaned_words[0])

['செலான்', 'வங்கியில்', 'கணக்கை', 'ஆரம்பிக்க', 'பயன்படுத்தக்கூடிய', 'நாணயங்கள்', 'எவை', '?']


In [None]:
# Hyper-parameters handed to the FastText constructor in the next cell.
embedding_size = 300  # vector dimensionality (passed as `size`)
window_size = 40  # context window (passed as `window`); larger than the longest sentence (14 tokens) reported above
min_word = 5  # passed as `min_count`
down_sampling = 1e-2  # passed as `sample`

In [None]:
# Train FastText vectors directly on the tokenized sentences of this dataset.
# The keyword names `size` and `iter` are the ones accepted by the gensim
# version reported in the pip output at the top of the notebook (3.8.3).
ft_model = FastText(cleaned_words,
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=1,
                      iter=100)

In [None]:
# Spot-check: print the trained vector for the token '2in1', then display the
# tokenizer's full word -> index mapping as the cell output.
print(ft_model.wv['2in1'])
word_tokenizer.word_index.items()

In [None]:
# Alias for the tokenizer's word -> integer index mapping (used when building
# the embedding matrix).
word_index =word_tokenizer.word_index

In [None]:
# Embedding matrix: row i holds the FastText vector of the word whose
# tokenizer index is i. Rows that are never assigned stay all-zero.

print('preparing embedding matrix...')
words_not_found = []
# Derived from the tokenizer instead of a hard-coded 326, so the matrix always
# has one row per word index (plus row 0) even when the dataset changes.
nb_words = len(word_index) + 1

embedding_matrix = np.zeros((nb_words, embedding_size))
for word, i in word_index.items():
    if i >= nb_words:
        # Defensive only: cannot trigger while nb_words is derived from word_index.
        continue
    try:
        embedding_vector = ft_model.wv[word]
    except KeyError:
        # NOTE(review): some gensim releases raise KeyError when no vector can
        # be composed for a word; treat that as "not found" instead of
        # aborting the whole cell.
        embedding_vector = None
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector
    else:
        # The row for this word is left as all zeros.
        words_not_found.append(word)
# Count rows whose every component is zero. A row *sum* of zero was used
# before, which would also match a non-zero vector whose components cancel.
print('number of null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

In [None]:
# List the words for which no embedding vector was stored.
print("sample words not found: ", words_not_found)

In [None]:
def encoding_doc(token, words):
  """Convert ``words`` to integer sequences using a fitted tokenizer.

  Args:
    token: object exposing ``texts_to_sequences`` (a fitted tokenizer).
    words: the texts / token lists to encode.

  Returns:
    Whatever ``token.texts_to_sequences(words)`` returns.
  """
  sequences = token.texts_to_sequences(words)
  return sequences

In [None]:
# Tokenizer for the intent labels. Compared with create_tokenizer's default,
# '.' and '_' are left out of the filter set, so labels such as
# 'marriage_claim' are not split at the underscore. The filter is a raw
# string so that `\]` is a literal backslash + ']' rather than an invalid
# escape sequence (same value as before, no warning).
output_tokenizer = create_tokenizer(unique_intent, filters = r'!"#$%&()*+,-/:;<=>?@[\]^`{|}~')
encoded_output = encoding_doc(output_tokenizer, intent)
# One integer label per row, as a column vector of shape (n_samples, 1).
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)
encoded_output.shape


In [None]:
def one_hot(encode):
  """One-hot encode a column of integer labels.

  Args:
    encode: 2-D array-like of labels, as accepted by
      ``OneHotEncoder.fit_transform``.

  Returns:
    The dense (non-sparse) one-hot matrix produced by a freshly fitted
    ``OneHotEncoder``.
  """
  encoder = OneHotEncoder(sparse = False)
  encoded = encoder.fit_transform(encode)
  return encoded

In [None]:
# One-hot matrix of the intent labels (one row per sentence).
output_one_hot = one_hot(encoded_output)

In [None]:
# Integer-encode every tokenized sentence with the word-level tokenizer.
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [None]:
def padding_doc(encoded_doc, max_length):
  """Pad integer sequences to a common length.

  Args:
    encoded_doc: list of integer sequences.
    max_length: target length, forwarded as ``maxlen``.

  Returns:
    The result of ``pad_sequences`` with padding applied at the end of each
    sequence (``padding="post"``).
  """
  padded = pad_sequences(encoded_doc, maxlen = max_length, padding = "post")
  return padded

In [None]:
# Pad every encoded sentence to the length of the longest sentence.
padded_doc = padding_doc(encoded_doc, max_length)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for validation. random_state is fixed so the
# split -- and therefore every validation figure reported below -- is the same
# after a kernel restart.
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2, random_state = 42)
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

In [None]:
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras import regularizers
from keras import optimizers

# Training configuration.
batch_size = 256
num_epochs = 8

# Model parameters.
num_filters = 64
weight_decay = 1e-4

# Number of output units = number of one-hot label columns. Derived from the
# data instead of a hard-coded 13 so the output layer always matches the
# targets.
num_classes = train_Y.shape[1]
print("training CNN ...")
model = Sequential()
# Frozen embedding layer initialised with the FastText vectors.
model.add(Embedding(nb_words, embedding_size, weights=[embedding_matrix], input_length=max_length, trainable=False))
# NOTE(review): the convolution and hidden dense layers use 'linear'
# activations, i.e. no non-linearity apart from the max-pooling -- confirm
# this is intended (e.g. 'relu' is the usual choice).
model.add(Conv1D(num_filters, 7, activation='linear', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='linear', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='linear', kernel_regularizer=regularizers.l2(weight_decay)))
# Each sentence carries exactly one intent (the targets are one-hot rows), so
# this is single-label multi-class classification: softmax output with
# categorical cross-entropy. The previous sigmoid + binary_crossentropy
# pairing treats every class as an independent yes/no decision, and with it
# the 'accuracy' metric is computed per output unit, which overstates how
# often the predicted intent is right.
model.add(Dense(num_classes, activation='softmax'))

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()


In [None]:
from keras.callbacks import EarlyStopping
#define callbacks
# Stop training when val_loss has not improved by at least 0.01 for 4
# consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

In [None]:
# Run 1: train on the original training split; 10% of it is set aside by
# Keras as the validation data monitored by early stopping.
hist = model.fit(train_X, train_Y, batch_size=batch_size, epochs=num_epochs, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)

In [None]:
# Evaluate on the held-out split in a single batch; the printed values are in
# the order given by model.metrics_names (shown as the cell output).
print(model.test_on_batch(val_X,val_Y))
model.metrics_names

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Shapes of the training features and one-hot targets before oversampling.
train_X.shape,train_Y.shape


In [None]:
# Oversample the training split with SMOTE. The strategy is passed by keyword
# (it was positional before) and `fit_resample` replaces the deprecated
# `fit_sample` alias.
# NOTE(review): train_X holds padded integer token ids; synthetic samples
# built from neighbouring rows may not correspond to real token sequences --
# confirm that oversampling in this feature space is intended.
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(train_X, train_Y)

In [None]:
# Shapes of the features and targets after oversampling.
print(X_sm.shape,y_sm.shape)

In [None]:
# Run 2: fit on the oversampled data.
# NOTE(review): `model` is the same instance that was already fitted above, so
# this continues training from those weights rather than starting fresh --
# the runs are not independent comparisons.
hist = model.fit(X_sm, y_sm, batch_size=batch_size, epochs=num_epochs, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)

In [None]:
# Evaluate on the held-out split after the oversampled run; values follow the
# order of model.metrics_names.
print(model.test_on_batch(val_X,val_Y))
model.metrics_names

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Integer class index of every training row (train_Y is one-hot, so argmax
# recovers the column of the 1).
y_train_labels = np.argmax(train_Y, axis =1)
train_classes = np.unique(y_train_labels)
# Arguments are passed by keyword; the second and third were positional before.
balanced_weights = compute_class_weight('balanced', classes=train_classes, y=y_train_labels)
# Keras documents `class_weight` as a dict mapping class index -> weight, so
# the array returned by scikit-learn is converted explicitly. Pairing each
# weight with its class from `train_classes` keeps the mapping correct even
# if a class is absent from the training split. The name `class_weight` is
# kept because the fit cells below pass it as `class_weight=class_weight`
# (previously this name first held the imported module, then an ndarray).
class_weight = {int(cls): float(weight) for cls, weight in zip(train_classes, balanced_weights)}

In [None]:
# Run 3: oversampled data plus class weights.
# NOTE(review): the class weights were computed from train_Y (the original
# distribution) but are applied here to the SMOTE-resampled data; `model` also
# keeps the weights learned in the earlier runs.
hist = model.fit(X_sm, y_sm, batch_size=batch_size, epochs=num_epochs, class_weight=class_weight, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)

In [None]:
# Evaluate on the held-out split after run 3; values follow the order of
# model.metrics_names.
print(model.test_on_batch(val_X,val_Y))
model.metrics_names

In [None]:
# Run 4: original training split with class weights (again continuing from
# the weights `model` already has from the previous runs).
hist = model.fit(train_X, train_Y, batch_size=batch_size, epochs=num_epochs, class_weight=class_weight, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)

In [None]:
# Evaluate on the held-out split after run 4; values follow the order of
# model.metrics_names.
print(model.test_on_batch(val_X,val_Y))
model.metrics_names

In [None]:
# Final evaluation of the model on the held-out split; the result is shown as
# the cell output.
model.evaluate(val_X,val_Y)