In [231]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import string
import re
import joblib
import json
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [232]:
def load_doc(jsonFile):
    with open(jsonFile) as file:
        Json_data = json.loads(file.read())
    return Json_data

In [233]:
data = load_doc('data/responses.json')

In [234]:
def frame_data(feat_1,feat_2,is_pattern):
    is_pattern = is_pattern
    df = pd.DataFrame(columns=[feat_1,feat_2])
    for intent in data['intents']:
        if is_pattern:
            for pattern in intent['patterns']:
                w = pattern
                df_to_append = pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(df_to_append,ignore_index=True)
        else:
            for response in intent['responses']:
                w = response
                df_to_append = pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(df_to_append,ignore_index=True)
    return df

In [235]:
# users intents
df1 = frame_data('questions','labels',True)
df1

Unnamed: 0,questions,labels
0,Hi,greeting
1,How are you,greeting
2,Is anyone there?,greeting
3,Hello,greeting
4,Good day,greeting
5,Bye,goodbye
6,See you later,goodbye
7,Goodbye,goodbye


In [236]:
df1.labels.value_counts(sort=False)

goodbye     3
greeting    5
Name: labels, dtype: int64

In [237]:
# Bot response
df2 = frame_data('response','labels',False)
df2.head()

Unnamed: 0,response,labels
0,"Hello, thanks for visiting",greeting
1,Good to see you again,greeting
2,"Hi there, how can I help?",greeting


In [238]:
lemmatizer = WordNetLemmatizer()

vocab = Counter()
labels = []
def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

In [239]:
def remove_stop_words(tokenizer,df,feature):
    doc_without_stopwords = []
    for entry in df[feature]:
        tokens = tokenizer(entry)
        joblib.dump(tokens,'tokens/tokens.pkl')
        doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return

In [240]:
def create_vocab(tokenizer,df,feature):
    for entry in df[feature]:
        tokens = tokenizer(entry)
        vocab.update(tokens)
    joblib.dump(vocab,'tokens/vocab.pkl')
    return

In [241]:
create_vocab(tokenizer,df1,'questions')
remove_stop_words(tokenizer,df1,'questions')

In [242]:
print(vocab.most_common(20))

[('you', 2), ('hi', 1), ('how', 1), ('are', 1), ('is', 1), ('anyone', 1), ('there', 1), ('hello', 1), ('good', 1), ('day', 1), ('bye', 1), ('see', 1), ('later', 1), ('goodbye', 1)]


In [243]:
vocab_size = len(vocab)
vocab_size

14

In [244]:
df1

Unnamed: 0,questions,labels
0,hi,greeting
1,how are you,greeting
2,is anyone there,greeting
3,hello,greeting
4,good day,greeting
5,bye,goodbye
6,see you later,goodbye
7,goodbye,goodbye


In [245]:
test_list = list(df1.groupby(by='labels',as_index=False).first()['questions'])
test_list

['bye', 'hi']

In [246]:
test_index = []
for i,_ in enumerate(test_list):
    idx = df1[df1.questions == test_list[i]].index[0]
    test_index.append(idx)
test_index

[5, 0]

In [247]:
train_index = [i for i in df1.index if i not in test_index]

In [248]:
' '.join(list(vocab.keys()))

'hi how are you is anyone there hello good day bye see later goodbye'

In [249]:
def encoder(df,feature):
#     text = ' '.join(list(vocab.keys()))
    t = Tokenizer()
    entries = [entry for entry in df[feature]]
    t.fit_on_texts(entries)
    joblib.dump(t,'tokens/tokenizer_t.pkl')
    vocab_size = len(t.word_index) + 1
    entries = [entry for entry in df[feature]]
    max_length = max([len(s.split()) for s in entries])
    encoded = t.texts_to_sequences(entries)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded, vocab_size

In [250]:
X,vocab_size = encoder(df1,'questions')

In [251]:
df_encoded = pd.DataFrame(X)

In [252]:
df_encoded['labels'] = df1.labels
df_encoded.head()

Unnamed: 0,0,1,2,labels
0,2,0,0,greeting
1,3,4,1,greeting
2,5,6,7,greeting
3,8,0,0,greeting
4,9,10,0,greeting


In [253]:
for i in range(0,2):
    dt = [0]*16
    dt.append('confused')
    dt = [dt]
    pd.DataFrame(dt).rename(columns = {16:'labels'})
    df_encoded = df_encoded.append(pd.DataFrame(dt).rename(columns = {16:'labels'}),ignore_index=True)

In [254]:
df_encoded.tail()

Unnamed: 0,0,1,2,labels,3,4,5,6,7,8,9,10,11,12,13,14,15
5,11,0,0,goodbye,,,,,,,,,,,,,
6,12,1,13,goodbye,,,,,,,,,,,,,
7,14,0,0,goodbye,,,,,,,,,,,,,
8,0,0,0,confused,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0,confused,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [255]:
train_index.append(87)

In [256]:
test_index.append(88)

In [257]:
from sklearn.preprocessing import LabelEncoder
lable_enc = LabelEncoder()

In [258]:
labl = lable_enc.fit_transform(df_encoded.labels)
labl

array([2, 2, 2, 2, 2, 1, 1, 1, 0, 0])

In [259]:
mapper = {}
for index,key in enumerate(df_encoded.labels):
    if key not in mapper.keys():
        mapper[key] = labl[index]
mapper

{'greeting': 2, 'goodbye': 1, 'confused': 0}

In [260]:
df2.head()

Unnamed: 0,response,labels
0,"Hello, thanks for visiting",greeting
1,Good to see you again,greeting
2,"Hi there, how can I help?",greeting


In [261]:
df2.labels = df2.labels.map(mapper).astype({'labels': 'int32'})
df2.head()

Unnamed: 0,response,labels
0,"Hello, thanks for visiting",2
1,Good to see you again,2
2,"Hi there, how can I help?",2


In [262]:
df2.to_csv('response.csv',index=False)

In [263]:
train = df_encoded.loc[train_index]
test = df_encoded.loc[test_index]

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Int64Index([87], dtype='int64'). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

In [None]:
X_train = train.drop(columns=['labels'],axis=1)
y_train = train.labels
X_test = test.drop(columns=['labels'],axis=1)
y_test = test.labels

In [None]:
y_train =pd.get_dummies(y_train).values
y_test =pd.get_dummies(y_test).values

In [None]:
y_train[0].shape,y_test[0].shape

In [None]:
X_train.shape

In [None]:
max_length = X_train.shape[1]
# output = len(df3.labels.unique())
output = 17

In [None]:
max_length

In [None]:
early_stopping = EarlyStopping(monitor='val_loss',patience=10)
checkpoint = ModelCheckpoint("model-v1.h5",
                             monitor="val_loss",
                             mode="min",
                             save_best_only = True,
                             verbose=1)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 3, verbose = 1, min_delta = 0.0001)
callbacks = [early_stopping,checkpoint,reduce_lr]



In [None]:

def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size,300, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
    model.add(MaxPooling1D(pool_size=8))
    model.add(Flatten())
#     model.add(Dense(32, activation='relu'))
    model.add(Dense(17, activation='softmax'))


    # compile network
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.compile(loss = 'categorical_crossentropy',
              # optimizer = Adam(lr=0.001),
              optimizer = 'adam',
              metrics = ['accuracy'])

    # summarize defined model
    model.summary()
#     plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [None]:
# define model
model = define_model(vocab_size, max_length)

In [None]:
history = model.fit(X_train, y_train, epochs=500, verbose=1,validation_data=(X_test,y_test),callbacks=callbacks)

In [None]:
acc = history.history['acc']
val_acc = history.history['val_acc']
loss=history.history['loss']
val_loss=history.history['val_loss']

plt.figure(figsize=(16,8))
plt.subplot(1, 2, 1)
plt.plot(acc, label='Training Accuracy')
plt.plot(val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

In [None]:
[np.argmax(i) for i in model.predict(X_test)][:10]

In [None]:
[np.argmax(i) for i in y_test][:10]

In [None]:
def get_text():
    input_text  = ['what are you']
    df_input = pd.DataFrame(input_text,columns=['questions'])
    df_input
    return df_input

In [None]:
#load artifacts
from tensorflow.keras.models import load_model
model = load_model('model-v1.h5')
tokenizer_t = joblib.load('tokenizer_t.pkl')
vocab = joblib.load('vocab.pkl')

In [None]:
def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

In [None]:
def remove_stop_words_for_input(tokenizer,df,feature):
    doc_without_stopwords = []
    entry = df[feature][0]
    tokens = tokenizer(entry)
    doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return df

In [None]:
def encode_input_text(tokenizer_t,df,feature):
    t = tokenizer_t
    entry = entry = [df[feature][0]]
    encoded = t.texts_to_sequences(entry)
    padded = pad_sequences(encoded, maxlen=16, padding='post')
    return padded

In [None]:
def get_pred(model,encoded_input):
    pred = np.argmax(model.predict(encoded_input))
    return pred

In [None]:
def bot_precausion(df_input,pred):
    words = df_input.questions[0].split()
    if len([w for w in words if w in vocab])==0 :
        pred = 1
    return pred

In [None]:
def get_response(df2,pred):
    upper_bound = df2.groupby('labels').get_group(pred).shape[0]
    r = np.random.randint(0,upper_bound)
    responses = list(df2.groupby('labels').get_group(pred).response)
    return responses[r]


In [None]:
def bot_response(response,):
    print(response)

In [None]:
df_input = get_text()

#load artifacts
tokenizer_t = joblib.load('tokenizer_t.pkl')
vocab = joblib.load('vocab.pkl')

df_input = remove_stop_words_for_input(tokenizer,df_input,'questions')
encoded_input = encode_input_text(tokenizer_t,df_input,'questions')

pred = get_pred(model,encoded_input)
pred = bot_precausion(df_input,pred)

response = get_response(df2,pred)
bot_response(response)

