In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from sklearn.preprocessing import OneHotEncoder

In [5]:
def load_dataset(filename):
    df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
    print(df.head())
    intent = df["Intent"]
    unique_intent = list(set(intent))
    sentences = list(df["Sentence"])
    return (intent, unique_intent, sentences)

In [6]:
intent, unique_intent, sentences = load_dataset("cb_dataset_cleaned.csv")


                         Sentence  Intent
0  Is there a bot chatting to me?  GQ.bot
1        Is it automated message?  GQ.bot
2             Computer based pely  GQ.bot
3                   Bot or human?  GQ.bot
4        Bot is chatting with me?  GQ.bot


In [44]:
print(sentences[100:105])
print(unique_intent)

["What's the club about?", 'What does IEEE-VIT does', 'What is IEEE-VIT?', 'What is ieee', 'What is IEEE vit ?']
['FAQ.why_reg', 'SQ.IEEE', 'SQ.reg_fee', 'GQ.name', 'FAQ.food', 'GQ.query', 'GQ.bot', 'SQ.event_details', 'GQ.gen', 'GQ.help', 'SQ.event_prize', 'FAQ.contact_info', 'JOIN.speaker', 'JOIN.sponsor', 'SQ.event_speakers', 'SQ.event_date', 'SQ.reg_lastdate', 'SQ.event_schedule', 'FAQ.accom']


In [8]:
stemmer = LancasterStemmer()  #using lancaster stemmer

In [9]:
def cleaning(sentences):
    words = []
    for s in sentences:
        clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
        w = word_tokenize(clean)
        #stemming
        words.append([i.lower() for i in w])
    
    return words  

In [10]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))

518


In [11]:
print(cleaned_words[115:118])

[['what', 'is', 'the', 'registration', 'fee', 'of', 'the', 'event'], ['what', 'is', 'the', 'fees', 'required', 'to', 'register', 'for', 'the', 'event'], ['what', 's', 'the', 'price', 'for', 'getting', 'a', 'registration', 'done', 'in', 'the', 'event']]


In [12]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    token = Tokenizer(filters = filters)
    token.fit_on_texts(words)
    return token

In [13]:
def max_length(words):
    return(len(max(words, key = len)))

In [14]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 418 and Maximum length = 16


In [15]:
def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

In [16]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [17]:
def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [18]:
padded_doc = padding_doc(encoded_doc, max_length)

In [19]:
padded_doc[115:118]

array([[  2,   3,   1,  29,  93,  23,   1,   5,   0,   0,   0,   0,   0,
          0,   0,   0],
       [  2,   3,   1,  78, 163,   7,  22,   6,   1,   5,   0,   0,   0,
          0,   0,   0],
       [  2,  34,   1, 164,   6, 165,  11,  29, 131,  30,   1,   5,   0,
          0,   0,   0]])

In [20]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (518, 16)


In [42]:
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

In [43]:
output_tokenizer.word_index

{'faq.why_reg': 1,
 'sq.ieee': 2,
 'sq.reg_fee': 3,
 'gq.name': 4,
 'faq.food': 5,
 'gq.query': 6,
 'gq.bot': 7,
 'sq.event_details': 8,
 'gq.gen': 9,
 'gq.help': 10,
 'sq.event_prize': 11,
 'faq.contact_info': 12,
 'join.speaker': 13,
 'join.sponsor': 14,
 'sq.event_speakers': 15,
 'sq.event_date': 16,
 'sq.reg_lastdate': 17,
 'sq.event_schedule': 18,
 'faq.accom': 19}

In [23]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [24]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [25]:
encoded_output.shape

(518, 1)

In [26]:
def one_hot(encode):
    o = OneHotEncoder(sparse = False)
    return(o.fit_transform(encode))

In [27]:
output_one_hot = one_hot(encoded_output)

In [28]:
output_one_hot.shape

(518, 19)

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
x_train, x_val, y_train, y_val = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)
#x_train, x_val, y_train, y_val = train_test_split(padded_doc, output_one_hot, test_size = 0.2, random_state=69)

In [81]:
print("Shape of train_X = %s and train_Y = %s" % (x_train.shape, y_train.shape))
print("Shape of val_X = %s and val_Y = %s" % (x_val.shape, y_val.shape))

Shape of train_X = (414, 16) and train_Y = (414, 19)
Shape of val_X = (104, 16) and val_Y = (104, 19)


In [91]:
model=Sequential()
model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(32, activation = "relu"))
model.add(Dropout(0.5))
model.add(Dense(19, activation = "softmax"))

In [92]:
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])


In [93]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
history = model.fit(x_train, y_train, epochs = 130, batch_size = 32, validation_split=0.25, verbose=1, callbacks=[checkpoint])
#history = model.fit(x_train, y_train, epochs = 100, batch_size = 32, validation_data= (x_val, y_val), verbose=1)

Train on 310 samples, validate on 104 samples
Epoch 1/130
Epoch 00001: val_loss improved from inf to 2.95640, saving model to model.h5
Epoch 2/130
Epoch 00002: val_loss improved from 2.95640 to 2.94393, saving model to model.h5
Epoch 3/130
Epoch 00003: val_loss improved from 2.94393 to 2.93746, saving model to model.h5
Epoch 4/130
Epoch 00004: val_loss improved from 2.93746 to 2.92004, saving model to model.h5
Epoch 5/130
Epoch 00005: val_loss improved from 2.92004 to 2.89271, saving model to model.h5
Epoch 6/130
Epoch 00006: val_loss improved from 2.89271 to 2.84921, saving model to model.h5
Epoch 7/130
Epoch 00007: val_loss improved from 2.84921 to 2.70517, saving model to model.h5
Epoch 8/130
Epoch 00008: val_loss improved from 2.70517 to 2.45589, saving model to model.h5
Epoch 9/130
Epoch 00009: val_loss improved from 2.45589 to 2.32765, saving model to model.h5
Epoch 10/130
Epoch 00010: val_loss improved from 2.32765 to 2.06433, saving model to model.h5
Epoch 11/130
Epoch 00011: v

In [94]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 16, 128)           53504     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_2 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 19)                627       
Total params: 325,523
Trainable params: 325,523
Non-trainable params: 0
_________________________________________________________________


In [95]:
_,acc=model.evaluate(x_val,y_val)
print(acc*100)

76.92307829856873


In [96]:
model = load_model("model.h5")

In [106]:
def predictions(text):
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
    test_word = word_tokenize(clean)
    test_word = [w.lower() for w in test_word]
    test_ls = word_tokenizer.texts_to_sequences(test_word)
    #print(test_word)            ##
    #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))
    
    test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
    x = padding_doc(test_ls, max_length)
  
    pred = model.predict_proba(x)
  
    return pred 

In [86]:
'''''def predictions1(text):
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
    test_word = word_tokenize(clean)
    test_word = [w.lower() for w in test_word]
    test_ls = word_tokenizer.texts_to_sequences(test_word)
  
    #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))
    
    test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
    x = padding_doc(test_ls, max_length)
  
    pred = model.predict_classes(x)
    return pred

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-86-ec8984824710>, line 16)

In [98]:
''''def get_final_output(pred, classes):
    predictions = pred[0]
 
    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
    predictions = -np.sort(-predictions)
    pred_intent=classes[0]
 
    for i in range(pred.shape[1]):
        print("%s has confidence = %s" % (classes[i], (predictions[i])))
    return pred_intent

In [107]:
def get_final_output(pred, classes):
    predictions = pred[0]
 
    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
    predictions = -np.sort(-predictions)
    pred_intent=classes[0]
 
    #for i in range(pred.shape[1]):
        #print("%s has confidence = %s" % (classes[i], (predictions[i])))
    return pred_intent


In [37]:
'''def word_for_id(integer, tokenizer):
    for word, index in Tokenizer.word_index.items():
        if index >=0:
            return word
    return word

In [110]:
'''
text="need some help?"
pred = predictions(text)

#predictions(text)
get_final_output(pred, unique_intent)
#word = word_for_id(pred, output_tokenizer)

'''

'\ntext="need some help?"\npred = predictions(text)\n\n#predictions(text)\nget_final_output(pred, unique_intent)\n#word = word_for_id(pred, output_tokenizer)\n\n'

In [200]:
A_GQ_help = ["Yes, I can help you with anything.", "What do you need help with?", "Sure. What can I help you with?"]
A_FAQ_why_reg = ["Because there's no other event like this", "you will learn many new things", 
             " No other event which has both conference and hackathon"]
A_SQ_IEEE = ["IEEE-VIT is one of the most active and prestigious chapters of VIT", "IEEE-VIT is a student technical chapter"
             "IEEE-VIT is a student based chapter which falls under region 10"]
A_SQ_reg_fee = ["No, the event is free to attend", "There is no registration fee for the event", "No, the registrations are free"]
A_GQ_name = ["My name is IEEE bot", " I am IEEE bot", "You are talking to IEEE bot"]
A_FAQ_food = ["Yes, food will be provided","Yes, refreshments will be provided","Sure, everyone needs food"]
A_GQ_query =[ "Yes, go ahead","Ask me any queries you have","go ahead, ask away"]
A_GQ_bot = ["That's right, I am a chatbot", "Yes, I am a chatbot", "I am a bot. Chatbot"]
A_SQ_event_details = ["This event is all about learning", "It's a hackathon","It's a fun event for sure"]
A_GQ_gen = ["I am good","I am doing great","Never better"]
A_SQ_event_prize = ["Yes, there will be prizes","Definitely","Yes there will be, along with goodies"]
A_FAQ_contact_info = ["Please contact us via insta","We are reachable from our insta handle","You can contact us anytime via insta"]
A_JOIN = ["Please contact us via collaborations.ieeevit@gmail.com, thank you."]
A_SQ_event_speakers=["speaker 1 and 2 will join us, stay tuned for more"," We have confirmed speaker 1 and 2"]
A_SQ_event_date=["From 10th to 12th","It will be for 2 days starting from 10th","10th-12th"]
A_SQ_reg_lastdate=["The last day to register is 9th","You can register by 9th","9th is the last day to register"]
A_SQ_event_schedule=["There will be talks followed by a hackthon","First talks then hack","Hack after speech"]
A_FAQ_accom=["No, accomodation can't be provided","Sorry, we dont provide accomodation","There is no accomodation facility from our side"]



In [201]:
import random
def F1():
    print('bot: ',random.choice(A_GQ_help))
def F2():
    print('bot: ',random.choice(A_FAQ_why_reg))
def F3():
    print('bot: ',random.choice(A_SQ_IEEE))
def F4():
    print('bot: ',random.choice(A_SQ_reg_fee))
def F5():
    print('bot: ',random.choice(A_GQ_name))
def F6():
    print('bot: ',random.choice(A_FAQ_food))
def F7():
    print('bot: ',random.choice(A_GQ_query))
def F8():
    print('bot: ',random.choice(A_GQ_bot))
def F9():
    print('bot: ',random.choice(A_SQ_event_details))
def F10():
    print('bot: ',random.choice(A_GQ_gen))
def F11():
    print('bot: ',random.choice(A_SQ_event_prize))
def F12():
    print('bot: ',random.choice(A_FAQ_contact_info))
def F13():
    print('bot: ',random.choice(A_JOIN))
def F14():
    print('bot: ',random.choice(A_SQ_event_speakers))
def F15():
    print('bot: ',random.choice(A_SQ_event_date))
def F16():
    print('bot: ',random.choice(A_SQ_reg_lastdate))
def F17():
    print('bot: ',random.choice(A_SQ_event_schedule))
def F18():
    print('bot: ',random.choice(A_FAQ_accom))

In [202]:
def user_output():
    for i in range(100):
        text=input('\nYou: ')
        if text == 'quit':
            print('bye')
            break
        else : 
            pred = predictions(text)
            if get_final_output(pred, unique_intent) == 'GQ.help':
                F1()
            elif get_final_output(pred, unique_intent) == 'FAQ.why_reg':
                F2()
            elif get_final_output(pred, unique_intent) == 'SQ.IEEE':
                F3()
            elif get_final_output(pred, unique_intent) == 'SQ.reg_fee':
                F4()
            elif get_final_output(pred, unique_intent) == 'GQ.name':
                F5()
            elif get_final_output(pred, unique_intent) == 'FAQ.food':
                F6()
            elif get_final_output(pred, unique_intent) == 'GQ.query':
                F7()
            elif get_final_output(pred, unique_intent) == 'GQ.bot':
                F8()
            elif get_final_output(pred, unique_intent) == 'SQ.event_details':
                F9()
            elif get_final_output(pred, unique_intent) == 'GQ.gen':
                F10()
            elif get_final_output(pred, unique_intent) == 'SQ.event_prize':
                F11()
            elif get_final_output(pred, unique_intent) == 'FAQ.contact_info':
                F12()
            elif get_final_output(pred, unique_intent) == 'JOIN.speaker':
                F13()
            elif get_final_output(pred, unique_intent) == 'JOIN.sponsor':
                F13()
            elif get_final_output(pred, unique_intent) == 'SQ.event_speakers':
                F14()
            elif get_final_output(pred, unique_intent) == 'SQ.event_date':
                F15()
            elif get_final_output(pred, unique_intent) == 'SQ.reg_lastdate':
                F16()
            elif get_final_output(pred, unique_intent) == 'SQ.event_schedule':
                F17()
            elif get_final_output(pred, unique_intent) == 'FAQ.accom':
                F18()
            else:
                print("Please enter a valid response")



In [204]:
user_output()


You: what is your name?
bot:  My name is IEEE bot

You: nice to meet you
bot:  Never better

You: can you help me?
bot:  Yes, I can help you with anything.

You: will we get food?
bot:  Sure, everyone needs food

You: I want to be a sponsor
bot:  Please contact us via collaborations.ieeevit@gmail.com, thank you.

You: quit
bye
