# Text classification for chat bot API.

Create a chatbot API for a leave enquiry system using a Decision Tree classifier and a Naive Bayes classifier.

## Import useful libraries

In [1]:
import nltk
import re
import os
import csv
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
## Get multiple outputs in the same cell
# Setting ast_node_interactivity to "all" makes the notebook display the value
# of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Ignore all warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can hide real problems (e.g. deprecations in nltk/numpy) - confirm it is wanted.
import warnings
warnings.filterwarnings('ignore')
# NOTE: the blanket 'ignore' filter above already covers DeprecationWarning,
# so this second filter is redundant.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [3]:
## Display all rows and columns of a dataframe instead of a truncated version
# NOTE(review): `display` is imported here but is not called in the cells shown.
from IPython.display import display
# A value of None removes the limit, so dataframes are printed in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Preprocess

#### Preprocess the sentence to convert it into lower case, tokenize and remove stop words

In [4]:
def preprocess(sentence):
    """Lowercase a sentence, split it into word tokens and drop stop words.

    Args:
        sentence: Raw input text.

    Returns:
        A list of lowercase tokens (runs of word characters), in their
        original order, excluding NLTK's English stop words.
    """
    sentence = sentence.lower()  # Convert the sentence into lowercase
    tokenizer = RegexpTokenizer(r'\w+')  # Tokenize on word characters
    tokens = tokenizer.tokenize(sentence)
    # Build the stop-word collection once per call, as a set. Evaluating
    # stopwords.words('english') inside the filter re-created the list and
    # scanned it linearly for every single token.
    stop_words = set(stopwords.words('english'))
    return [w for w in tokens if w not in stop_words]  # remove stopwords

In [5]:
sentence = "The Big brown fox jumped over a lazy dog."
sentence2 = "This is particularly important in today's world where we are swamped with unstructured natural language data on the variety of social media platforms people engage in now-a-days (note -  now-a-days in the decade of 2010-2020)"

In [6]:
preprocessed_sentence = preprocess(sentence)
print(preprocessed_sentence)

['big', 'brown', 'fox', 'jumped', 'lazy', 'dog']


In [7]:
preprocess(sentence2)

['particularly',
 'important',
 'today',
 'world',
 'swamped',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade',
 '2010',
 '2020']

## Tagging

#### Assign part of speech(pos) to words.

In [8]:
tags = nltk.pos_tag(preprocessed_sentence)
print(tags)

[('big', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('lazy', 'JJ'), ('dog', 'NN')]


In [9]:
tags2 = nltk.pos_tag(preprocess(sentence2))
print(tags2)

[('particularly', 'RB'), ('important', 'JJ'), ('today', 'NN'), ('world', 'NN'), ('swamped', 'VBD'), ('unstructured', 'JJ'), ('natural', 'JJ'), ('language', 'NN'), ('data', 'NNS'), ('variety', 'NN'), ('social', 'JJ'), ('media', 'NNS'), ('platforms', 'NNS'), ('people', 'NNS'), ('engage', 'VBP'), ('days', 'NNS'), ('note', 'VBP'), ('days', 'NNS'), ('decade', 'NN'), ('2010', 'CD'), ('2020', 'CD')]


## Extract only selected parts of speech (nouns, verbs, adjectives, adverbs and pronouns)

In [11]:
def extract_tagged(sentences):
    """Return the words whose part-of-speech tag is in a fixed whitelist.

    Args:
        sentences: Iterable of ``(word, tag)`` pairs, e.g. the output of
            ``nltk.pos_tag``.

    Returns:
        A list of the words, in input order, whose tag is one of
        NN, NNS, VBN, VBP, VBZ, VBG, RB, PRP or JJ.
    """
    wanted_tags = ('NN', 'VBN', 'NNS', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ')
    return [token for token, pos in sentences if pos in wanted_tags]

In [12]:
extract_tagged(tags2)

['particularly',
 'important',
 'today',
 'world',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade']

## Stem and lemmatize the words in a sentence

In [13]:
stemmer = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()
def extract_feature(text):
    """Turn raw text into a list of normalised feature words.

    The text is preprocessed, POS-tagged, filtered by tag, stemmed with the
    Snowball stemmer and finally passed through the WordNet lemmatizer.
    Each intermediate result is printed for inspection.

    Args:
        text: Raw input sentence.

    Returns:
        A list of stemmed-then-lemmatized words.
    """
    # NOTE(review): the lemmatizer runs on already-stemmed words; confirm
    # that this order is intended.
    tokens = preprocess(text)
    print('words: ',tokens)
    tagged = nltk.pos_tag(tokens)
    print('tags: ',tagged)
    kept = extract_tagged(tagged)
    print('Extracted features: ',kept)
    stems = list(map(stemmer.stem, kept))
    print(stems)
    return [lmtzr.lemmatize(stem) for stem in stems]

In [14]:
sentence

'The Big brown fox jumped over a lazy dog.'

In [15]:
words = extract_feature(sentence)
print(words)

words:  ['big', 'brown', 'fox', 'jumped', 'lazy', 'dog']
tags:  [('big', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('lazy', 'JJ'), ('dog', 'NN')]
Extracted features:  ['big', 'brown', 'fox', 'lazy', 'dog']
['big', 'brown', 'fox', 'lazi', 'dog']
['big', 'brown', 'fox', 'lazi', 'dog']


In [16]:
words = extract_feature(sentence2)
print(words)

words:  ['particularly', 'important', 'today', 'world', 'swamped', 'unstructured', 'natural', 'language', 'data', 'variety', 'social', 'media', 'platforms', 'people', 'engage', 'days', 'note', 'days', 'decade', '2010', '2020']
tags:  [('particularly', 'RB'), ('important', 'JJ'), ('today', 'NN'), ('world', 'NN'), ('swamped', 'VBD'), ('unstructured', 'JJ'), ('natural', 'JJ'), ('language', 'NN'), ('data', 'NNS'), ('variety', 'NN'), ('social', 'JJ'), ('media', 'NNS'), ('platforms', 'NNS'), ('people', 'NNS'), ('engage', 'VBP'), ('days', 'NNS'), ('note', 'VBP'), ('days', 'NNS'), ('decade', 'NN'), ('2010', 'CD'), ('2020', 'CD')]
Extracted features:  ['particularly', 'important', 'today', 'world', 'unstructured', 'natural', 'language', 'data', 'variety', 'social', 'media', 'platforms', 'people', 'engage', 'days', 'note', 'days', 'decade']
['particular', 'import', 'today', 'world', 'unstructur', 'natur', 'languag', 'data', 'varieti', 'social', 'media', 'platform', 'peopl', 'engag', 'day', 'note

In [17]:
extract_feature("He hurt his right foot while he was wearing white shoes on his feet")

words:  ['hurt', 'right', 'foot', 'wearing', 'white', 'shoes', 'feet']
tags:  [('hurt', 'NN'), ('right', 'JJ'), ('foot', 'NN'), ('wearing', 'VBG'), ('white', 'JJ'), ('shoes', 'NNS'), ('feet', 'NNS')]
Extracted features:  ['hurt', 'right', 'foot', 'wearing', 'white', 'shoes', 'feet']
['hurt', 'right', 'foot', 'wear', 'white', 'shoe', 'feet']


['hurt', 'right', 'foot', 'wear', 'white', 'shoe', 'foot']

## Implementing bag of words

In simple terms, it's a collection of words used to represent a sentence, disregarding the order in which they appear.

In [18]:
def word_feats(words):
    """Build a bag-of-words feature dict that maps every word to ``True``.

    Args:
        words: Iterable of words; duplicates collapse into a single key.

    Returns:
        A dict ``{word: True}`` with keys in first-seen order.
    """
    return {token: True for token in words}

In [19]:
word_feats(words)

{'particular': True,
 'import': True,
 'today': True,
 'world': True,
 'unstructur': True,
 'natur': True,
 'languag': True,
 'data': True,
 'varieti': True,
 'social': True,
 'medium': True,
 'platform': True,
 'peopl': True,
 'engag': True,
 'day': True,
 'note': True,
 'decad': True}

## Parsing the whole document

In [20]:
def extract_feature_from_doc(data):
    """Extract classifier features, a flat corpus and answers from a document.

    Args:
        data: Iterable of rows, each holding exactly three items:
            ``(text, category, answer)``.

    Returns:
        A tuple ``(result, corpus, answers)`` where

        * ``result`` is a list of ``(feature_dict, category)`` pairs,
        * ``corpus`` is a flat list of every feature word, in row order,
        * ``answers`` maps each category to its answer; when a category
          occurs more than once the last row wins.

    Raises:
        ValueError: If a row does not contain exactly three items.
    """
    result = []
    corpus = []
    # The responses of the chat bot
    answers = {}
    for row in data:
        try:
            text, category, answer = row
        except ValueError as err:
            # Show the offending row: a bare unpack error gives no hint that
            # the input file was probably read with the wrong delimiter.
            raise ValueError(
                "each row must contain exactly 3 fields "
                "(text, category, answer); got {!r}".format(row)
            ) from err

        features = extract_feature(text)

        # Extend the flat corpus directly; sum(list_of_lists, []) copies the
        # accumulated list on every step and is quadratic in corpus size.
        corpus.extend(features)
        result.append((word_feats(features), category))
        answers[category] = answer

    return (result, corpus, answers)

In [21]:
extract_feature_from_doc([['this is the input text from the user','category','answer to give']])

words:  ['input', 'text', 'user']
tags:  [('input', 'NN'), ('text', 'IN'), ('user', 'NN')]
Extracted features:  ['input', 'user']
['input', 'user']


([({'input': True, 'user': True}, 'category')],
 ['input', 'user'],
 {'category': 'answer to give'})

In [41]:
def get_content(filename, delimiter='|', encoding=None):
    """Read a delimited text file and return all of its rows.

    Args:
        filename: Path of the file to read (``str`` or ``pathlib.Path``).
        delimiter: Single character separating the fields of a row.
            Defaults to ``'|'``; pass ``'\\t'`` for tab-separated files.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default.

    Returns:
        A list of rows, each row being a list of field strings.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(filename, 'r', newline='', encoding=encoding) as content_file:
        return list(csv.reader(content_file, delimiter=delimiter))

In [37]:
# NOTE(review): machine-specific absolute path; the notebook only runs where
# this exact file exists. The rows displayed after loading are tab-separated
# with four columns, while get_content splits on '|' - confirm that this is
# the intended data file.
p = Path(r'E:/MSA/Fall 2023/Big Data/GP/Health-Care-Chat-Bot-using-Decision-Tree-algorithm-main/data.txt')

In [42]:
filename = p
data = get_content(filename)

In [43]:
data

[['symptomA\tsymptomB\tsymptomC\tproblem'],
 ['High engine temperature\tAfter a long drive\tThe air conditioner is on most of the time\tcheck the ventilation unit'],
 ['High engine temperature\tAfter a long drive\tAir conditioner is off most of the time\tThe alternator belt may be slipping or the fan clutch may be damaged'],
 ['Car exhaust smoke\tblack smoke\tThe smoke comes out of the exhaust while driving\tCheck if the air filter is clean. If so then check the fuel injection sensor'],
 ['Car exhaust smoke\tblack smoke\tThe smoke comes out of the exhaust when starting and then stops\tThe engine sensor may be faulty'],
 ['Car exhaust smoke\twhite smoke\tThe smoke comes out of the exhaust while driving even after the engine has warmed up\tA new head gasket may be needed'],
 ['Car exhaust smoke\twhite smoke\tThe smoke comes out of the exhaust while driving and stops after the engine has warmed up\tThis is a normal situation'],
 ['Car exhaust smoke\twhite smoke\tcold weather\tThis is a no

In [44]:
features_data, corpus, answers = extract_feature_from_doc(data)

ValueError: not enough values to unpack (expected 3, got 1)

In [30]:
print(features_data[100])

({'annual': True, 'leav': True, 'balanc': True}, 'Balance-Annual-Leaves')


In [31]:
corpus

['hello',
 'hi',
 'hello',
 'hi',
 'hi',
 'hi',
 'hey',
 'hello',
 'hi',
 'hey',
 'hey',
 'hi',
 'hey',
 'hello',
 'good',
 'morn',
 'good',
 'afternoon',
 'good',
 'even',
 'good',
 'night',
 'today',
 'want',
 'help',
 'need',
 'help',
 'help',
 'want',
 'help',
 'want',
 'assist',
 'help',
 'great',
 'talk',
 'great',
 'thank',
 'help',
 'thank',
 'thank',
 'much',
 'thank',
 'thank',
 'much',
 'mani',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'mani',
 'leav',
 'taken',
 'mani',
 'leav',
 'alreadi',
 'taken',
 'mani',
 'annual',
 'leav',
 'mani',
 'annual',
 'leav',
 'taken',
 'mani',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'annual',
 'leav',
 'count',
 'taken',
 'mani',
 'annual',
 'leav',
 'taken',
 'number',
 'annual',
 'leav',
 'taken',
 'annual',
 'leav',
 'taken',
 'number',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'annual',
 'leav',
 'taken',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'number',
 'annual',
 'leav',
 'taken',
 'numbe

In [32]:
answers

{'Greetings': 'Hello. I am Dexter. I will serve your leave enquiries.',
 'Morning': 'Good Morning. I am Dexter. I will serve your leave enquiries.',
 'Afternoon': 'Good afternoon. I am Dexter. I will serve your leave enquiries.',
 'Evening': 'Good evening. I am Dexter. I will serve your leave enquiries.',
 'Goodbye': 'Good night. Take care.',
 'Opening': "I'm fine! Thank you. How can I help you?",
 'Help': 'How can I help you?',
 'No-Help': 'Ok sir/madam. No problem. Have a nice day.',
 'Closing': "It's glad to know that I have been helpful. Have a good day!",
 'Leaves-Type': 'Currently I know about two: annual and optional leaves.',
 'Default-Utilized-Annual-Leaves': 'You have used 12 annual leaves.',
 'Utilized-Annual-Leaves': 'You have taken 12 annual leaves.',
 'Utilized-Optional-Leaves': 'You have taken 1 optional leaves.',
 'Default-Balance-Annual-Leaves': 'You have 25 annual leaves left.',
 'Balance-Annual-Leaves': 'You have 25 annual leaves remaining.',
 'Balance-Optional-Leave

# Train a model using these features

In [33]:
## split data into train and test sets
split_ratio = 0.8

In [76]:
def split_dataset(data, split_ratio):
    """Randomly split a dataset into a training part and a test part.

    Args:
        data: Sequence of samples. It is left unmodified.
        split_ratio: Fraction of the samples (between 0 and 1) that goes
            into the training part.

    Returns:
        A tuple ``(train, test)`` of lists. ``train`` holds the first
        ``int(len(data) * split_ratio)`` samples of a shuffled copy and
        ``test`` holds the remainder.
    """
    # Shuffle a copy: shuffling `data` itself would silently reorder the
    # caller's list as a side effect.
    shuffled = list(data)
    random.shuffle(shuffled)
    train_split = int(len(shuffled) * split_ratio)
    return shuffled[:train_split], shuffled[train_split:]

In [77]:
training_data, test_data = split_dataset(features_data, split_ratio)

In [78]:
training_data

[({'mani': True, 'option': True, 'leav': True}, 'Utilized-Optional-Leaves'),
 ({'remain': True, 'annual': True, 'leav': True}, 'Balance-Annual-Leaves'),
 ({'mani': True, 'type': True, 'leav': True}, 'Leaves-Type'),
 ({'number': True, 'option': True, 'leav': True, 'remain': True},
  'Balance-Optional-Leaves'),
 ({'want': True, 'assist': True}, 'No-Help'),
 ({'option': True, 'leav': True, 'alreadi': True, 'taken': True},
  'Utilized-Optional-Leaves'),
 ({'mani': True, 'option': True, 'leav': True}, 'Utilized-Optional-Leaves'),
 ({'good': True, 'even': True}, 'Evening'),
 ({'taken': True, 'annual': True, 'leav': True}, 'Utilized-Annual-Leaves'),
 ({'mani': True, 'annual': True, 'leav': True}, 'Utilized-Annual-Leaves'),
 ({'mani': True, 'annual': True, 'leav': True}, 'Balance-Annual-Leaves'),
 ({'number': True, 'annual': True, 'leav': True, 'remain': True},
  'Balance-Annual-Leaves'),
 ({'option': True, 'leav': True, 'balanc': True}, 'Balance-Optional-Leaves'),
 ({'option': True, 'leav': T

In [79]:
# save the data
# Persist both splits to the current working directory; they are read back
# later as 'training_data.npy' and 'test_data.npy'.
# NOTE(review): the samples are (feature dict, label) pairs, so they are
# presumably stored as pickled Python objects - confirm.
np.save('training_data', training_data)
np.save('test_data', test_data)

## Classification using Decision tree

In [100]:
# The saved arrays hold Python objects (feature dicts and labels), so loading
# them needs allow_pickle=True. Pass it per call instead of replacing np.load
# globally: the replacement dropped keyword arguments and wrapped itself again
# each time the cell was re-run.
# NOTE: only load .npy files you created yourself - unpickling untrusted data
# can execute arbitrary code.
training_data = np.load('training_data.npy', allow_pickle=True)
test_data = np.load('test_data.npy', allow_pickle=True)

In [102]:
def train_using_decision_tree(training_data, test_data):
    """Train an NLTK decision tree classifier and report its accuracy.

    Args:
        training_data: Labeled feature sets used to fit the classifier and
            to measure training accuracy.
        test_data: Labeled feature sets used to measure test accuracy.

    Returns:
        A tuple ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)``. Both accuracies are also printed.
    """
    tree = nltk.classify.DecisionTreeClassifier.train(
        training_data,
        entropy_cutoff=0.5,
        support_cutoff=6,
    )
    tree_name = type(tree).__name__

    train_acc = nltk.classify.accuracy(tree, training_data)
    print('training set accuracy: ', train_acc)

    test_acc = nltk.classify.accuracy(tree, test_data)
    print('test set accuracy: ', test_acc)

    return tree, tree_name, test_acc, train_acc

In [103]:
dtclassifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)

training set accuracy:  0.9210526315789473
test set accuracy:  0.7931034482758621


## Classification using Naive Bayes

In [88]:
def train_using_naive_bayes(training_data, test_data):
    """Train an NLTK Naive Bayes classifier and measure its accuracy.

    Args:
        training_data: Labeled feature sets used to fit the classifier and
            to measure training accuracy.
        test_data: Labeled feature sets used to measure test accuracy.

    Returns:
        A tuple ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)``.
    """
    model = nltk.NaiveBayesClassifier.train(training_data)
    model_name = type(model).__name__
    train_acc = nltk.classify.accuracy(model, training_data)
    test_acc = nltk.classify.accuracy(model, test_data)
    return model, model_name, test_acc, train_acc

In [89]:
classifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_naive_bayes(training_data, test_data)
print(training_set_accuracy)
print(test_set_accuracy)
print(len(classifier.most_informative_features()))
classifier.show_most_informative_features()

0.8596491228070176
0.6551724137931034
66
Most Informative Features
                    leav = None           Greeti : Balanc =     12.0 : 1.0
                    mani = True           Defaul : Balanc =      6.7 : 1.0
                   taken = None           Balanc : Utiliz =      3.9 : 1.0
                 alreadi = True           Defaul : Utiliz =      3.4 : 1.0
                   count = True           Utiliz : Utiliz =      3.0 : 1.0
                   thank = None           Balanc : Closin =      2.7 : 1.0
                   carri = None           Balanc : CF     =      2.6 : 1.0
                  remain = None           Utiliz : Balanc =      2.3 : 1.0
                      hi = None           Balanc : Greeti =      2.2 : 1.0
                  number = True               CF : Balanc =      2.1 : 1.0


## Test on Naive Bayes Classifier

In [90]:
classifier.classify(({'mani': True, 'option': True, 'leav': True}))

'Utilized-Optional-Leaves'

In [91]:
extract_feature("hello")

['hello']

In [92]:
word_feats(extract_feature("hello"))

{'hello': True}

In [93]:
input_sentence = "how many balanced leaves do I have?"
classifier.classify(word_feats(extract_feature(input_sentence)))

'Utilized-Optional-Leaves'

## Test on Decision Tree Classifier

In [94]:
def reply(input_sentence):
    """Return the chatbot's answer for a user sentence.

    The sentence is converted to bag-of-words features, classified with the
    module-level decision tree classifier ``dtclassifier``, and the predicted
    category is looked up in the module-level ``answers`` dict.

    Args:
        input_sentence: Raw text typed by the user.

    Returns:
        The answer stored for the predicted category.

    Raises:
        KeyError: If the predicted category has no entry in ``answers``.
    """
    predict = dtclassifier.classify
    bag = word_feats(extract_feature(input_sentence))
    label = predict(bag)
    return answers[label]

In [95]:
reply('Hi')

'Hello. I am Dexter. I will serve your leave enquiries.'

In [96]:
reply('How many annual leaves do I have left?')

'You have 25 annual leaves remaining.'

In [97]:
reply('How optional leave left?')

'You have 2 optional leaves remaining.'

In [98]:
reply('How many leaves I have taken?')

'You have used 12 annual leaves.'

In [99]:
reply('Thanks!')

"It's glad to know that I have been helpful. Have a good day!"