### Import Libraries for NLP and TensorFlow

In [None]:
#important libraries
import numpy as np # provides fast mathematical function processing
import tensorflow as tf # machine learning framework
from tensorflow.keras.models import Sequential # for plain layers where each layer has exactly one input tensor and one output tensor
from tensorflow.keras.layers import Dense, Dropout # regular densely-connected neural network layer, applies dropout to the input
from tensorflow.keras.preprocessing.text import Tokenizer # vectorize text into integers
import random # generate random numbers

### Load the Data

In [None]:
# load chatbot intents (the parsed JSON document is kept in `data`)

import json
# JSON text is UTF-8 by specification; naming the encoding explicitly keeps the
# load independent of the platform's default locale encoding
with open('Chatbot_Intents.json', encoding='utf-8') as file:
  data=json.load(file)

### Text Pre-Processing with NLTK

In [None]:
# Create the Lancaster stemmer object; the later cells call stemmer.stem() on every lower-cased token
# NLP: stemming maps inflected forms to one stem, for example -- "roaster", "roasting", "roasts" ---> "roast" (illustrative; the exact stem depends on the Lancaster rules)

import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

In [None]:
# run for first time: fetch the tokenizer data that nltk.word_tokenize loads
nltk.download('punkt')
# recent NLTK releases look up the 'punkt_tab' resource instead of 'punkt';
# requesting both keeps this cell working across NLTK versions (on an older
# release the second call only reports that the package is unknown)
nltk.download('punkt_tab')

# will hold the unique stemmed words/tokens from all patterns (no duplicates)
words = []
# will hold the unique tag of every intent
labels = []
# one tokenized pattern (list of tokens) per entry
doc_x = []
# the tag of the intent each entry of doc_x came from (same order as doc_x)
doc_y = []

# loop through each intent in the data
for intent in data['intents']:
    # loop through each pattern sentence of the intent
    for pattern in intent['patterns']:
        # split the pattern into word tokens
        wrds = nltk.word_tokenize(pattern)
        # extend() adds every token individually to the running word list
        words.extend(wrds)
        # append() adds the whole token list as a single document
        doc_x.append(wrds)
        doc_y.append(intent['tag'])

    if intent['tag'] not in labels:
        labels.append(intent['tag'])

# lower-case and stem every token except the literal '?', drop duplicates with
# a set, then sort in ascending order so the vocabulary order is deterministic
words = sorted({stemmer.stem(w.lower()) for w in words if w != '?'})

labels = sorted(labels)

In [None]:
# number of tokenized patterns, followed by the first 20 of them
print(f"{len(doc_x)} documents x ---> {doc_x[:20]}")

In [None]:
# number of per-pattern tags, followed by the first 20 (same order as doc_x)
print(f"{len(doc_y)} documents y ---> {doc_y[:20]}")

In [None]:
# number of distinct intent tags, followed by the first 20 in sorted order
print(f"{len(labels)} labels ---> {labels[:20]}")

In [None]:
# vocabulary size, followed by the first 30 unique stemmed words
print(f"{len(words)} unique stemmed words {words[:30]}")

### Transformation of Text in the Corpus to Vector of Numbers as Input to ML Model

In [None]:
# Turn the corpus into numeric training data.
# Bag of words (BoW) extracts features from text: the vocabulary is every unique
# stemmed word collected from the patterns, and each sentence becomes a 0/1 vector
# marking which vocabulary words it contains. Word order is disregarded.

X_train = []
y_train = []

# template output row: one zero per label
out_empty = [0] * len(labels)

# encode every tokenized pattern together with the tag it belongs to
for tokens, tag in zip(doc_x, doc_y):
    # lower-case and stem the pattern's tokens, exactly as the vocabulary was built
    stemmed = [stemmer.stem(token.lower()) for token in tokens]
    # 1 where the vocabulary word occurs in this pattern, otherwise 0
    bag = [1 if vocab_word in stemmed else 0 for vocab_word in words]

    # one-hot row: copy the zero template and switch on the current tag's slot
    output_row = list(out_empty)
    output_row[labels.index(tag)] = 1

    X_train.append(bag)
    y_train.append(output_row)

# convert the nested lists to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)

In [None]:
# each row holds one 0/1 entry per vocabulary word, so the rows are expected to be sparse (lots of zero values)
print(X_train)

In [None]:
# one-hot label rows: a single 1 at the position of the pattern's tag in `labels`
print(y_train)

### Create Neural Network

In [None]:
# Model architecture
# - Dense(128) / Dense(64): fully connected layers; the number is the count of units (neurons)
# - Dropout(0.5): "turns off" neurons during training to reduce overfitting
# - the input vector length equals the vocabulary size (unique words, no duplicates)
# - the output layer has one softmax unit per label
# - categorical crossentropy is the loss used for multi-class classification

n_features = len(X_train[0])
n_classes = len(y_train[0])

model = Sequential([
    Dense(128, input_shape=(n_features,), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(n_classes, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# print a layer-by-layer summary of the model defined above
model.summary()

In [None]:
# train the model on the bag-of-words inputs and one-hot labels:
# 700 epochs over the training data, in mini-batches of 5 rows
model.fit(X_train, y_train, epochs=700, batch_size=5)

In [None]:
# NOTE: this evaluates on the same X_train / y_train that the model was fitted on,
# so test_loss / test_acc are training-set metrics, not results on held-out data
test_loss, test_acc = model.evaluate(X_train, y_train)

### Transformation of User Input Text to a Vector of Numbers

In [None]:
# function to change user input into array of numbers 

def bag_of_words(s, words):
    """Encode a sentence as a 0/1 bag-of-words vector over ``words``.

    The sentence is split with ``nltk.word_tokenize``; every token is
    lower-cased and stemmed with the notebook's ``stemmer`` before it is
    compared against the vocabulary.

    Args:
        s: the raw sentence text.
        words: the vocabulary; entry ``i`` of the result belongs to ``words[i]``.

    Returns:
        A numpy array with one entry per vocabulary word: 1 when that word
        equals one of the stemmed tokens of ``s``, otherwise 0.
    """
    # a set gives a direct membership test instead of scanning token by token
    stemmed_tokens = {stemmer.stem(token.lower()) for token in nltk.word_tokenize(s)}
    return np.array([1 if vocab_word in stemmed_tokens else 0 for vocab_word in words])

### ChatBot Response

In [None]:
# function to allow user input and chatbot response

def start_chat(confidence_threshold=0.9):
    """Run the interactive chat loop until the user types "bye".

    Each user sentence is encoded with ``bag_of_words`` over the notebook's
    ``words`` vocabulary and passed to ``model.predict``. The label with the
    highest score is looked up in ``labels``; when that score exceeds
    ``confidence_threshold`` a random response of the matching intent in
    ``data`` is printed, otherwise a fallback message is printed.

    Args:
        confidence_threshold: minimum top score (exclusive) required to
            answer with an intent response. Defaults to 0.9.
    """
    # first sentence to initiate communication between bot and user
    print('Hello, My name is Bela the Robot. I will answer your questions about Technical Analysis and Financial Data Science. Input a Financial related word and I will help you! If you want to exit, type Bye')

    while True:
        # prompt user to respond
        user_input = input('User: ')
        # exit word terminates the loop (case-insensitive, surrounding spaces ignored)
        if user_input.strip().lower() == "bye":
            break

        # reshape(1, -1) turns the vector into a single-row batch whose width is
        # inferred from the vocabulary, so nothing depends on a hard-coded size
        results = model.predict(bag_of_words(user_input, words).reshape(1, -1))
        # index of the highest score
        results_index = np.argmax(results)
        # the label (tag) that best matches the user input
        user_tag = labels[results_index]

        # start empty on every turn so a previous turn's responses are never reused
        responses = []
        # only a prediction scoring above the threshold is treated as a match
        if results.max() > confidence_threshold:
            for tag_selection in data['intents']:
                if tag_selection['tag'] == user_tag:
                    responses = tag_selection['responses']

        if responses:
            # answer with a randomly chosen response of the matching tag
            print(random.choice(responses))
        else:
            # low-confidence input, or no responses available for the tag
            print("Sorry I didn't get that. Please try again or go to https://worlddatascience.tech/datapedia for more assistance")
         

In [None]:
start_chat()

Hello, My name is Bela the Robot. I will answer your questions about Technical Analysis and Financial Data Science. Input a Financial related word and I will help you! If you want to exit, type Bye
