In [5]:
import json
import pickle

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

## Data Preparation

In [18]:
# Read the intents file. The encoding is passed explicitly because open()
# otherwise falls back to the platform's locale encoding, which can garble or
# reject non-ASCII characters in a UTF-8 JSON file.
with open('data/intents.json', encoding='utf-8') as file:
    data = json.load(file)

training_sentences = []  # every example pattern, in file order
training_labels = []     # tag of the intent each pattern came from (parallel to training_sentences)
labels = []              # unique tags, in first-seen order
responses = []           # the 'responses' entry of each intent, one item per intent

for intent in data['intents']:
    # one training example per pattern, labelled with the tag of its intent
    for pattern in intent['patterns']:
        training_sentences.append(pattern)
        training_labels.append(intent['tag'])
    responses.append(intent['responses'])

    # record each tag only once so len(labels) is the number of classes
    if intent['tag'] not in labels:
        labels.append(intent['tag'])

# number of distinct intent tags; used as the size of the model's output layer
num_classes = len(labels)

In [19]:
# label encoder (read more about this)
# convert categorical text labels into a numerical format 

# map your words into numbers 
lbl_encoder = LabelEncoder() 

# assigns every unique category a unique integer 
lbl_encoder.fit(training_labels)

# swaps the the text labels  for the integers during the fit step
training_labels = lbl_encoder.transform(training_labels)


## Tokenization

In [22]:
# Vectorize the sentences: map words to integer ids, then pad to a fixed length.

# Upper bound on the word ids the tokenizer emits: only the most frequent words
# (at most vocab_size - 1, per the Keras Tokenizer docs) keep their own id.
# Rarer words are not dropped; they are mapped to the OOV token below.
vocab_size = 1000

# each word will eventually be represented by a vector of 16 numbers
embedding_dim = 16

# uniform input length: every sentence becomes exactly 20 tokens long
max_len = 20

# "out of vocabulary": a word that is not in the kept vocabulary is replaced
# with this token instead of being skipped
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)

# scan the dataset and build a dictionary mapping every unique word to an integer
tokenizer.fit_on_texts(training_sentences)

# keep a reference to that dictionary
word_index = tokenizer.word_index

# convert each sentence into a list of integer word ids
sequences = tokenizer.texts_to_sequences(training_sentences)

# The neural network requires input of a fixed shape:
# - sentences shorter than max_len are padded with 0s at the FRONT
#   (padding='pre' is the Keras default; it is spelled out here on purpose)
# - sentences longer than max_len have their END cut off (truncating='post')
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='pre',
    truncating='post',
)

## Training the Neural Network

In [23]:
# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are repeatable from run to run (as far as the backend allows).
keras.utils.set_random_seed(42)

# data flows through the layers in order, top to bottom
model = Sequential()

# Declare the input shape explicitly: each sample is a sequence of max_len word ids.
# This replaces Embedding's `input_length` argument, which newer Keras versions
# deprecate, and keeps the model built before training starts.
model.add(keras.Input(shape=(max_len,)))

# turns each word (represented by an integer) into a dense vector of fixed
# size (embedding_dim); the vectors are learned during training
model.add(Embedding(vocab_size, embedding_dim))

# "flattens" the data: averages all the word vectors of a sentence into a
# single fixed-length vector that represents the entire sentence
model.add(GlobalAveragePooling1D())

# hidden layers: two standard fully connected layers with 16 neurons each,
# using relu activations
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))

# output layer: num_classes neurons (one per intent tag); softmax turns the
# outputs into probabilities that add up to 100%
model.add(Dense(num_classes, activation='softmax'))

# sparse_categorical_crossentropy: the loss that measures how wrong the model
#   is when the labels are integer class ids
# adam: the algorithm that updates the weights to reduce the loss
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

# one epoch is one full pass through the entire dataset, so 500 epochs means
# the model sees every training sentence 500 times
epochs = 500

# fit the "questions" (padded_sequences) to the "answers" (training_labels)
history = model.fit(padded_sequences, np.array(training_labels), epochs=epochs)

model.summary()

Epoch 1/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 188ms/step - accuracy: 0.0909 - loss: 2.0808
Epoch 2/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.0909 - loss: 2.0780
Epoch 3/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.1818 - loss: 2.0771
Epoch 4/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2121 - loss: 2.0764
Epoch 5/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.1515 - loss: 2.0759
Epoch 6/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.1515 - loss: 2.0755
Epoch 7/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.1515 - loss: 2.0749
Epoch 8/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.1515 - loss: 2.0746
Epoch 9/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

## Saving the neural network

In [25]:
# Persist the trained model plus the fitted tokenizer and label encoder to disk.
# (`pickle` is imported in the notebook's import cell.)

# the trained network, in the native Keras file format
model.save("chat_model.keras")

# the fitted tokenizer
with open("tokenizer.pickle", 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# the fitted label encoder
with open('label_encoder.pickle', 'wb') as enc_file:
    pickle.dump(lbl_encoder, enc_file, protocol=pickle.HIGHEST_PROTOCOL)

# NOTE: unpickling executes arbitrary code, so only load these .pickle files
# when they come from a trusted source.