In [5]:
import json 
import numpy as np 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

## Data Preparation

In [13]:
# Read the intents file.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere,
# e.g. on Windows), where non-ASCII patterns could be mis-decoded or raise.
with open('data/intents.json', encoding='utf-8') as file:
    data = json.load(file)

training_sentences = []  # every example pattern, flattened across all intents
training_labels = []     # tag of the intent each pattern came from (parallel to training_sentences)
labels = []              # distinct tags, in first-seen order
responses = []           # the 'responses' value of each intent entry, in file order

for intent in data['intents']:
    tag = intent['tag']

    # one (sentence, label) pair per example pattern of this intent
    for pattern in intent['patterns']:
        training_sentences.append(pattern)
        training_labels.append(tag)
    responses.append(intent['responses'])

    # NOTE(review): `responses` grows once per intent entry while `labels` is
    # de-duplicated, so the two lists only line up index-for-index if every
    # tag appears once in the file -- confirm against the data.
    if tag not in labels:
        labels.append(tag)

# number of distinct intent tags
num_classes = len(labels)

In [14]:
# label encoder (read more about this)
# convert categorical text labels into a numerical format 

# map your words into numbers 
lbl_encoder = LabelEncoder() 

# assigns every unique category a unique integer 
lbl_encoder.fit(training_labels)

# swaps the the text labels  for the integers during the fit step
training_labels = lbl_encoder.transform(training_labels)


## Tokenization

In [15]:
# Vectorize the data using the Keras Tokenizer.

# Vocabulary cap applied when texts are converted to sequences: only the most
# frequent words keep their own index; rarer words are mapped to the OOV token.
vocab_size = 1000

# Each word will eventually be represented by a vector of 16 numbers.
# (The value was 1, contradicting this stated intent; a 1-dimensional
# embedding leaves almost no capacity to tell words apart.)
embedding_dim = 16

# Uniform input length: every sentence will be exactly 20 tokens long.
max_len = 20

# "Out of vocabulary": a word the tokenizer does not recognise is replaced by
# this tag instead of being skipped.
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)

# Scan the dataset and build a dictionary mapping every unique word to a unique integer.
tokenizer.fit_on_texts(training_sentences)

# Keep the dictionary. It holds every word seen during fitting, not only the
# top `vocab_size`; the cap is applied in texts_to_sequences.
word_index = tokenizer.word_index

# Convert the actual sentences into lists of integers.
sequences = tokenizer.texts_to_sequences(training_sentences)

# The neural network requires input of a fixed shape.
# - Too short: zeros are added at the FRONT of the sequence, because `padding`
#   is left at its default ('pre'). Pass padding='post' to pad at the end.
# - Too long: tokens are cut off from the END (truncating='post').
padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)

## Training the Neural Network