In [37]:
# things we need for NLP
# NOTE(review): joblib is not referenced in the visible cells — confirm it is
# still needed before keeping it as a dependency.
import joblib
import nltk
from nltk.stem.lancaster import LancasterStemmer
# Single stemmer instance reused by the later cells that build the vocabulary
# and the bag-of-words vectors, so both stem tokens the same way.
stemmer = LancasterStemmer()

In [38]:
# things we need for Tensorflow
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import tflearn
import tensorflow as tf
import random
import pandas as pd
import re
import string
from nltk.stem import WordNetLemmatizer
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [39]:
# import our chat-bot intents file
import json

# JSON text is UTF-8 by specification; being explicit keeps loading
# independent of the platform's default encoding.
with open('data/intents.json', encoding='utf-8') as json_data:
    intents = json.load(json_data)

# Later cells write the trained model and the pickled training data into
# models/, so make sure the directory exists before anything is saved.
os.makedirs('models', exist_ok=True)

# Clear artifacts left over from a previous run.
files = glob.glob('models/*')
for f in files:
    # os.remove() raises on directories, so only delete regular files
    if os.path.isfile(f):
        os.remove(f)

In [40]:
# tokens that must never become part of the vocabulary
ignore_words = ['?']

# Corpus: one (tokens, tag) pair for every pattern listed in the intents file
documents = [
    (nltk.word_tokenize(pattern), intent['tag'])
    for intent in intents['intents']
    for pattern in intent['patterns']
]

# Vocabulary: every non-ignored token, lower-cased and stemmed,
# de-duplicated and sorted
words = sorted({
    stemmer.stem(token.lower())
    for tokens, _tag in documents
    for token in tokens
    if token not in ignore_words
})

# Classes: the distinct tags that own at least one pattern, sorted
classes = sorted({tag for _tokens, tag in documents})

print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique stemmed words", words)

95 documents
16 classes ['genres', 'goodbye', 'highest grossing', 'languages_ar', 'languages_cn', 'languages_de', 'languages_en', 'languages_es', 'languages_fr', 'languages_hi', 'languages_it', 'languages_ja', 'languages_kr', 'languages_pt', 'languages_ru', 'most popular']
63 unique stemmed words ['admir', 'ar', 'arab', 'as', 'best', 'bye', 'categ', 'chin', 'chines', 'demand', 'desir', 'earn', 'engl', 'favo', 'favourit', 'film', 'frant', 'french', 'genr', 'germ', 'germany', 'goodby', 'gross', 'had', 'highest', 'hind', 'in', 'ind', 'is', 'it', 'ita', 'jap', 'japanes', 'known', 'kor', 'lang', 'langu', 'lat', 'latin', 'latino', 'lik', 'list', 'mad', 'money', 'most', 'movy', 'of', 'pop', 'popul', 'portugues', 'russ', 'see', 'slav', 'sort', 'sought-after', 'span', 'the', 'ther', 'typ', 'want', 'what', 'which', 'you']


In [41]:
# Build the training set: one bag-of-words vector and one one-hot label row
# per document.
training = []
output = []  # not used in this cell; kept in case another cell refers to it
# template label row: all zeros, one slot per class
output_empty = [0] * len(classes)

for doc in documents:
    # doc is (tokens, tag); stem the tokens the same way the vocabulary was
    # built. A set gives constant-time membership checks below.
    pattern_words = {stemmer.stem(word.lower()) for word in doc[0]}
    # bag of words: 1 where the vocabulary word occurs in the pattern, else 0
    bag = [1 if w in pattern_words else 0 for w in words]

    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

# shuffle features and labels together so each pair stays aligned
random.shuffle(training)

# Split into features (train_x) and labels (train_y) directly from the
# shuffled pairs instead of slicing a NumPy array: each pair is ragged
# (len(words) features vs. len(classes) labels), and building an array from
# ragged sequences without an explicit dtype is deprecated and raises
# ValueError on recent NumPy releases.
train_x = [pair[0] for pair in training]
train_y = [pair[1] for pair in training]

# Keep `training` available as an array as before; dtype=object makes the
# ragged layout explicit.
training = np.array(training, dtype=object)

  training = np.array(training)


In [42]:
# clear the default TensorFlow graph before building the network
tf.compat.v1.reset_default_graph()

# layer widths are taken from the training data itself
n_inputs = len(train_x[0])    # length of one feature row
n_outputs = len(train_y[0])   # length of one label row

# Network: input -> two fully connected layers of 8 units -> softmax output
net = tflearn.input_data(shape=[None, n_inputs])
for hidden_units in (8, 8):
    net = tflearn.fully_connected(net, hidden_units)
net = tflearn.fully_connected(net, n_outputs, activation='softmax')
# regression layer with tflearn's default settings
# (the training log of this cell reports the Adam optimizer)
net = tflearn.regression(net)

# Wrap the network in a DNN model; tensorboard logs go to tflearn_logs/
model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
# Train for 500 epochs in mini-batches of 8, printing metrics as it goes
history = model.fit(
    train_x,
    train_y,
    n_epoch=500,
    batch_size=8,
    show_metric=True,
)
# Persist the trained weights
model.save('models/model.tflearn')

Training Step: 5999  | total loss: [1m[32m0.69567[0m[0m | time: 0.028s
| Adam | epoch: 500 | loss: 0.69567 - acc: 0.7498 -- iter: 88/95
Training Step: 6000  | total loss: [1m[32m0.66251[0m[0m | time: 0.030s
| Adam | epoch: 500 | loss: 0.66251 - acc: 0.7623 -- iter: 95/95
--
INFO:tensorflow:C:\Users\Daniel Krasovski\Documents\GitHub\Project_Oreo\models\model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


In [43]:
import pickle

# Persist the vocabulary, class labels and training features/labels in the
# same models/ directory as the saved network.
# The context manager closes the file handle, so the pickle is flushed to
# disk and the descriptor is not leaked.
with open("models/training_data", "wb") as training_data_file:
    pickle.dump(
        {'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y},
        training_data_file,
    )

