In [None]:
# use natural language toolkit
import nltk
from nltk.stem.lancaster import LancasterStemmer
import os
import json
import datetime
from nltk.corpus import stopwords
# Shared Lancaster stemmer instance; used by clean_text() to reduce words to stems.
stemmer = LancasterStemmer()
# English stop-word list as a set, so the membership checks in create_basises() are cheap.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded
# beforehand (nltk.download('stopwords')) - confirm for a fresh environment.
stopWords = set(stopwords.words('english'))

In [None]:
# Toy training set: 3 classes ("greeting", "goodbye", "sandwich"), 4 sentences each.
# Every entry is a dict with the keys "class" and "sentence".
# Note that "have a nice day" is listed twice under "goodbye".
training_data = [
    {"class": label, "sentence": sentence}
    for label, sentence in [
        ("greeting", "how are you?"),
        ("greeting", "how is your day?"),
        ("greeting", "good day"),
        ("greeting", "how is it going today?"),

        ("goodbye", "have a nice day"),
        ("goodbye", "see you later"),
        ("goodbye", "have a nice day"),
        ("goodbye", "talk to you soon"),

        ("sandwich", "make me a sandwich"),
        ("sandwich", "can you make a sandwich?"),
        ("sandwich", "having a sandwich today?"),
        ("sandwich", "what's for lunch?"),
    ]
]

In [None]:
import numpy as np
import time

 
def clean_text(t):
    """Turn a raw sentence into a list of stemmed, alphanumeric-only tokens.

    The sentence is lower-cased and split on whitespace. Each token is first
    stripped of every non-alphanumeric character and only then stemmed, so
    that trailing punctuation ("sandwiches?") cannot hide the suffix from the
    stemmer and a word yields the same stem with or without punctuation
    attached. Tokens that consist of punctuation only are dropped instead of
    being returned as empty strings.

    Parameters:
        t: the sentence to clean (str).

    Returns:
        list of str: the stemmed tokens, in sentence order.
    """
    tokens = []
    for raw in t.lower().split():
        # Strip punctuation BEFORE stemming; the stemmer works on word endings.
        word = ''.join(filter(str.isalnum, raw))
        if word:
            tokens.append(stemmer.stem(word))
    return tokens


def create_basises(training_data):
    """Build the word basis (vocabulary) and the label basis of a dataset.

    Parameters:
        training_data: iterable of dicts with the keys "class" (label) and
            "sentence" (raw text).

    Returns:
        (words, labels): two lists of unique strings. Both are sorted so that
        the index of every word/label is identical on every run. Iterating a
        plain set of strings gives an order that can change between
        interpreter sessions, which would silently permute the meaning of the
        input and output vector positions.
    """
    vocabulary = set()
    label_set = set()
    for el in training_data:
        label_set.add(el["class"])
        # NOTE(review): stop words are compared against the *stemmed* tokens
        # returned by clean_text, so a stop word whose stem differs from its
        # surface form is not filtered - confirm this is intended.
        vocabulary.update(w for w in clean_text(el["sentence"]) if w not in stopWords)
    return sorted(vocabulary), sorted(label_set)

# logistic (sigmoid) nonlinearity
def sigmoid(x):
    """Return 1 / (1 + e^(-x)); NumPy applies it element-wise to arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# derivative of the sigmoid, written in terms of the sigmoid's own output
def sigmoid_derivative(output):
    """Given s = sigmoid(x), return s * (1 - s), i.e. the slope of the sigmoid at x."""
    complement = 1 - output
    return complement * output

# bag-of-words encoding: one 0/1 entry per basis word, 1 if the word occurs in the sentence
def bow2(sentence, words):
    """Encode `sentence` as a 0/1 NumPy vector over the basis `words`.

    The sentence is tokenized with clean_text; position i of the result is 1
    when words[i] equals one of the cleaned tokens and 0 otherwise.
    """
    present = clean_text(sentence)
    bag = [1 if w in present else 0 for w in words]
    return np.array(bag)

def bow(t, words):
    """Return the bag-of-words vector of text `t` over the basis `words`.

    Entry i is 1 if words[i] is among the tokens produced by clean_text(t),
    else 0. The result is a NumPy array with one entry per basis word.
    """
    tokens = clean_text(t)
    return np.array([int(w in tokens) for w in words])


def make_guess(inp, weights, biases):
    """Run a forward pass through the network and return the last layer's output.

    Each layer computes sigmoid(W @ activation + b), starting from `inp` and
    feeding the result into the next layer.
    """
    activation = inp
    for layer, w in enumerate(weights):
        activation = sigmoid(w @ activation + biases[layer])
    return activation

def empty_wb(shape):
    """Create all-zero weight matrices and bias vectors for a network of the given shape.

    `shape` lists the layer sizes from input to output. For each pair of
    consecutive sizes (n_in, n_out) there is one (n_out, n_in) weight matrix
    and one bias vector of length n_out.

    Returns:
        (weights, biases): two lists of NumPy arrays, one entry per layer transition.
    """
    layer_dims = list(zip(shape[:-1], shape[1:]))
    weights = [np.zeros((n_out, n_in)) for n_in, n_out in layer_dims]
    biases = [np.zeros(n_out) for _, n_out in layer_dims]
    return weights, biases

# Initializes random weights and biases based on a given shape for the neural network
def init_random_wb(shape):
    """Draw uniformly random weights and biases in [-1, 1) for every layer transition.

    `shape` lists the layer sizes from input to output. For each transition
    the weight matrix is drawn first and the bias vector second, so the
    sequence of draws from NumPy's global random state is
    weights[0], biases[0], weights[1], biases[1], ...

    Returns:
        (weights, biases): two lists of NumPy arrays, one entry per layer transition.
    """
    weights, biases = [], []
    for layer in range(1, len(shape)):
        n_in, n_out = shape[layer - 1], shape[layer]
        # rand() is uniform on [0, 1); shift and scale it to [-1, 1)
        weights.append((np.random.rand(n_out, n_in) - 0.5) * 2)
        biases.append((np.random.rand(n_out) - 0.5) * 2)
    return weights, biases

In [None]:
def label_to_vec(l, label_basis):
    """One-hot encode label `l` against `label_basis`.

    Returns a float NumPy vector with one entry per basis label: 1.0 at every
    position whose label equals `l`, 0.0 elsewhere (all zeros if `l` is not
    in the basis).
    """
    matches = [float(l == candidate) for candidate in label_basis]
    return np.array(matches, dtype=float)

def vectorize_dataset(data, word_basis, label_basis):
    """Convert a list of {"class", "sentence"} dicts into input and target arrays.

    Each sentence is encoded with bow() over `word_basis` and each class with
    label_to_vec() over `label_basis`. The running sample index is printed
    every 100 samples (starting with 0) as a progress indicator.

    Returns:
        (inputs, targets): two NumPy arrays with one row per sample.
    """
    sentence_vecs, label_vecs = [], []
    for idx, el in enumerate(data):
        if idx % 100 == 0:
            print(idx)
        sentence_vecs.append(bow(el["sentence"], word_basis))
        label_vecs.append(label_to_vec(el["class"], label_basis))
    return np.array(sentence_vecs), np.array(label_vecs)

def test_network(inp, out, weights, biases, label_basis):
    """Evaluate the network on a labelled set.

    For every sample the predicted class is the argmax of make_guess() and
    the true class is the argmax of the corresponding row of `out`.

    Returns:
        (accuracy, distribution): the fraction of samples predicted correctly,
        and a vector giving, per label index, the fraction of samples whose
        TRUE class is that label (the class balance of the set, not of the
        predictions).
    """
    n_samples = len(inp)
    hits = 0
    class_counts = np.zeros(len(label_basis))
    for n, sample in enumerate(inp):
        predicted = np.argmax(make_guess(sample, weights, biases))
        actual = np.argmax(out[n])
        class_counts[actual] += 1
        if predicted == actual:
            hits += 1
    return hits / n_samples, class_counts / n_samples

In [None]:
def train(inp, out, N, shape, alpha, batch_size=100):
    """Train a fully connected sigmoid network with mini-batch gradient descent.

    The loss gradient used at the output layer is (prediction - target), i.e.
    that of a squared-error loss; it is back-propagated through the sigmoid
    layers and averaged over each mini-batch.

    Parameters:
        inp: sequence of input vectors, one per sample.
        out: sequence of target vectors, aligned with `inp`.
        N: number of passes (epochs) over the data.
        shape: list of layer sizes from input to output.
        alpha: learning rate.
        batch_size: number of samples whose gradients are averaged per weight
            update (default 100). The last batch of an epoch may be smaller.

    Returns:
        (weights, biases): lists of NumPy arrays, one entry per layer transition.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    weights, biases = init_random_wb(shape)
    w_change, b_change = empty_wb(shape)
    n_layers = len(weights)
    n_samples = len(inp)
    # guess[0] is the network input, guess[j+1] the activation after layer j
    guess = [0]*(n_layers + 1)

    for n in range(N):
        print(n)
        pending = 0  # samples accumulated since the last weight update
        for i in range(n_samples):
            # forward pass, keeping every layer's activation for the backward pass
            guess[0] = inp[i]
            for j in range(n_layers):
                guess[j+1] = sigmoid(weights[j] @ guess[j] + biases[j])

            # delta of the output layer
            error = sigmoid_derivative(guess[n_layers]) * (guess[n_layers] - out[i])
            for j in range(n_layers-1, -1, -1):
                w_change[j] += np.outer(error, guess[j])
                b_change[j] += error
                if j > 0:
                    # The delta of the previous layer needs that layer's own
                    # sigmoid derivative; without it the hidden-layer
                    # gradients are wrong. guess[0] is the raw input, so
                    # nothing is propagated past layer 0.
                    error = (weights[j].T @ error) * sigmoid_derivative(guess[j])
            pending += 1

            # Update after a full batch, and also at the end of the epoch so
            # that a trailing partial batch is neither carried into the next
            # epoch nor discarded after the final one.
            if pending == batch_size or i == n_samples - 1:
                for j in range(n_layers):
                    weights[j] -= alpha * w_change[j] / pending
                    biases[j] -= alpha * b_change[j] / pending
                w_change, b_change = empty_wb(shape)
                pending = 0
    return weights, biases

In [None]:
# Build the vocabulary (`words`) and label list (`classes`) from the toy dataset.
words, classes = create_basises(training_data)
# Encode every toy sentence as a bag-of-words vector and every class as a one-hot vector.
training, output = vectorize_dataset(training_data,words, classes)
# Train for 1000 epochs with learning rate 0.5 on a network with one hidden
# layer of 20 units; input size = vocabulary size, output size = number of classes.
weights, biases = train(training, output, 1000, [len(words), 20, len(classes)], 0.5)

In [None]:
# Show the label order; index k of the network output corresponds to classes[k].
print(classes)

In [None]:
# Sanity check of the toy network on three hand-written sentences.
inp1 = bow("sudo make me a sandwich", words)
inp2 = bow("have a nice day", words)
inp3 = bow("make me some lunch", words)
inp = np.array([inp1,inp2,inp3])
# Derive the expected one-hot targets from `classes` instead of hard-coding
# index positions: the position of each label depends on the order of
# `classes`, so literal rows such as [0,0,1] are only right by coincidence.
out = np.array([
    label_to_vec("sandwich", classes),
    label_to_vec("goodbye", classes),
    label_to_vec("sandwich", classes),
])
# `count` is the fraction of correct predictions, `dist` the true-class distribution.
count, dist = test_network(inp, out, weights, biases, classes)
print(count,"   ", dist)

In [None]:
# Load the labelled corpus. Every non-blank line is expected to look like
# "<class> <sentence words...>": the first whitespace-separated token is the
# label, the remainder is the sentence.
with open('corpus.txt', encoding = "utf8") as data:
    # The context manager closes the file even if reading fails. Blank lines
    # are dropped here because they carry no label and would otherwise raise
    # an IndexError below.
    lines = [line for line in data if line.strip()]

# split data into training and testing - lists of dictionaries
training_data = []
testing_data = []
length = len(lines)

# The first 90% of the lines (in file order) become training data, the rest test data.
for i, line in enumerate(lines):
    content = line.split()
    sample = {"class":content[0], "sentence":" ".join(content[1:])}
    if i < length * 0.9:
        training_data.append(sample)
    else:
        testing_data.append(sample)


In [None]:
# Vocabulary and label list are built from the training split only, so words
# that occur only in the test split have no position in the input vectors.
word_basis, label_basis = create_basises(training_data)
# Encode both splits against the same bases so their vector layouts match.
train_data_inp, train_data_out = vectorize_dataset(training_data, word_basis, label_basis)
test_data_inp, test_data_out = vectorize_dataset(testing_data, word_basis, label_basis)

In [None]:
# Network layout: input = vocabulary size, two hidden layers of 20 units each,
# output = number of labels.
nn_shape = [len(word_basis), 20,20,  len(label_basis)]
# Train for 10 epochs with learning rate 0.5 on at most the first 4500 training samples.
# NOTE(review): the 4500 cut-off is hard-coded; any training samples beyond it
# are unused - confirm this is intended for the corpus size.
weights, biases = train(train_data_inp[0:4500], train_data_out[0:4500], 10, nn_shape, 0.5)

In [None]:
# Evaluate on the held-out split: `count` is the fraction of correct predictions,
# `dist` the share of test samples per true label (a baseline for the accuracy).
count, dist = test_network(test_data_inp, test_data_out, weights, biases, label_basis)
print(count, dist)