In [None]:
import numpy as np
import nltk
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords

# Load the dataset: every line holds a label token followed by the text tokens.
# The context manager guarantees the file is closed even if reading fails.
with open('corpus.txt', encoding = "utf8") as data:
    lines = data.readlines()

# Tokenise each line on whitespace and drop blank lines: a blank line has no
# label token and would otherwise raise IndexError on content[0].
records = [content for content in (line.split() for line in lines) if content]
length = len(records)

# Split the data into training and testing sets - lists of dictionaries.
# Records whose position is below 90% of the record count go to training,
# the remainder (in file order) go to testing.
training_data = []
testing_data = []
for i, content in enumerate(records):
    sample = {"label": content[0], "text": " ".join(content[1:])}
    if i < length * 0.9:
        training_data.append(sample)
    else:
        testing_data.append(sample)


In [None]:
# This cell builds the basis for vectorizing the text from the training data
# (a list of dictionaries) and a list of the possible labels; see create_basises.
# stemmer: shared Lancaster stemmer instance, called by clean_text.
stemmer = LancasterStemmer()
# stopWords: English stop words held in a set; create_basises checks tokens against it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available locally - confirm.
stopWords = set(stopwords.words('english'))

def clean_text(t):
    """Normalise a text into a list of stemmed, alphanumeric-only tokens.

    The text is lower-cased and split on whitespace. Every token first has
    its non-alphanumeric characters removed and is then stemmed with the
    module-level ``stemmer``.

    Punctuation is stripped *before* stemming on purpose: a token such as
    "handsome," does not end in any suffix the stemmer knows, so stemming it
    with the comma attached leaves it unstemmed and it would no longer match
    the same word written without punctuation.

    Tokens that consist only of punctuation are dropped instead of being
    returned as empty strings.

    Args:
        t: The raw text (str).

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    tokens = []
    for raw_word in t.lower().split():
        word = ''.join(filter(str.isalnum, raw_word))
        if word:  # skip tokens that were punctuation only
            tokens.append(stemmer.stem(word))
    return tokens

def create_basises(training_data):
    """Build the word basis and the label basis from the training samples.

    Args:
        training_data: Iterable of dicts with a "label" entry and a "text"
            entry (str).

    Returns:
        tuple[list, list]: ``(wordvec_basis, labels)``. ``wordvec_basis``
        holds every distinct cleaned token that is not a stop word, in order
        of first appearance; ``labels`` holds every distinct label, in order
        of first appearance.
    """
    # Tokens coming out of clean_text are normalised (stemmed, punctuation
    # removed), so the stop words must be normalised the same way before the
    # comparison; otherwise stop words whose cleaned form differs from their
    # raw form slip into the basis. The raw forms are kept as well.
    excluded = set(stopWords)
    for stop_word in stopWords:
        excluded.update(clean_text(stop_word))
    excluded.add("")  # never let an empty token become a basis entry

    wordvec_basis = []
    labels = []
    # Sets give constant-time "already seen" checks; the lists keep the order.
    seen_words = set()
    seen_labels = set()
    for el in training_data:
        label = el["label"]
        if label not in seen_labels:
            seen_labels.add(label)
            labels.append(label)
        for w in clean_text(el["text"]):
            if w not in seen_words and w not in excluded:
                seen_words.add(w)
                wordvec_basis.append(w)
    return wordvec_basis, labels

In [None]:
#takes in a text and a basis for vectorizing the text. Cleans the text and returns a word vector according to the given basis
def text_to_vec(t, words):
    """Convert a text into a relative-frequency vector over ``words``.

    The text is cleaned with ``clean_text``; entry ``i`` of the result is the
    number of cleaned tokens equal to ``words[i]`` divided by the total
    number of tokens that matched any basis word.

    If no token matches the basis the all-zero vector is returned as is,
    instead of dividing by zero and producing NaN entries.

    Args:
        t: The raw text (str).
        words: Sequence of basis words.

    Returns:
        numpy.ndarray: Float vector of length ``len(words)``.
    """
    # Count every token once up front instead of rescanning the token list
    # for each basis word.
    counts = {}
    for token in clean_text(t):
        counts[token] = counts.get(token, 0) + 1

    t_vec = np.array([counts.get(w, 0) for w in words], dtype = "float")
    total = t_vec.sum()
    if total > 0:  # guard against 0/0 when the text shares no word with the basis
        t_vec = t_vec/total
    return t_vec

def label_to_vec(l, label_basis):
    """One-hot encode a label against ``label_basis``.

    Args:
        l: The label to encode.
        label_basis: Sequence of known labels.

    Returns:
        numpy.ndarray: Float vector of length ``len(label_basis)`` that is
        incremented by 1 at every position whose basis label equals ``l``.
        A label that is not in the basis yields the all-zero vector.
    """
    l_vec = np.zeros(len(label_basis))
    for i, candidate in enumerate(label_basis):
        if l == candidate:
            l_vec[i] += 1
    # No print here: this runs once per sample, so printing floods the output.
    return l_vec

In [None]:
def vectorize_dataset(data, word_basis, label_basis):
    """Vectorize every sample in ``data``.

    Each sample's "text" entry is passed to ``text_to_vec`` with
    ``word_basis`` and its "label" entry to ``label_to_vec`` with
    ``label_basis``.

    Returns:
        list[dict]: One ``{"output": label_vector, "input": text_vector}``
        dict per sample, in input order.
    """
    def encode(sample):
        # Text first, then label: the same call order as a plain loop.
        features = text_to_vec(sample["text"], word_basis)
        target = label_to_vec(sample["label"], label_basis)
        return {"output": target, "input": features}

    return [encode(sample) for sample in data]

In [None]:
# Manual check of clean_text on a sentence with punctuation, mixed case,
# a double space and repeated words; prints the resulting token list.
t = "Hello, there handsome, you are great!  WOW a a a"
print(clean_text(t))

In [None]:
# Smoke test on the first ten training samples: build both bases from them,
# vectorize the same samples, and print the results for inspection.
sample_rows = training_data[:10]
word_basis, label_basis = create_basises(sample_rows)
print(word_basis)
vec_set = vectorize_dataset(sample_rows, word_basis, label_basis)

print(training_data[:2])
print("\n\n\n", vec_set)


In [None]:
#sigmoid function to normalize output
def sigmoid(x):
    """Logistic sigmoid, 1 / (1 + exp(-x)), evaluated without overflow.

    Uses the identity 1/(1+exp(-x)) == exp(-log(1+exp(-x))) ==
    exp(-logaddexp(0, -x)). The direct form computes exp(-x), which
    overflows for large negative ``x`` and emits a RuntimeWarning;
    ``np.logaddexp`` evaluates the logarithm of the sum without forming the
    huge intermediate value.

    Args:
        x: Scalar or numpy array.

    Returns:
        Value(s) in [0, 1] with the same shape as ``x``.
    """
    return np.exp(-np.logaddexp(0, -x))


#derivative of sigmoid output
def sigmoid_output_to_derivative(s_x):
    """Return ``s_x * (1 - s_x)`` for a value (or array) ``s_x``.

    When ``s_x`` is the output of the sigmoid function this is the
    sigmoid's derivative at the corresponding input.
    """
    complement = 1 - s_x
    return s_x * complement