# Nevralt nettverk for tekstklassifisering

AI og maskinlæringen er hyppig brukte ord i media og jeg en stund vært interessert i å lære mer om hvordan det egentlig fungerer. Bakgrunnen for prosjektet er dermed egeninteresse for å forstå de grunnleggende ideene bak maskinlæringsalgoritmer ved å selv implementere noe fra bunnen av. Jeg valgte å se på et nevralt nettverk, sterkt inspirert av youtube-serien om emnet av 3Blue1Brown. 

Jeg er interessert i språk og syntes ideen om å bruke AI til å analysere naturlig språk var spennnede. I tillegg var jeg interessert i å lære mer om ulike verktøy for tekstbehandling. Dermed valgte jeg å bruke det nevrale nettverket til tekstklassifisering. 

Koden er egenskrevet, men jeg har lest om teorien bak og hentet inspirasjon til kode fra ulike kilder. For det nevrale nettverket har <a href="http://neuralnetworksanddeeplearning.com/chap2.html">kapittel 2</a> av Michael Nielsens gratis online bok om Nevralte nettverk vært til stor hjelp. For tekstanalysen har jeg delvis basert meg på <a href = "https://machinelearnings.co/text-classification-using-neural-networks-f5cd7b8765c6">denne </a> artikkelen. 

In [16]:
# importing necessary packages
import nltk
from nltk.stem.lancaster import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import numpy as np
import pickle
stemmer = LancasterStemmer()
stopWords = set(stopwords.words('english'))
datapath = '/../Data/'

**Funksjoner for å renske tekst-input og vektorisere datasett**


Jeg har i første omgang brukt "bag-of-words" til å vektorisere tekststrenger. Det vil si at hvert tekststreng representeres av en vektor av 0-er og 1-ere: for hvert ord i en forhåndsdefinert base-vektor vil vektoren ha enten en 0 dersom orden ikke finnes i tekststrengen, eller et 1-tall dersom ordet finnes der. Senere i oppgaven tester jeg om jeg får bedre presisjon ved å bruke en mer avansert form for vektorisering. 

In [2]:
#takes in a string and returns a list of stemmed lowercase words
def clean_text(t):
    t_clean = [stemmer.stem(word) for word in (t.lower()).split()]
    for i in range(0, len(t_clean)):
        t_clean[i] = ''.join(filter(str.isalnum, t_clean[i]))
    return t_clean


#creates a vector of words and a vector of labels to use as a basis for vectorising text and labels
def create_basises(training_data):
    wordvec_basis = []
    labels = []
    for el in training_data:
        labels.append(el["class"])
        t_clean = clean_text(el["sentence"])
        for w in t_clean:
            if w not in stopWords: 
                wordvec_basis.append(w)
    return list(set(wordvec_basis)), list(set(labels))

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(t, words):
    t_clean = clean_text(t)
    t_vec = []
    for w in words:
        if w in t_clean:
            t_vec.append(1)
        else:
            t_vec.append(0)
    t_vec = np.array(t_vec)
    return t_vec

#returns an array of 0s and 1s according to the label, based on label_basis
def label_to_vec(l, label_basis):
    l_vec = np.zeros(len(label_basis))
    for i in range(0, len(label_basis)):
        if l == label_basis[i]:
            l_vec[i] += 1
    return l_vec

#vectorises a dataset(list of dictionaries giving sentence and class of each element)
def vectorize_dataset(data, word_basis, label_basis):
    text = []
    label = []
    for el in data:
        t_vec = bow(el["sentence"], word_basis)
        l_vec = label_to_vec(el["class"], label_basis)
        text.append(t_vec)
        label.append(l_vec)
    return np.array(text), np.array(label)

**Hjelpefunksjoner og funksjoner for å teste og trene det nevrale netverket**

In [3]:
# compute the sigmoid function
def sigmoid(x):
    output = 1/(1+np.exp(-x))
    return output

# convert output of sigmoid function to its derivative
def sigmoid_derivative(output):
    return output*(1-output)

#Intitializes empty arrays for storing weights and biases based on a given shape for the neural network
def empty_wb(shape):
    weights = []
    biases = []
    for i in range(len(shape)-1):
        weights.append(np.zeros((shape[i+1], shape[i])))
        biases.append(np.zeros(shape[i+1]))
    return weights, biases

#Intitializes random weights and biases based on a given shape for the neural network
def init_random_wb(shape):
    weights = []
    biases= []
    for i in range(len(shape)-1):
        weights.append(2*(np.random.rand(shape[i+1], shape[i]) - 0.5))
        biases.append(2*(np.random.rand(shape[i+1]) - 0.5))
    return weights, biases

#returns the output-layer of the neural network based on input, weights and biases
def make_guess(inp, weights, biases):
    guess = inp
    for n in range(len(weights)):
        guess = sigmoid(weights[n] @ guess + biases[n])
    return guess

#computes the ideal change in all weights and biases based on one instance of input/output
def back_propagation(el_in,el_out, shape, weights, biases):
    w_change, b_change = empty_wb(shape)
    guess = [el_in]
    for j in range(0,len(weights)):
        guess.append(sigmoid(weights[j] @ guess[j] + biases[j]))

    error = sigmoid_derivative(guess[len(weights)]) * (guess[len(weights)] - el_out)
    for j in range(len(weights)-1,-1,-1):
        w_change[j]  = np.outer(error, guess[j])
        b_change[j]  = error
        error =  weights[j].T @ (error)
    return w_change, b_change

#training the network
def train(inp, out, N, shape, alpha, batch_size):
    weights, biases = init_random_wb(shape)
    w_temp, b_temp = empty_wb(shape)
        
    for n in range(N):
        for i in range(len(inp)):
            w_change, b_change = back_propagation(inp[i],out[i], shape, weights, biases)
            
            w_temp = [wt + wch for wt, wch in zip(w_temp, w_change)]
            b_temp = [bt + bch for bt, bch in zip(b_temp, b_change)]
            
            if i % batch_size == 0:
                for j in range(len(weights)):
                    weights[j] -= alpha *  w_temp[j]/100
                    biases[j] -= alpha *  b_temp[j]/100
                w_temp, b_temp = empty_wb(shape)
    return weights, biases

#testing the network
def test_network(inp, out, weights, biases, label_basis):
    count = 0
    distribution = np.zeros(len(label_basis))
    for n in range(len(inp)):
        guess = np.argmax(make_guess(inp[n], weights, biases))
        correct = np.argmax(out[n])
        distribution[correct] += 1
        if guess == correct:
            count += 1
    return count / len(inp), distribution / len(inp)

## Enkel test med lite datasett

Her består datasettet kun av 12 lignende og korte setninger som skal klassifiseres i 1 av 3 kategorier. 

In [4]:
# 3 classes of training data
training_data = []
training_data.append({"class":"greeting", "sentence":"how are you?"})
training_data.append({"class":"greeting", "sentence":"how is your day?"})
training_data.append({"class":"greeting", "sentence":"good day"})
training_data.append({"class":"greeting", "sentence":"how is it going today?"})

training_data.append({"class":"goodbye", "sentence":"have a nice day"})
training_data.append({"class":"goodbye", "sentence":"see you later"})
training_data.append({"class":"goodbye", "sentence":"have a nice day"})
training_data.append({"class":"goodbye", "sentence":"talk to you soon"})

training_data.append({"class":"sandwich", "sentence":"make me a sandwich"})
training_data.append({"class":"sandwich", "sentence":"can you make a sandwich?"})
training_data.append({"class":"sandwich", "sentence":"having a sandwich today?"})
training_data.append({"class":"sandwich", "sentence":"what's for lunch?"})

In [5]:
words, classes = create_basises(training_data)
training, output = vectorize_dataset(training_data,words, classes)
weights, biases = train(training, output, 1000, [len(words), 20, len(classes)], 0.5, 1)

In [6]:
test_data = []
test_data.append({"class":"sandwich", "sentence": "sudo make me a sandwich"})
test_data.append({"class":"sandwich", "sentence": "make me some lunch"})
test_data.append({"class":"goodbye", "sentence": "have a nice day"})

test_in, test_out = vectorize_dataset(test_data, words, classes)
count, dist = test_network(test_in, test_out, weights, biases, classes)
print("Precision: ",count, "\nDistribution: ", dist)

Precision:  1.0 
Distribution:  [0.33333333 0.         0.66666667]


## Klassifiserer anmeldeser fra Amazon som positive eller negative


Datasettet er hentet <a href="https://gist.github.com/kunalj101/ad1d9c58d338e20d09ff26bcc06c4235">herfra</a> og inneholder 10 000 korte anmeldelser merket som positive eller negative.

In [9]:
text = []
for el in training_data:
    text.append(" ".join(clean_text(el["sentence"])))
print(text)

vectorizer = TfidfVectorizer()
X = vectorizer.fit(text)
Y = vectorizer.transform(text)
print(vectorizer.vocabulary_)
Y = Y.toarray()
print(Y.shape)

words, classes = create_basises(training_data)
training, output = vectorize_dataset(training_data,words, classes)
weights, biases = train(Y, output, 1000, [len(Y[0]), 20, len(classes)], 0.5,1)


inp1 = " ".join(clean_text("sudo make me a sandwich"))
inp2 = " ".join(clean_text("have a nice day") )
inp3 = " ".join(clean_text("make me some lunch"))
print(classes)
inp = np.array([inp1,inp2,inp3])
X = (vectorizer.transform(inp)).toarray()
out = np.array([[0,1,0], [0,0,1], [0,1,0]])
count, dist = test_network(X, out, weights, biases, classes)
print("Precision: ",count, "\nDistribution: ", dist)


#IT WORKS HALELUJA

['how ar you', 'how is yo day', 'good day', 'how is it going today', 'hav a nic day', 'see you lat', 'hav a nic day', 'talk to you soon', 'mak me a sandwich', 'can you mak a sandwich', 'hav a sandwich today', 'whats for lunch']
{'how': 7, 'ar': 0, 'you': 23, 'is': 8, 'yo': 22, 'day': 2, 'good': 5, 'it': 9, 'going': 4, 'today': 20, 'hav': 6, 'nic': 14, 'see': 16, 'lat': 10, 'talk': 18, 'to': 19, 'soon': 17, 'mak': 12, 'me': 13, 'sandwich': 15, 'can': 1, 'whats': 21, 'for': 3, 'lunch': 11}
(12, 24)
['goodbye', 'greeting', 'sandwich']
Precision:  0.0 
Distribution:  [0.         0.66666667 0.33333333]


Laster inn data Fra filen corpus.txt

In [10]:
#Loading data
data = open('corpus.txt', encoding = "utf8")
lines = data.readlines()
data.close()

#split data into training and testing - lists of dictionaries
training_data = []
testing_data = []
length = len(lines)

i = 0
for line in lines:
    content = line.split()
    if i < length * 0.9:
        training_data.append({"class":content[0], "sentence":" ".join(content[1:])})
    else:
        testing_data.append({"class":content[0], "sentence":" ".join(content[1:])})
    i += 1
    if i% 1000 == 0:
        print(i)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


Gjør om datasettet til vektorer

In [23]:
word_basis, label_basis = create_basises(training_data)
train_data_inp, train_data_out = vectorize_dataset(training_data, word_basis, label_basis)
test_data_inp, test_data_out = vectorize_dataset(testing_data, word_basis, label_basis)

Trener nettverket

In [None]:
nn_shape = [len(train_data_inp[0]), 20,20,  len(train_data_out[0])]
weights, biases = train(train_data_inp, train_data_out, 10, nn_shape, 0.5, 50)

Tester nettverket

In [None]:
count, dist = test_network(test_data_inp, test_data_out, weights, biases, label_basis)
print("Precision: ",count, "\nDistribution: ", dist)

Lagrer det vektoriserte datasettet

In [24]:
def save_data():
    data_dict = {}
    data_dict["train_data_inp"] = train_data_inp
    data_dict["train_data_out"] = train_data_out
    data_dict["test_data_inp"] = test_data_inp
    data_dict["test_data_out"] = test_data_out

    with open(datapath+'data_file.pkl', 'wb') as f :
        pickle.dump(data_dict, f)

Henter data fra fil

In [18]:
def load_data():
    with open(datapath+'data_file.pkl', 'rb') as f :
        data_dict1 = pickle.load(f)

    train_data_inp = data_dict1["train_data_inp"]
    train_data_out = data_dict1["train_data_out"]
    test_data_inp = data_dict1["test_data_inp"]
    test_data_out = data_dict1["test_data_out"]

Lagrer weights and biases

In [29]:
def save_wb():
    wb_dict={}
    wb_dict["w"]=weights
    wb_dict["b"]=biases

    with open(datapath+'wb_file.pkl', 'wb') as f :
            pickle.dump(wb_dict, f)

a = 1