# Exercise Sheet 1, Task 3

This is one possible implementation of the classifier. I added another example to show the advantage of using multiple outputs over a single binary output neuron. This new sample also adds the word "and" to the vocabulary. As a result, our bag-of-words vectors and the weight matrices change slightly as well.

In [16]:
import numpy as np

#The sentences that make up our document corpus
sample_strings = [
    "I really like cats",
    "I mostly like dogs",
    "I really like dogs",
    "I like dogs and cats",
]

def tokenize(sample):
    """
    A tokenizer to split the strings into lowercase tokens
    For simplicity we just split on whitespace. Runs of whitespace (repeated spaces,
    tabs, newlines) count as a single separator and leading/trailing whitespace is
    ignored, so no empty tokens are produced. An empty string gives an empty list.
    :param sample: A string to be tokenized
    :return: a list of lowercase tokens
    """
    #split() without an argument splits on any whitespace and drops empty strings,
    #whereas split(" ") would return '' for every extra space.
    return sample.lower().split()

def build_vocabulary(samples):
    """
    Builds a vocabulary from a list of strings
    Every sample is tokenized with tokenize() and each token is kept once,
    in the order in which it first appears.
    :param samples: A list of strings
    :return: A list of unique tokens
    """
    vocabulary = []
    #The set mirrors the list so that the duplicate check does not have to
    #scan the whole vocabulary for every token.
    seen = set()
    for sample in samples:
        for token in tokenize(sample):
            if token not in seen:
                seen.add(token)
                vocabulary.append(token)
    return vocabulary

#Sort the vocabulary alphabetically so that every word gets a fixed, reproducible index.
vocabulary = sorted(build_vocabulary(sample_strings))
print(vocabulary)

['and', 'cats', 'dogs', 'i', 'like', 'mostly', 'really']


In [17]:
def build_bag_of_words(sample, vocabulary):
    """
    Turns a sample into its bag of words vector
    Entry i holds how often vocabulary[i] occurs among the tokens of the sample.
    Tokens that are not part of the vocabulary are ignored.
    :param sample: A string
    :param vocabulary: A list of unique tokens
    :return: A numpy array of shape (len(vocabulary),1) containing the token counts
    """
    sample_tokens = tokenize(sample)
    counts = np.zeros((len(vocabulary), 1))
    for position, entry in enumerate(vocabulary):
        #Count the occurrences of this vocabulary entry in the tokenized sample.
        counts[position] = sample_tokens.count(entry)
    return counts

bow_samples = []
print(vocabulary)
for sample in sample_strings:
    bow_vector = build_bag_of_words(sample, vocabulary)
    bow_samples.append(bow_vector)
    #Print each new BoW vector right after it has been added.
    #The vector is a column, so we transpose it with the .T attribute to print it on one line.
    #The transpose() function would do the same.
    print("{}:{}".format(sample, bow_vector.T))


['and', 'cats', 'dogs', 'i', 'like', 'mostly', 'really']
I really like cats:[[0. 1. 0. 1. 1. 0. 1.]]
I mostly like dogs:[[0. 0. 1. 1. 1. 1. 0.]]
I really like dogs:[[0. 0. 1. 1. 1. 0. 1.]]
I like dogs and cats:[[1. 1. 1. 1. 1. 0. 0.]]


In [18]:

#The columns follow the sorted vocabulary: and, cats, dogs, i, like, mostly, really.
#The first row produces the output for "cats", the second row the output for "dogs".
#First, let's build our ideal classifier. It only looks at the words "cats" and "dogs".
good_weights = np.matrix([
    [0, 1, 0, 0, 0, 0, 0],
    [0, 0, 1, 0, 0, 0, 0],
])
#This one looks at the words "really" (first row) and "mostly" (second row) instead.
#It works for the first two samples but not for the other ones.
bad_weights = np.matrix([
    [0, 0, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 0, 1, 0],
])

In [19]:
class classifier:
    """
    A linear classifier that scores a bag of words vector with a fixed weight matrix.
    """

    def __init__(self, weights):
        """
        Stores the weight matrix that classify() applies to its inputs.
        :param weights: a numpy matrix of shape (2,7) containing the weights for the classifier
        """
        self.weights = weights

    def classify(self, bow_input):
        """
        Applies the stored weights to a bag of words vector.
        The result has two entries: the first one is the classifier output for the topic "cats",
        the second one is the classifier output for the topic "dogs".
        :param bow_input: a numpy array containing the bag of words representation of the input
                          sentence, with one entry per column of the weight matrix
        :return: a numpy matrix of shape (2,1) containing the classifier output
        """
        weight_matrix = self.weights
        #The '@' operator is numpy's matrix multiplication.
        scores = weight_matrix @ bow_input
        return scores

In [20]:
def labels_to_string(labels):
    """
    Translates the classifier output into a readable sentence.
    A topic counts as detected when its entry in the output is greater than zero.
    :param labels: a numpy matrix of shape (2,1) containing the classifier output
    :return: a string describing the classifier output
    """
    topic_names = ("cats", "dogs")
    #Entry 0 belongs to "cats" and entry 1 to "dogs".
    detected = [name for index, name in enumerate(topic_names) if labels[index] > 0]
    if not detected:
        return "Sentence is about neither cats nor dogs."
    return "Sentence is about {}.".format(" and ".join(detected))

In [21]:
good_classifier = classifier(good_weights)
bad_classifier = classifier(bad_weights)
#Run both classifiers on every sample and print the raw output next to its interpretation.
for sentence, bow_vector in zip(sample_strings, bow_samples):
    good_output = good_classifier.classify(bow_vector)
    bad_output = bad_classifier.classify(bow_vector)
    print("Sample: {}".format(sentence))
    print("Good classifier output: {}. {}".format(good_output.reshape(2,), labels_to_string(good_output)))
    print("Bad classifier output: {}. {}\n".format(bad_output.reshape(2,), labels_to_string(bad_output)))

Sample: I really like cats
Good classifier output: [[1. 0.]]. Sentence is about cats.
Bad classifier output: [[1. 0.]]. Sentence is about cats.

Sample: I mostly like dogs
Good classifier output: [[0. 1.]]. Sentence is about dogs.
Bad classifier output: [[0. 1.]]. Sentence is about dogs.

Sample: I really like dogs
Good classifier output: [[0. 1.]]. Sentence is about dogs.
Bad classifier output: [[1. 0.]]. Sentence is about cats.

Sample: I like dogs and cats
Good classifier output: [[1. 1.]]. Sentence is about cats and dogs.
Bad classifier output: [[0. 0.]]. Sentence is about neither cats nor dogs.

