## Übung 6 - Vanessa Schreck, Lisa Mattrisch
# Perceptron Learning

In [1]:
import pandas as pd
import numpy as np
import random
import itertools
from sklearn.model_selection import train_test_split

### Loading Data

In [2]:
### Read the raw table (the file has no header row) and turn it into a plain array:
dataframe = pd.read_csv('~/Dokumente/Mustererkennung_Rojas/Iris_Data/iris_data.txt',
                        header = None)
data_array = np.array(dataframe)

### The class names that are matched against the last entry of every row:
data_labels = ["Iris-setosa",
               "Iris-versicolor",
               "Iris-virginica"
               ]

### Group the rows by class; the class name itself (last entry) is dropped
### from the stored row. Rows with any other last entry are ignored.
data = {label: [] for label in data_labels}
for row in data_array:
    species = row[-1]
    if species in data:
        data[species].append(row[:-1])

### Split every class on its own into a train set and a test set (80-20).
### The fixed random_state keeps the split reproducible.
train = {}
test = {}
for label, samples in data.items():
    train[label], test[label] = train_test_split(np.array(samples),
                                                 test_size=0.2,
                                                 random_state=13
                                                 )

### Classifier

In [3]:
class Classifier:
    """Two-class perceptron with an optional pocket variant.

    Every sample is extended by a leading constant 1 so that the bias is
    learned as part of the weight vector, and the samples of the negative
    class are negated. A weight vector ``w`` then treats a (transformed)
    training sample ``x`` as correct exactly when ``dot(x, w) >= 0``.

    Attributes set by ``fit``:
        pos_label, neg_label: the labels returned by ``predict``.
        train: the transformed training samples (bias column added,
            negative class negated).
        num_data: number of training samples.
        counter: number of weight updates that were performed.
        currently_correct: boolean mask of the samples already verified as
            correct for the current weight vector.
        best_num_so_far: largest number of verified samples reached by any
            weight vector (only maintained when ``do_pocket`` is True).
        pocket_w: the weight vector that reached ``best_num_so_far``.
        separating_w: the weight vector used by ``predict``.
    """

    def __init__(self,
                 max_iter = 10000,
                 do_pocket = False,
                 ):
        """Store the training options.

        Args:
            max_iter: upper bound on the number of weight updates.
            do_pocket: if True, keep the best weight vector seen so far
                ("pocket") and use it for prediction instead of the last one.
        """
        self.max_iter = max_iter      ### Algo should stop after max_iter updates.
        self.do_pocket = do_pocket    ### Indicates whether or not pocket algo should be used

    @staticmethod
    def _add_bias_column(points):
        """Return a new float array equal to ``points`` with a leading column of ones."""
        points = np.asarray(points, dtype=float)
        bias = np.ones((len(points), 1))
        return np.concatenate((bias, points), axis = 1)

    def fit(self,
            train_label1, # positive
            train_label2, # negative
            label1,
            label2
           ):
        """Learn a separating weight vector for the two given classes.

        Args:
            train_label1: 2-d array-like, samples of the positive class.
            train_label2: 2-d array-like, samples of the negative class.
            label1: label reported for the positive class.
            label2: label reported for the negative class.

        Raises:
            ValueError: if no training sample is given at all.

        The input arrays are not modified.
        """
        self.pos_label = label1
        self.neg_label = label2

        ### Extend the data by an additional dimension to avoid a constant term theta:
        positives = self._add_bias_column(train_label1)
        negatives = self._add_bias_column(train_label2)
        self.num_data = len(positives) + len(negatives)
        if self.num_data == 0:
            raise ValueError("fit() needs at least one training sample")

        ### Negate the data of the negative label, so there is only one class left,
        ### and merge the data of the two classes:
        self.train = np.concatenate((positives, -negatives))

        ##################################
        ###          ALGORITHM         ###
        ##################################

        ### Initializing w_0 to one data point (first in list). The copy is
        ### essential: a plain index would return a view, and updating w would
        ### then overwrite the first training sample.
        current_w = self.train[0].copy()
        self.pocket_w = current_w.copy()

        self.counter           = 0
        self.currently_correct = np.zeros(self.num_data, dtype = bool)
        self.best_num_so_far   = 0
        while self.counter < self.max_iter:
            ### Check if we are done already:
            num_correct = int(self.currently_correct.sum())
            if num_correct == self.num_data:
                break
            ### if not, test a data point that has not been verified yet:
            current_x, pos = self.get_random_data_point()
            if np.dot(current_x, current_w) >= 0:
                ### current_x is classified correctly by current_w; flag it.
                ### pos was unflagged before, so the count grows by exactly one.
                self.currently_correct[pos] = True
                num_correct += 1
                ### Pocket algo: if current_w is better than the so-far best w,
                ### put a *copy* of it into the pocket, so later updates of
                ### current_w cannot change the pocket.
                if self.do_pocket and num_correct > self.best_num_so_far:
                    self.best_num_so_far = num_correct
                    self.pocket_w = current_w.copy()
            else:
                current_w = current_w + current_x
                ### Reset self.currently_correct to False for this new direction w:
                self.currently_correct = np.zeros(self.num_data, dtype = bool)
                self.counter += 1

        ### Store separating hyperplane:
        if self.do_pocket:
            self.separating_w = self.pocket_w
        else:
            self.separating_w = current_w

    def get_random_data_point(self):
        """Pick a random training sample that is not yet flagged as correct.

        Returns:
            (x, pos): the transformed sample and its row index in ``self.train``.
        """
        unverified = np.flatnonzero(~self.currently_correct)
        pos = random.choice(unverified)
        return self.train[pos], pos

    def predict(self,test):
        """Return the predicted label for every row of ``test``.

        A sample is assigned the negative label when its scalar product with
        the separating w is negative, and the positive label otherwise.
        """
        ### Add an extra dimension, as with the training data:
        test = self._add_bias_column(test)

        ### For each data point, calculate its scalar product with the separating w.
        scores = np.dot(test, self.separating_w)

        ### Convert the sign into string-labels (index 0: positive, index 1: negative):
        L = np.array([self.pos_label, self.neg_label])
        pred_labels = L[(scores < 0).astype(int)]

        return pred_labels

    def accuracy(self, test, labels):
        """Return the fraction of rows in ``test`` whose prediction equals ``labels``."""
        pred_labels = self.predict(test)
        num_hits = np.sum(pred_labels == np.asarray(labels))
        return float(num_hits) / float(len(test))

In [4]:
### One classifier object (pocket variant) is re-fitted for every pair of classes.
cl = Classifier(do_pocket = True)

for label1, label2 in itertools.combinations(data_labels, 2):
    ### label1 is passed as the positive class, label2 as the negative class:
    cl.fit(train[label1], train[label2], label1, label2)

    ### Stack the test data of the two classes and build the matching label vector.
    test_pos = test[label1]
    test_neg = test[label2]
    test_data   = np.concatenate((test_pos, test_neg))
    test_labels = np.concatenate(([label1]*len(test_pos), [label2]*len(test_neg)))

    ### Report the accuracy on the test set as a percentage with two decimals.
    acc = cl.accuracy(test_data, test_labels)
    percent = str(round(100*acc, 2))
    message = ("Accurracy for separating labels " + str(label1) + " and "
               + str(label2) + " is " + percent + "%")
    print(message)

Accurracy for separating labels Iris-setosa and Iris-versicolor is 100.0%
Accurracy for separating labels Iris-setosa and Iris-virginica is 100.0%
Accurracy for separating labels Iris-versicolor and Iris-virginica is 95.0%
