# Aufgabe 1 - k-NN Klassifikation

## Teilaufgabe a)

## Teilaufgabe b)

In [2]:
import numpy as np
import pandas as pd

In [3]:
def dist(a,b):
    return np.linalg.norm(a-b)


class KNN:
    '''KNN Classifier.

    Attributes
    ----------
    k : int
        Number of neighbors to consider.
    '''
    def __init__(self, k):
        '''Initialization.
        Parameters are stored as member variables/attributes.
        
        Parameters
        ----------
        k : int
            Number of neighbors to consider.
        '''
        self.k = k

    def fit(self, X, y):
        '''Fit routine.
        Training data is stored within object.
        
        Parameters
        ----------
        X : numpy.array, shape=(n_samples, n_attributes)
            Training data.
        y : numpy.array shape=(n_samples)
            Training labels.
        '''
        self.training_data = X
        self.training_labels = y

    def predict(self, X):
        '''Prediction routine.
        Predict class association of each sample of X.
        
        Parameters
        ----------
        X : numpy.array, shape=(n_samples, n_attributes)
            Data to classify.
        
        Returns
        -------
        prediction : numpy.array, shape=(n_samples)
            Predictions, containing the predicted label of each sample.
        '''
        
        # First we need to find a few parameters
        n_attributes, n_samples = X.shape # for whatever reason no parentheses after shape...
        n_training_data = np.size(self.training_data, axis = 1) # this means finding the amount of columns
        distance = np.empty(shape = (n_training_data, n_samples)) # We store the distances of sample-point x_i to the training
        # points in the i-th column!
        
        # Now we find the distance to each point and store it in a matrix
        # If x_i is a point in the sample and t_j is a point in the training set, then distance[i,j] is the distance beetween them
        for i in range(0,n_training_data): # Looping over all rows
            for j in range(0,n_samples): # Looping over all columns
                distance[i,j] = dist(X[:,j], self.training_data[:,i]) # We want to append the distance to each point 
        
        # Now we sort the indices and put it in the indexList
        indexList = np.argsort(distance, axis=0) # axis=0 means we want to sort down along the columns
        
        # In the zuordnungsmatrix we store the counts of the k nearest neighbors. For example:
        # zuordnungsmatrix[0,2] = 7 means that 7 of the nearest k neighbors of point x_2 are of label 0.
        zuordnungsmatrix = np.empty(shape = (2,n_samples))
        for i in range(0,n_samples):
            for j in range(0,self.k):
                if y[indexList[j,i]] == 0:
                    zuordnungsmatrix[0,i] += 1
                else:
                    zuordnungsmatrix[1,i] += 1
                    
        # We finally want to make the predictions. To to that we just compare the two values in each column
        prediction = np.zeros(n_samples)
        for i in range(0,n_samples):
            if zuordnungsmatrix[0,i] > zuordnungsmatrix[1,i]:
                prediction[i] = 0
            elif zuordnungsmatrix[0,i] < zuordnungsmatrix[1,i]:
                prediction[i] = 1
            else:
                prediction[i] = 2
        return prediction

In [45]:
hdf = pd.read_hdf('NeutrinoMC.hdf5', key = 'Signal')
hits = hdf.NumberOfHits
x = hdf.x
y = hdf.y

# Dont know how to properly loop over the three arrays, they arent changed in the code below...?
#temp_array = [hits, x, y]
#for count in temp_array: 
    #count = np.asarray(count)
    #count = count[~np.isnan(count)] # ~ means not

hits = np.asarray(hits)
hits = hits[~np.isnan(hits)] # ~ means not 
x = np.asarray(x)
x = x[~np.isnan(x)]
y = np.asarray(y)
y = y[~np.isnan(y)]

matrix = np.matrix([hits, x, y])
print(matrix[:,0:3]) # Look at the first three columns to see if it worked properly

[[ 26.          38.          87.        ]
 [  6.69494405   7.74364571   7.41714979]
 [  2.20707621   3.57920725   2.96454426]]


In [5]:
#training = np.array([[1,1,2,2,2,5,4,5,5,5], [1,2,1,2,3,2,4,4,5,6]])
#y = np.array([0,0,0,0,0,1,1,1,1,1])
#data = np.array([[2,3,3,3,5], [4,2,1,4,1]])
#knn = KNN(2)
#knn.fit(training,y)
#print(knn.predict(data))

In [27]:
trainingDataSignal = matrix[:,0:5000]
testDataSignal = matrix[:,5000:15000]

In [52]:
hdf = pd.read_hdf('NeutrinoMC.hdf5', key = 'Background')
hits = hdf.NumberOfHits
x = hdf.x
y = hdf.y

hits = np.asarray(hits)
hits = hits[~np.isnan(hits)]
x = np.asarray(x)
x = x[~np.isnan(x)]
y = np.asarray(y)
y = y[~np.isnan(y)]

matrix = np.matrix([hits, x, y])
testDataBackground = matrix[:,0:20000]
trainingDataBackground = matrix[:,20000:25000]

trainingData = np.concatenate((trainingDataBackground, trainingDataSignal), axis=1)
print(trainingData.shape)
y1 = np.zeros(5000)
y2 = np.ones(5000)
labels = np.concatenate((y1,y2)) # Lets say 0 is background and 1 is signal

(3, 10000)


In [53]:
knn = KNN(10)
knn.fit(trainingData, labels)
%timeit knn.predict(trainingDataSignal[:,0:1])

1 loop, best of 3: 228 ms per loop
