# Aufgabe 1 - k-NN Klassifikation

## Teilaufgabe a)

## Teilaufgabe b)

## Teilaufgabe c)

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist # Vektorisierte Funktion,
#um schnell den Abstand zwischen einer Menge an Punkten zu bestimmen und in einer Matrix zu speichern
from sklearn.model_selection import train_test_split # Um randomisierte Trainings- und Testsets zu erstellen
import random

In [2]:
class KNN:
    '''k-nearest-neighbours classifier for two classes.

    Label 0 is treated as background; every other label value counts as
    signal and is reported as 1.

    Attributes
    ----------
    k : int
        Number of neighbors to consider.
    training_data : numpy.ndarray, shape=(n_samples, n_attributes)
        Training points, set by ``fit``.
    training_labels : numpy.ndarray, shape=(n_samples,)
        Labels of the training points, set by ``fit``.
    '''

    # Number of query rows handled per cdist call. This bounds the size of the
    # temporary (rows x n_training_samples) distance matrix.
    _CHUNK_SIZE = 512

    def __init__(self, k):
        '''Initialization.
        Parameters are stored as member variables/attributes.

        Parameters
        ----------
        k : int
            Number of neighbors to consider.
        '''
        self.k = k

    def fit(self, X, y):
        '''Fit routine.
        Training data is stored within object.

        Parameters
        ----------
        X : numpy.array, shape=(n_samples, n_attributes)
            Training data.
        y : numpy.array shape=(n_samples)
            Training labels.

        Raises
        ------
        ValueError
            If X is not two-dimensional or the number of labels differs from
            the number of training samples.
        '''
        # np.asarray also turns np.matrix input into a plain 2d ndarray.
        training_data = np.asarray(X, dtype=float)
        training_labels = np.asarray(y).ravel()
        if training_data.ndim != 2:
            raise ValueError('X must have shape (n_samples, n_attributes)')
        if training_labels.shape[0] != training_data.shape[0]:
            raise ValueError('X and y must contain the same number of samples')
        self.training_data = training_data
        self.training_labels = training_labels

    def predict(self, X):
        '''Prediction routine.
        Predict class association of each sample of X.

        Parameters
        ----------
        X : numpy.array, shape=(n_samples, n_attributes)
            Data to classify. A one-dimensional array is treated as a
            single sample.

        Returns
        -------
        prediction : numpy.array, shape=(n_samples)
            Predictions, containing the predicted label of each sample:
            0.0 (background), 1.0 (signal) or NaN if the vote among the
            k nearest neighbors is tied.

        Raises
        ------
        ValueError
            If k is smaller than 1 or larger than the number of training
            samples.
        '''
        queries = np.asarray(X, dtype=float)
        if queries.size == 0:
            return np.empty(0)
        if queries.ndim == 1:
            queries = queries.reshape(1, -1)

        training_data = np.asarray(self.training_data, dtype=float)
        is_background = np.asarray(self.training_labels).ravel() == 0
        n_training = training_data.shape[0]
        if not 1 <= self.k <= n_training:
            raise ValueError(
                'k must be between 1 and the number of training samples '
                '(%d), got %r' % (n_training, self.k))

        # Samples that get no majority below keep this NaN.
        prediction = np.full(queries.shape[0], np.nan)

        for start in range(0, queries.shape[0], self._CHUNK_SIZE):
            stop = start + self._CHUNK_SIZE
            # Pairwise distances: one row per query, one column per training point.
            distances = cdist(queries[start:stop], training_data)
            # Stable sort, so equal distances are resolved by training order.
            nearest = np.argsort(distances, axis=1, kind='mergesort')[:, :self.k]
            back = is_background[nearest].sum(axis=1)
            sig = self.k - back
            chunk = prediction[start:stop]  # view: writes land in `prediction`
            chunk[back > sig] = 0
            chunk[back < sig] = 1
        return prediction

## Teilaufgabe d)

In [11]:
def getDataFromFile(filename, key):
    """Load NumberOfHits, x and y from a NeutrinoMC HDF5 file.

    Events with a NaN in any of the three columns are dropped as a whole.
    Filtering each column on its own would shift the columns against each
    other as soon as the NaNs do not sit in the same rows.

    Parameters
    ----------
    filename : str
        Path of the HDF5 file, passed on to ``pandas.read_hdf``.
    key : str
        Key of the table inside the file, passed on to ``pandas.read_hdf``.

    Returns
    -------
    numpy.matrix, shape=(n_events, 3)
        One event per row with the columns NumberOfHits, x, y.
    """
    columns = ['NumberOfHits', 'x', 'y']
    events = pd.read_hdf(filename, key)[columns].dropna()
    # The attributes of point x_i sit in row i. np.matrix is kept as the
    # return type so existing callers see the same type as before.
    return np.asmatrix(np.asarray(events, dtype=float))


def getKNNData(matrixSignal, matrixBackground, lenTraining, lenSignal, lenBackground):
    '''Cut signal and background samples into one training set and two test sets.

    Rows are taken in the order in which they are stored; nothing is shuffled.

    Parameters
    ----------
    matrixSignal, matrixBackground : array with one sample per row
        Signal and background samples.
    lenTraining : int
        Number of leading rows of each input used for training.
    lenSignal, lenBackground : int
        Number of rows following the training rows used as test samples.

    Returns
    -------
    trainingData : array
        The first lenTraining background rows followed by the first
        lenTraining signal rows.
    testDataBackground : array
        Up to lenBackground background rows following the training rows.
    testDataSignal : array
        Up to lenSignal signal rows following the training rows.
    '''
    def _head_and_rest(rows, n_test):
        # Keep the leading lenTraining + n_test rows, then cut after lenTraining.
        # The one-element tuple index is the same indexing form as before.
        kept = rows[:lenTraining + n_test, ]
        return kept[:lenTraining], kept[lenTraining:]

    background_train, background_test = _head_and_rest(matrixBackground, lenBackground)
    signal_train, signal_test = _head_and_rest(matrixSignal, lenSignal)

    # Background block first, signal block second.
    combined_train = np.concatenate((background_train, signal_train), axis=0)
    return combined_train, background_test, signal_test

In [17]:
# Sample sizes: training events per class, then test events for signal and background.
lenTraining, lenSignal, lenBackground = 5000, 10000, 20000

# Signal columns as plain arrays, each stripped of its own NaN entries (~ means not).
# This repeats what getDataFromFile does; it stays here because the following
# cell builds its matrix from hits, x and y.
hdf = pd.read_hdf('NeutrinoMC.hdf5', key = 'Signal')
hits = np.asarray(hdf.NumberOfHits)
hits = hits[~np.isnan(hits)]
x = np.asarray(hdf.x)
x = x[~np.isnan(x)]
y = np.asarray(hdf.y)
y = y[~np.isnan(y)]
matrix = np.matrix([hits, x, y])

# Function-based path: load both classes and split them into the training set
# and the two test sets.
matrixSignal = getDataFromFile(filename='NeutrinoMC.hdf5', key='Signal')
matrixBackground = getDataFromFile(filename='NeutrinoMC.hdf5', key='Background')
trainingData, testDataBackground, testDataSignal = getKNNData(
    matrixSignal, matrixBackground, lenTraining, lenSignal, lenBackground)

In [8]:
# hits, x and y are expected to hold the Signal columns from the preceding cell.
# Stacked like this, the matrix has one row per attribute and one column per event.
matrix = np.matrix([hits, x, y])
# The first lenTraining events are the signal training sample (still attribute-major).
trainingDataSignal = matrix[:, :lenTraining]
# The next lenSignal events are the signal test sample, turned to one event per row.
testDataSignal = matrix[:, lenTraining:lenTraining + lenSignal].T

In [13]:
# Background columns as plain arrays, each stripped of its own NaN entries.
hdf = pd.read_hdf('NeutrinoMC.hdf5', key = 'Background')
hits = np.asarray(hdf.NumberOfHits)
hits = hits[~np.isnan(hits)]
x = np.asarray(hdf.x)
x = x[~np.isnan(x)]
y = np.asarray(hdf.y)
y = y[~np.isnan(y)]

# One row per attribute, one column per event.
matrix = np.matrix([hits, x, y])
trainingDataBackground = matrix[:, :lenTraining]
# Test sample: the lenBackground events after the training events, one event per row.
testDataBackground = matrix[:, lenTraining:lenBackground + lenTraining].T

# Join background and signal training events along the event axis, then turn
# the result to one event per row. Background comes first, matching the labels below.
trainingData = np.concatenate((trainingDataBackground, trainingDataSignal), axis=1).T

print(testDataBackground)

# 0 = Background, 1 = Signal
y1 = np.zeros(lenTraining)
y2 = np.ones(lenTraining)
labels = np.concatenate((y1, y2))

[[  3.73000000e+02   2.36355963e+00   1.74686164e+00]
 [  1.20000000e+02   5.48955053e+00   5.33732257e+00]
 [  6.00000000e+01   2.26092150e-02   3.47029509e+00]
 [  5.71500000e+03   1.18193437e+00   4.25387775e+00]
 [  5.95000000e+02   7.27451435e+00   8.04014604e+00]
 [  2.00000000e+00   3.08803138e+00   3.98238954e+00]
 [  2.84400000e+03   8.03536855e+00   5.76637673e+00]
 [  2.82400000e+03   4.44680026e+00   7.28100854e+00]
 [  4.93000000e+02   2.92092939e+00   2.31106776e+00]
 [  3.75000000e+02   4.66034372e+00   5.42416314e+00]
 [  2.00000000e+00   6.45131256e+00   5.46592891e+00]
 [  1.89200000e+03   1.59506046e+00   1.53625534e+00]
 [  2.10000000e+02   3.59788571e+00   5.07650096e+00]
 [  3.00000000e+00   5.02354461e+00   2.89331447e+00]
 [  7.00000000e+00   5.33035236e+00   4.25688496e+00]
 [  4.70000000e+01   6.55526269e+00   7.81366351e+00]
 [  2.00000000e+00   8.82074488e+00   3.12579227e+00]
 [  1.32200000e+04   4.57974010e+00   7.11331702e+00]
 [  1.90000000e+01   7.27260

In [None]:
# Classify both test samples with the 10 nearest neighbours.
# k = 10 is even, so a 5:5 vote is possible; predict reports NaN for such a sample.
knn = KNN(k=10)
knn.fit(X=trainingData, y=labels)
predictionS = knn.predict(X=testDataSignal)
predictionB = knn.predict(X=testDataBackground)

In [None]:
# Confusion-matrix counts. Predictions are 1 (signal), 0 (background) or NaN (tied vote).
predictionS = np.asarray(predictionS)
predictionB = np.asarray(predictionB)
tp = np.count_nonzero(predictionS == 1)
fn = np.count_nonzero(predictionS == 0)
fp = np.count_nonzero(predictionB == 1)
tn = np.count_nonzero(predictionB == 0)
# NaN predictions match neither 0 nor 1 and so appear in none of the four
# counts above. Report them, so it is visible how many events the metrics omit.
undecidedS = predictionS.size - tp - fn
undecidedB = predictionB.size - fp - tn
print('tp', tp)
print('fn', fn)
print('fp', fp)
print('tn', tn)
print('undecided (signal)', undecidedS)
print('undecided (background)', undecidedB)
# Purity, efficiency and significance. A zero denominator (nothing classified
# as signal, or no decided signal event) gives NaN instead of ZeroDivisionError.
Reinheit = tp/(tp+fp) if tp + fp else float('nan')
Effizienz = tp/(tp+fn) if tp + fn else float('nan')
Signifikanz = tp/(np.sqrt(tp+fp)) if tp + fp else float('nan')
print('Reinheit =', Reinheit)
print('Effizienz =', Effizienz)
print('Signifikanz =', Signifikanz)

## Teilaufgabe e)

In [None]:
# NOTE(review): hdf is presumably still the Background frame loaded further up - confirm.
hits = np.asarray(hdf.NumberOfHits)
hits = hits[~np.isnan(hits)]  # drop NaN entries
print(hits[:20])
# Continue with the base-10 logarithm of the hit counts.
hits = np.log10(hits)