# Aufgabe 1 - k-NN Klassifikation

## Teilaufgabe a)

## Teilaufgabe b)

## Teilaufgabe c)

In [47]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist # Vektorisierte Funktion,
#um schnell den Abstand zwischen einer Menge an Punkten zu bestimmen und in einer Matrix zu speichern
from sklearn.model_selection import train_test_split # Um randomisierte Trainings- und Testsets zu erstellen
import random

In [42]:
class KNN:
    '''KNN Classifier.

    Binary k-nearest-neighbour classifier based on the distances returned by
    scipy's cdist (Euclidean by default). Training label 0 is counted as
    background; every other label is counted as signal.

    Attributes
    ----------
    k : int
        Number of neighbors to consider.
    training_data : array-like, shape=(n_samples, n_attributes)
        Training points, available after fit().
    training_labels : array-like, shape=(n_samples)
        Training labels, available after fit().
    '''

    # Number of samples classified per cdist call. The distance matrix of one
    # batch has _BATCH_SIZE * n_training entries, which keeps memory bounded
    # for large test sets.
    _BATCH_SIZE = 1024

    def __init__(self, k):
        '''Initialization.
        Parameters are stored as member variables/attributes.

        Parameters
        ----------
        k : int
            Number of neighbors to consider.
        '''
        self.k = k

    def fit(self, X, y):
        '''Fit routine.
        Training data is stored within object.

        Parameters
        ----------
        X : numpy.array, shape=(n_samples, n_attributes)
            Training data.
        y : numpy.array shape=(n_samples)
            Training labels.
        '''
        self.training_data = X
        self.training_labels = y

    def predict(self, X):
        '''Prediction routine.
        Predict class association of each sample of X.

        Parameters
        ----------
        X : numpy.array, shape=(n_samples, n_attributes)
            Data to classify.

        Returns
        -------
        prediction : numpy.array, shape=(n_samples)
            Predictions, containing the predicted label of each sample:
            0.0 if background wins the vote, 1.0 if signal wins it and NaN
            if the vote among the k neighbors is tied.

        Raises
        ------
        ValueError
            If k is smaller than 1 or larger than the number of training
            samples, or if the number of labels does not match the number
            of training samples.
        '''
        # np.asarray also turns np.matrix inputs into plain 2d arrays, so
        # row slices below keep their two dimensions for cdist.
        samples = np.asarray(X, dtype=float)
        training = np.asarray(self.training_data, dtype=float)
        is_background = np.asarray(self.training_labels).ravel() == 0

        n_training = training.shape[0]
        if is_background.shape[0] != n_training:
            raise ValueError(
                'got {} labels for {} training samples'.format(
                    is_background.shape[0], n_training))
        if not 1 <= self.k <= n_training:
            raise ValueError(
                'k={} must be between 1 and the number of training '
                'samples ({})'.format(self.k, n_training))

        n_samples = samples.shape[0]
        prediction = np.empty(n_samples, dtype=float)

        for start in range(0, n_samples, self._BATCH_SIZE):
            stop = start + self._BATCH_SIZE
            distances = cdist(samples[start:stop], training)
            # Column indices of the k closest training points for each row.
            nearest = np.argsort(distances, axis=1, kind='stable')[:, :self.k]
            background_votes = is_background[nearest].sum(axis=1)
            signal_votes = self.k - background_votes
            prediction[start:stop] = np.where(
                background_votes > signal_votes, 0.0,
                np.where(signal_votes > background_votes, 1.0, np.nan))

        return prediction

## Teilaufgabe d)

In [52]:
def getDataFromFile(filename, key):
    """Load NumberOfHits, x and y of a NeutrinoMC data set as a numpy matrix.

    Events with a NaN in any of the three attributes are dropped as a whole
    row, so the remaining values of one event always stay in the same row.

    Parameters
    ----------
    filename : str
        Path of the HDF5 file, passed on to pandas.read_hdf.
    key : str
        Key of the data set inside the file.

    Returns
    -------
    numpy.matrix, shape=(n_events, 3)
        One event per row with the columns NumberOfHits, x and y.
    """
    frame = pd.read_hdf(filename, key)
    # Filter row-wise: filtering each column on its own would shift the
    # columns against each other as soon as the NaNs are not in the same rows.
    complete = frame[['NumberOfHits', 'x', 'y']].dropna()
    # np.matrix is kept as return type for callers that rely on it.
    return np.matrix(np.asarray(complete, dtype=float))


def getKNNData(matrixSignal, matrixBackground, lenTraining, lenSignal, lenBackground):
    '''Draw random, disjoint training and test sets from signal and background.

    Both inputs hold one point per row. train_test_split shuffles the rows
    before splitting, so no separate shuffling step is needed.

    Parameters
    ----------
    matrixSignal : array-like, shape=(n_signal, n_attributes)
        Signal points.
    matrixBackground : array-like, shape=(n_background, n_attributes)
        Background points.
    lenTraining : int
        Number of training points taken from each of the two classes.
    lenSignal : int
        Number of signal points in the test set.
    lenBackground : int
        Number of background points in the test set.

    Returns
    -------
    trainingData : numpy.array, shape=(2*lenTraining, n_attributes)
        trainingData[0:lenTraining] is the background training data,
        trainingData[lenTraining:2*lenTraining] the signal training data.
    testDataBackground : numpy.array, shape=(lenBackground, n_attributes)
        Background points to test the model.
    testDataSignal : numpy.array, shape=(lenSignal, n_attributes)
        Signal points to test the model.

    Raises
    ------
    ValueError
        Raised by train_test_split if an input holds fewer rows than the
        requested training plus test size.
    '''
    # Work on plain arrays so that np.matrix inputs are split row by row and
    # the results are ordinary numpy arrays.
    signal = np.asarray(matrixSignal)
    background = np.asarray(matrixBackground)

    trainingDataBackground, testDataBackground = train_test_split(
        background, train_size=lenTraining, test_size=lenBackground)
    trainingDataSignal, testDataSignal = train_test_split(
        signal, train_size=lenTraining, test_size=lenSignal)

    # Points are rows, so the two training sets are stacked along axis 0.
    trainingData = np.concatenate(
        (trainingDataBackground, trainingDataSignal), axis=0)

    return trainingData, testDataBackground, testDataSignal

In [56]:
# Number of training points per class and number of test points per class.
lenTraining = 5000
lenSignal = 10000
lenBackground = 20000

hdf = pd.read_hdf('NeutrinoMC.hdf5', key = 'Signal')

# Drop events with a NaN in any attribute as whole rows. Filtering each column
# on its own would misalign hits, x and y whenever the NaNs are not in the
# same rows.
hits, x, y = np.asarray(hdf[['NumberOfHits', 'x', 'y']].dropna(), dtype=float).T

# One attribute per row, one event per column.
matrix = np.matrix([hits, x, y])
print(matrix) # Looking at the matrix to see if it worked

[[ 26.          38.          87.         ...,  52.          25.          12.        ]
 [  6.69494405   7.74364571   7.41714979 ...,   7.71972272   7.64833004
    7.779087  ]
 [  2.20707621   3.57920725   2.96454426 ...,   3.19118159   3.48405199
    3.8578863 ]]


In [57]:
# matrix holds one signal event per column: the first lenTraining columns are
# used for training, the following lenSignal columns for testing.
trainingDataSignal = matrix[:, :lenTraining]
# The test set is transposed so that every row is one event.
testDataSignal = matrix[:, lenTraining:lenTraining + lenSignal].T

In [58]:
hdf = pd.read_hdf('NeutrinoMC.hdf5', key = 'Background')

# Drop events with a NaN in any attribute as whole rows. Filtering each column
# on its own would misalign hits, x and y whenever the NaNs are not in the
# same rows.
hits, x, y = np.asarray(hdf[['NumberOfHits', 'x', 'y']].dropna(), dtype=float).T

# One attribute per row, one event per column.
matrix = np.matrix([hits, x, y])
trainingDataBackground = matrix[:, :lenTraining]
# The test set is transposed so that every row is one event.
testDataBackground = matrix[:, lenTraining:lenBackground + lenTraining].T

# Background columns first, signal columns second; transposed afterwards so
# that every row of trainingData is one event.
trainingData = np.concatenate((trainingDataBackground, trainingDataSignal), axis=1)
trainingData = np.transpose(trainingData)

# Labels in the same order as the rows of trainingData.
y1 = np.zeros(lenTraining)
y2 = np.ones(lenTraining)
labels = np.concatenate((y1,y2)) # 0 = Background, 1 = Signal

In [59]:
# Train a classifier with k = 10 on the combined training set and classify
# the signal and the background test set with it.
knn = KNN(k=10)
knn.fit(X=trainingData, y=labels)
predictionS, predictionB = (
    knn.predict(testSet) for testSet in (testDataSignal, testDataBackground)
)

In [60]:
# Confusion matrix entries: predictions on the signal test set give tp / fn,
# predictions on the background test set give fp / tn.
# NOTE(review): predictions that are neither 0 nor 1 (NaN for a tied vote)
# are counted in none of the four entries - confirm that this is intended.
predictionS, predictionB = np.asarray(predictionS), np.asarray(predictionB)
tp, fn = (np.count_nonzero(predictionS == label) for label in (1, 0))
fp, tn = (np.count_nonzero(predictionB == label) for label in (1, 0))
print('tp', tp)
print('fn', fn)
print('fp', fp)
print('tn', tn)

# Purity, efficiency and significance of the selection.
Reinheit = tp / (fp + tp)
Effizienz = tp / (fn + tp)
Signifikanz = tp / np.sqrt(fp + tp)
print('Reinheit =', Reinheit)
print('Effizienz =', Effizienz)
print('Signifikanz =', Signifikanz)

tp 9679
fn 242
fp 1824
tn 17829
Reinheit = 0.8414326697383291
Effizienz = 0.9756072976514464
Signifikanz = 90.2453700219


## Teilaufgabe e)

In [61]:
# Hit counts of the data set currently held in hdf, without NaN entries.
hits = np.asarray(hdf.NumberOfHits)
hits = hits[np.logical_not(np.isnan(hits))]
print(hits[:20])
# Continue with the decadic logarithm of the hit counts.
hits = np.log10(hits)

[  5.90000000e+01   6.40000000e+01   1.40000000e+01   2.02000000e+02
   3.00000000e+00   7.30000000e+01   1.83000000e+02   9.90900000e+03
   1.51000000e+02   8.79000000e+02   9.70000000e+01   1.33000000e+02
   3.00000000e+01   1.08000000e+03   1.17000000e+02   1.86000000e+02
   1.80000000e+01   5.00000000e+00   3.50000000e+01   1.85700000e+03]
