In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt                         # plotting
import seaborn as sn  
from sklearn.preprocessing import StandardScaler
from sklearn import cluster
from sklearn import neighbors               # includes kNN!
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from random import randrange
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier


# names=['Th','Ph','E','NrHits','NrBumps','E1','E1E9','E9E25','Z20','Z53','LatMom'], delimiter=','
emc_gam  = pd.read_csv("https://drive.google.com/uc?export=download&id=1TvbbCV-kJeNMuIORMyiMFgH_u_CQSZhX")
emc_neutron = pd.read_csv("https://drive.google.com/uc?export=download&id=1bY6ZPF3WLxfviYSSViuw1SEx1_sWxP46")

# This is needed for training the model. It has to know if it's right.
emc_gam["type"] = 1
emc_neutron["type"] = 0

# Shove them into one big dataset for faster plotting.
dataset = pd.concat([emc_gam, emc_neutron],ignore_index=True)


print(dataset)

             Th          Ph         E  NrHits  NrBumps        E1      E1E9  \
0       50.8457   -0.008022  1.673710      19        1  1.055470  0.667862   
1       22.9273 -119.515000  4.698040      19        1  2.908490  0.641230   
2      130.3010 -126.725000  2.773130      17        1  1.826860  0.680077   
3      160.2830   59.330500  3.099390      20        1  2.171110  0.735619   
4       89.4000  -15.521300  3.194310      21        1  2.400890  0.774224   
...         ...         ...       ...     ...      ...       ...       ...   
86867  125.2240  -51.632400  0.646651      11        1  0.406012  0.661388   
86868   48.2575   23.305300  1.096160      19        1  0.514298  0.558028   
86869   92.8038  -43.532700  0.995061      15        1  0.423894  0.457717   
86870  119.4840  128.111000  0.229870      14        1  0.076507  0.492713   
86871  109.1130 -171.772000  0.437142       9        1  0.187270  0.483033   

          E9E25       Z20       Z53    LatMom  type  
0      0.

In [2]:
#Normalizing our data set 

dataset['E'] = ( dataset.E - dataset.E.mean() ) / dataset.E.std()
dataset['E1']        = ( dataset.E1        - dataset.E1.mean() )        / dataset.E1.std()
dataset['NrHits']     = ( dataset.NrHits     - dataset.NrHits.mean() )     / dataset.NrHits.std()
dataset['E1E9']       = ( dataset.E1E9       - dataset.E1E9.mean() )       / dataset.E1E9.std()

dataset['Th']       = ( dataset.Th       - dataset.Th.mean() )       / dataset.Th.std()
dataset['Ph']       = ( dataset.Ph       - dataset.Ph.mean() )       / dataset.Ph.std()
dataset['NrBumps']       = ( dataset.NrBumps       - dataset.NrBumps.mean() )       / dataset.NrBumps.std()
dataset['E9E25']       = ( dataset.E9E25       - dataset.E9E25.mean() )       / dataset.E9E25.std()
dataset['Z20']       = ( dataset.Z20       - dataset.Z20.mean() )       / dataset.Z20.std()
dataset['Z53']       = ( dataset.Z53       - dataset.Z53.mean() )       / dataset.Z53.std()
dataset['LatMom']       = ( dataset.LatMom       - dataset.LatMom.mean() )       / dataset.LatMom.std()


In [3]:
def FOM(p, r):
    """"This fucntion will return the value of the figure of merit (FOM)
    p = model prediction
    r = what it actually is (reality)
    """
    signal = np.sum((p + r) ==2 ) # If both P and R are 1 then it's added up. AKA if it's predicted correctly.
    background = np.sum(p) - signal # 

    return signal /(signal + background)**1/2



def fit(dataset, parameter, test_size, classifier):

    ''' 
    This function will return arrays for the test and train data set for our data, given the most powerful parameters. 
    Moreover, it will also return the FOMs from the train and test data predictions. 

    Arguments:
        dataset     (pandas.DataFrame)      https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.htm. This should be the dataframe containing the data
        parameter   (array-like)            The columns in the given data frame that you wanna use.
        test_size   (float)                 The portion (0-1) of the test data.
        classifier  (sklearn classifier)    Whatever classifier you wanna use for the fit.
    Return:
        result_te   (np.array)              The result of the test data in a numpy array.
        result_tr   (np.array)              The result of the train data in a numpy array.
    '''

    X_train, X_test, Y_train, Y_test = train_test_split(dataset[parameter], dataset[['type']],test_size=test_size)

    classifier.fit(np.array(X_train), np.array(Y_train))

    Y_pred_train = classifier.predict(np.array(X_train))
    X_train['p']= Y_pred_train
    result_tr = X_train

    Y_pred_test = classifier.predict(np.array(X_test))
    X_test['p']= Y_pred_test
    result_te = X_test
    

    return np.array(result_te), np.array(result_tr), FOM(np.array(Y_pred_test), np.array(Y_test)), FOM(np.array(Y_pred_train), np.array(Y_train))

def ML(dataset,feats, vals, classifier, **kwargs):
    ''' 
    This function will return the fom, for different calssifiers, for test and train. 

    Arguments:
        dataset     (pandas.DataFrame)     https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.htm. This should be the dataframe containing the data
        feats       (array-like)           parameters that are used for the machine learning classifier
        vals        (array-like)           Values for the first parameter of the classifier. Usually stuff like number of neighbors or number of trees or whatever.
        classifier  (sklearn classifier)   Whatever classifier you wanna use for the fit.                       
        **kwargs    (any)                  Any keyword arguments that you want to pass to your classifier 
    Returns:
        fom_te   (np.array)              The array stores all the FOMs, from where the max arg will be taken.
        fom_tr   (np.array)              The array stores all the FOMs, from where the max arg will be taken.
    ''' 

    result_te = np.empty((len(vals),len(dataset)//2,len(feats)+1))
    result_tr = np.empty((len(vals),len(dataset)//2,len(feats)+1))
    fom_te = np.empty(len(vals))
    fom_tr = np.empty(len(vals))

    classifiers = [classifier(i,**kwargs) for i in vals]


    for i,k in enumerate(classifiers):
        try:
            result_te[i], result_tr[i], fom_te[i], fom_tr[i] = fit(dataset,feats,0.5,k)
            print(vals[k])
        except Exception:
            pass


    max=np.argmax(fom_te)
    print("test: ",vals[max], fom_te[max])

    max=np.argmax(fom_tr)
    print("train: ",vals[max], fom_tr[max])
    
    return fom_te, fom_tr

In [4]:
feats=['E','NrHits','E9E25','Z20','LatMom']
k_vals = np.arange(0,20,2)

def VaryingFeatures(k_vals, feats):
    for features in range(1):
        feats=['E','NrHits']

        #kN Optimizing
        kNfoms=ML(dataset,feats,k_vals,KNeighborsClassifier, weights='distance')

        #Plotting kN
        plt.figure(figsize = (10,10))
        plt.title(f"{features}")
        plt.plot(k_vals, kNfoms[0], label = "test")
        plt.plot(k_vals, kNfoms[1], label = "train")
        plt.xlabel("n_neighbours" )
        plt.ylabel("FOM")
        plt.legend()
        fig=plt.gcf()
        plt.show()
        fig.savefig(f"kNfoms_2-20")

VaryingFeatures(k_vals, feats)

  return self._fit(X, y)
  return self._fit(X, y)
