In [2]:
import numpy as np
from utilities import prepare_set

In [3]:
# Unpack the six arrays prepare_set returns for this file: train/test features,
# train/test labels and (judging by the names) train/test sample weights.
# NOTE(review): prepare_set is defined in utilities and not shown here — confirm how it
# splits the data. The two weight arrays are not used by any cell in this section.
X_train, y_train, X_test, y_test, train_weights, test_weights = prepare_set('../data/D0_set_weighted.npy')

In [4]:
# Sanity check: (rows, columns) of the training feature array.
X_train.shape

(104768, 9)

In [5]:
# Sanity check: (rows, columns) of the test feature array.
X_test.shape

(11641, 9)

>S - signal efficiency: the fraction of true signal events (label 1) in y_test that the classifier also labels as signal  
B - background efficiency: the fraction of true background events (label 0) in y_test that the classifier also labels as background

## ML model

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV

no_train = 2500 # number of leading training rows used to fit the models
no_test = 1500 # number of leading test rows used when evaluating the models
# Training subset: the first no_train rows, with feature column 0 dropped.
# NOTE(review): what column 0 holds is not visible in this notebook — confirm why it is excluded.
X=X_train[:no_train, 1:]
y=y_train[:no_train]

def _truth_and_prediction(clf, X=None, y=None):
    '''Return (true labels, predicted labels) as flat arrays for one evaluation set.

    When X or y is omitted, the notebook-level test slice is used instead:
    X_test[:no_test, 1:] for the features and y_test[:no_test] for the labels.
    '''
    if X is None:
        X = X_test[:no_test, 1:]
    if y is None:
        y = y_test[:no_test]
    truth = np.asarray(y).ravel()
    prediction = np.asarray(clf.predict(X)).ravel()
    if truth.shape != prediction.shape:
        raise ValueError('got {} labels but {} predictions'.format(truth.size, prediction.size))
    return truth, prediction


def _efficiency(truth, prediction, label):
    '''Fraction of events whose true class is `label` that were also predicted as `label`.

    Returns NaN when no event of that class is present, because the ratio is undefined.
    '''
    in_class = truth == label
    n_in_class = int(np.count_nonzero(in_class))
    if n_in_class == 0:
        return float('nan')
    n_kept = int(np.count_nonzero(prediction[in_class] == label))
    return n_kept / n_in_class


def eff_signal(clf, X=None, y=None):
    '''How much of signal is classified as signal.

    Parameters
    ----------
    clf : fitted classifier exposing predict(X).
    X, y : optional features and true labels to evaluate on. When omitted, the
        first no_test rows of the notebook's test set are used (feature column 0 dropped).

    Returns
    -------
    float in [0, 1], or NaN if the evaluated labels contain no signal (1) events.
    '''
    truth, prediction = _truth_and_prediction(clf, X, y)
    return _efficiency(truth, prediction, 1)


def eff_background(clf, X=None, y=None):
    '''How much of background is classified as background.

    Parameters
    ----------
    clf : fitted classifier exposing predict(X).
    X, y : optional features and true labels to evaluate on. When omitted, the
        first no_test rows of the notebook's test set are used (feature column 0 dropped).

    Returns
    -------
    float in [0, 1], or NaN if the evaluated labels contain no background (0) events.
    '''
    truth, prediction = _truth_and_prediction(clf, X, y)
    return _efficiency(truth, prediction, 0)


def high_score(clf, X=None, y=None):
    '''Function for scoring argument in model optimization.

    Computes S / sqrt(S + B), where S is the signal efficiency and B the
    background efficiency. The optional (X, y) arguments match the
    scorer(estimator, X, y) call that scikit-learn makes, so the function can be
    passed directly as `scoring=`; with only `clf` it evaluates on the notebook's
    test slice exactly as before.

    Returns 0.0 when both efficiencies are zero (the limit of the expression),
    and NaN when either class is absent from the evaluated labels.
    '''
    # Predict once and reuse the result for both efficiencies.
    truth, prediction = _truth_and_prediction(clf, X, y)
    S = _efficiency(truth, prediction, 1)
    B = _efficiency(truth, prediction, 0)
    if S + B == 0:
        return 0.0
    return S/np.sqrt(S+B)

---
### Logistic Regression

In [7]:
# Build a default-configured logistic regression and fit it on the training subset;
# fit() hands back the estimator itself, so construction and fitting chain into one step.
log_reg = LogisticRegression().fit(X, y)

In [18]:
# The third figure is high_score, i.e. S / sqrt(S + B), so the label spells out a division.
print('S= {:.3f}, B= {:.3f}, S/(S+B)^1/2= {:.3f}'.format(
    eff_signal(log_reg), eff_background(log_reg), high_score(log_reg)))

S= 0.244, B= 0.942, S*(S+B)^1/2= 0.224


In [8]:
# Score on the same test slice the efficiency helpers use (first no_test rows, column 0 dropped).
# For scikit-learn classifiers, score() is the mean accuracy.
log_reg.score(X_test[:no_test, 1:], y_test[:no_test])

0.738

---
### SVC

In [9]:
# Support-vector classifier with default settings, fitted on the training subset.
# The bare fit() call is the cell's last expression, so the fitted estimator is displayed.
svc = SVC()
svc.fit(X, y)

SVC()

In [14]:
# The third figure is high_score, i.e. S / sqrt(S + B), so the label spells out a division.
print('S= {:.3f}, B= {:.3f}, S/(S+B)^1/2= {:.3f}'.format(
    eff_signal(svc), eff_background(svc), high_score(svc)))

S= 0.210, B= 0.964, S*(S+B)^1/2= 0.194


In [101]:
# Score on the same test slice the efficiency helpers use (first no_test rows, column 0 dropped).
# For scikit-learn classifiers, score() is the mean accuracy.
svc.score(X_test[:no_test, 1:], y_test[:no_test])

0.7266666666666667

---
### Decision Tree

In [17]:
# Fix the seed: scikit-learn permutes the candidate features at every split, so an
# unseeded tree can differ between runs and the metrics reported for it would drift.
dec_tree = DecisionTreeClassifier(random_state=0)
dec_tree.fit(X,y)

DecisionTreeClassifier()

In [19]:
# The third figure is high_score, i.e. S / sqrt(S + B), so the label spells out a division.
print('S= {:.3f}, B= {:.3f}, S/(S+B)^1/2= {:.3f}'.format(
    eff_signal(dec_tree), eff_background(dec_tree), high_score(dec_tree)))

S= 0.731, B= 0.887, S*(S+B)^1/2= 0.574


In [103]:
# Score on the same test slice the efficiency helpers use (first no_test rows, column 0 dropped).
# For scikit-learn classifiers, score() is the mean accuracy.
dec_tree.score(X_test[:no_test, 1:], y_test[:no_test])

0.8286666666666667