In [None]:
import numpy as np
from utilities import prepare_set
import matplotlib.pyplot as plt

from time import time

In [None]:
# prepare_set is a project helper returning six objects; judging by the names
# they are train/test features, labels and sample weights -- TODO confirm.
(
    X_train,
    y_train,
    X_test,
    y_test,
    train_weights,
    test_weights,
) = prepare_set('../data/D0_set_weighted.npy')

In [None]:
# Show the dimensions of the training feature array.
X_train.shape

In [None]:
# Show the dimensions of the test feature array.
X_test.shape

> **S** – ratio of the number of events classified as signal (1) to the number of true signal events in `y_test`  
> **B** – ratio of the number of events classified as background (0) to the number of true background events in `y_test`

## ML model

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter values explored for the decision tree.
max_depth = [None, 10, 50, 100]
class_weight = [None, 'balanced']

# Despite the "_pipe" suffix this is a bare estimator, not a Pipeline, so the
# grid keys below need no "<step>__" prefix.
decision_tree_pipe = DecisionTreeClassifier()

tree_param_grid = [dict(max_depth=max_depth, class_weight=class_weight)]

# Exhaustive search with 10-fold cross-validation, scored by accuracy, on all cores.
tree_gs = GridSearchCV(
    decision_tree_pipe,
    tree_param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)


In [None]:
# NOTE: high_score is defined in a later cell of this notebook; that cell must
# have been executed first, otherwise this cell fails with a NameError.
start_time = time()

# Only the first 100000 rows are used; column 0 is excluded from the features.
# fit() returns the search object itself, so no re-assignment is needed.
tree_gs.fit(X_train[:100000, 1:], y_train[:100000])
elapsed = time() - start_time

print(f'Decision tree training time {elapsed}.')
print(f'Wydajnosc {high_score(tree_gs)}')
print(f'Decision tree best params {tree_gs.best_params_}')

### SVC

In [None]:
from sklearn.svm import SVC

# Logarithmic grid shared by the SVC regularisation strength (C) and gamma.
value_param = [0.01, 0.1, 1.0, 10, 100, 1000]

# Standardise, project onto 3 principal components, then classify with an SVC.
svc_pipe = make_pipeline(StandardScaler(), PCA(n_components=3), SVC())

# The estimator is a Pipeline, so every parameter has to be addressed through
# its step name ("svc__<param>"); un-prefixed keys such as 'C' make
# GridSearchCV fail with "Invalid parameter".
# NOTE: class_weight is the list defined in the Decision Tree cell above.
svc_param_grid = [{'svc__C': value_param,
                   'svc__gamma': value_param,
                   'svc__class_weight': class_weight}]

# Exhaustive search with 10-fold cross-validation, scored by accuracy, on all cores.
svc_gs = GridSearchCV(estimator=svc_pipe,
                      param_grid=svc_param_grid, scoring='accuracy',
                      cv=10, n_jobs=-1)

In [None]:
# NOTE: high_score is defined in a later cell of this notebook; that cell must
# have been executed first, otherwise this cell fails with a NameError.
start_time = time()

# Only the first 100000 rows are used; column 0 is excluded from the features.
svc_gs = svc_gs.fit(X_train[:100000, 1:], y_train[:100000])

print(f'Support Vector Machine training time {time() - start_time}.')
print(f'Wydajnosc {high_score(svc_gs)}')
# The label previously read "Decision tree", copied from the cell above.
print(f'Support Vector Machine best params {svc_gs.best_params_}')

### Imports and evaluation helpers

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

# Sample-size configuration. To run on the complete data set use instead:
#   no_train = X_train.shape[0]
#   no_test = X_test.shape[0]
no_train = 200   # number of training rows used for fitting
no_test = 1000   # number of test rows used for evaluation

# Training subset; column 0 is left out of the feature matrix.
X, y = X_train[:no_train, 1:], y_train[:no_train]

def eff_signal(clf):
    '''Return the signal efficiency scaled by the evaluation sample size.

    The classifier is evaluated on the first ``no_test`` rows of the global
    ``X_test`` (column 0 excluded) against the matching entries of ``y_test``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.

    Returns
    -------
    float
        (signal rows predicted as 1) / (signal rows) * ``no_test``;
        ``0.0`` when the evaluated slice contains no signal rows (label 1).
    '''
    y_true = np.ravel(y_test[:no_test])
    y_pred = np.ravel(clf.predict(X_test[:no_test, 1:]))

    is_signal = y_true == 1
    n_signal = np.count_nonzero(is_signal)
    if n_signal == 0:
        # No signal in the evaluated slice: avoid dividing by zero.
        return 0.0

    n_correct = np.count_nonzero(y_pred[is_signal] == 1)
    return n_correct / n_signal * no_test

def eff_background(clf, background_scale=500):
    '''Return the scaled amount of background that is NOT classified as background.

    The classifier is evaluated on the first ``no_test`` rows of the global
    ``X_test`` (column 0 excluded) against the matching entries of ``y_test``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    background_scale : number, default 500
        Multiplier applied to the misclassified background fraction
        (presumably the assumed background-to-signal ratio -- TODO confirm).

    Returns
    -------
    float
        (1 - correctly classified background fraction) * ``no_test`` *
        ``background_scale``; ``0.0`` when the evaluated slice contains no
        background rows (label 0).
    '''
    y_true = np.ravel(y_test[:no_test])
    y_pred = np.ravel(clf.predict(X_test[:no_test, 1:]))

    is_background = y_true == 0
    n_background = np.count_nonzero(is_background)
    if n_background == 0:
        # No background in the evaluated slice: nothing can be misclassified.
        return 0.0

    n_correct = np.count_nonzero(y_pred[is_background] == 0)
    return (1 - n_correct / n_background) * no_test * background_scale
    
def high_score(clf):
    '''Return the significance-like figure of merit S / sqrt(S + B).

    S comes from ``eff_signal(clf)`` and B from ``eff_background(clf)``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.

    Returns
    -------
    float
        S / sqrt(S + B), or ``0.0`` when S + B is zero (instead of the NaN
        and RuntimeWarning a 0/0 division would produce).
    '''
    S = eff_signal(clf)
    B = eff_background(clf)
    total = S + B
    if total <= 0:
        return 0.0
    return S / np.sqrt(total)

def custom_scorer(clf, X, y, background_scale=500):
    '''Score an estimator with S / sqrt(S + B) on the data it is given.

    Has the ``(estimator, X, y)`` signature that scikit-learn accepts directly
    as a ``scoring`` callable, so it is evaluated on each cross-validation
    fold rather than on the global test set.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : array-like of features passed to ``clf.predict``.
    y : array-like of true labels (1 = signal, 0 = background).
    background_scale : number, default 500
        Multiplier applied to the misclassified background fraction
        (presumably the assumed background-to-signal ratio -- TODO confirm).

    Returns
    -------
    float
        S / sqrt(S + B) with
        S = (signal predicted as 1) / (signal) * n_samples and
        B = (background not predicted as 0) / (background) * n_samples * background_scale;
        ``0.0`` when S + B is zero.
    '''
    y_true = np.ravel(y)
    # Predict on the supplied X (the fold under evaluation), not the global X_test.
    y_pred = np.ravel(clf.predict(X))
    n_samples = y_true.size

    is_signal = y_true == 1
    is_background = y_true == 0
    n_signal = np.count_nonzero(is_signal)
    n_background = np.count_nonzero(is_background)

    signal_eff = (
        np.count_nonzero(y_pred[is_signal] == 1) / n_signal if n_signal else 0.0
    )
    background_leak = (
        np.count_nonzero(y_pred[is_background] != 0) / n_background
        if n_background else 0.0
    )

    S = signal_eff * n_samples
    B = background_leak * n_samples * background_scale
    total = S + B
    if total <= 0:
        return 0.0
    return float(S / np.sqrt(total))


# A callable with the (estimator, X, y) signature is passed to `scoring=` as is;
# make_scorer is only for metric functions of the form f(y_true, y_pred).
scorer = custom_scorer

In [None]:
# Show the dimensions of the training subset (no_train rows, column 0 excluded).
X.shape

---
### Logistic Regression

In [None]:
# Baseline: logistic regression with default hyper-parameters, fitted on the
# training subset. fit() returns the estimator itself.
log_reg = LogisticRegression().fit(X, y)

In [None]:
# Report S, B and the figure of merit S / sqrt(S + B) for logistic regression.
print(
    f'S= {eff_signal(log_reg):.3f}, '
    f'B= {eff_background(log_reg):.3f}, '
    f'S/(S+B)^1/2= {high_score(log_reg):.3f}'
)

In [None]:
# Score of the logistic regression on the first no_test test rows (column 0 excluded).
log_reg.score(X_test[:no_test, 1:], y_test[:no_test])

---
### SVC

In [None]:
# Baseline: SVC with default hyper-parameters, fitted on the training subset.
# NOTE(review): the features are not standardised here, unlike in the SVC pipelines.
svc = SVC()
svc.fit(X, y)

In [None]:
# Report S, B and the figure of merit for the SVC. high_score computes
# S / sqrt(S + B); the label previously read "S*(S+B)^1/2".
print('S= {:.3f}, B= {:.3f}, S/(S+B)^1/2= {:.3f}'.format(eff_signal(svc),
                                                         eff_background(svc), high_score(svc)))

In [None]:
# Score of the SVC on the first no_test test rows (column 0 excluded).
svc.score(X_test[:no_test, 1:], y_test[:no_test])

---
### Decision Tree

In [None]:
# Baseline: decision tree with default hyper-parameters, fitted on the training subset.
# NOTE(review): no random_state is set, so results may vary between runs.
dec_tree = DecisionTreeClassifier()
dec_tree.fit(X,y)

In [None]:
# Report S, B and the figure of merit for the decision tree. high_score
# computes S / sqrt(S + B); the label previously read "S*(S+B)^1/2".
print('S= {:.3f}, B= {:.3f}, S/(S+B)^1/2= {:.3f}'.format(eff_signal(dec_tree),
                                                         eff_background(dec_tree), high_score(dec_tree)))

In [None]:
# Score of the decision tree on the first no_test test rows (column 0 excluded).
dec_tree.score(X_test[:no_test, 1:], y_test[:no_test])

## New approach: PCA, pipelines and grid search

In [None]:
# Fit an 8-component PCA on the training subset and keep the projected data.
pca = PCA(n_components=8)
X_pca = pca.fit_transform(X)

# Derive the bar positions from the fitted result instead of hard-coding 1..8,
# so the plot stays correct if n_components is changed.
explained = pca.explained_variance_ratio_
component_idx = range(1, len(explained) + 1)

# Bars: variance explained by each component; steps: cumulative sum.
plt.bar(component_idx, explained, label='individual')
plt.step(component_idx, np.cumsum(explained), where='mid', label='cumulative')
plt.xlabel('Principal component')
plt.ylabel('Explained variance ratio')
plt.legend()
plt.show()

In [None]:
# Single-step pipeline; its step name "decisiontreeclassifier" is generated by
# make_pipeline and is the prefix used in the parameter grid.
tree_pipe = make_pipeline(DecisionTreeClassifier())

depth = [8, 15, 30]
param_grid = [dict(decisiontreeclassifier__max_depth=depth)]

# 10-fold cross-validated search over max_depth, scored by accuracy.
# fit() returns the search object itself.
gs = GridSearchCV(
    tree_pipe,
    param_grid,
    scoring='accuracy',
    cv=10,
).fit(X, y)

In [None]:
# Figure of merit S / sqrt(S + B) of the decision-tree grid search, via its predict().
high_score(gs)

In [None]:
# Best parameter combination found by the decision-tree grid search.
gs.best_params_

In [None]:
# Standardise, project onto 3 principal components, then classify with an SVC.
SVC_pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=3),
    SVC(random_state=1),
)

param_range = [10.0, 100.0, 1000.0]

# Two sub-grids: a linear kernel (C only) and an RBF kernel (C and gamma).
param_grid = [
    dict(svc__C=param_range, svc__kernel=['linear']),
    dict(svc__C=param_range, svc__gamma=param_range, svc__kernel=['rbf']),
]

# 10-fold cross-validated search, scored by accuracy, on all cores.
gs_svc = GridSearchCV(
    SVC_pipe,
    param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

In [None]:
# Run the SVC grid search on the training subset; fit() returns the search
# object itself, so the name keeps pointing at the same (now fitted) object.
gs_svc.fit(X, y)

# Print the figure of merit S / sqrt(S + B) of the fitted search.
print(f'{high_score(gs_svc)}')

In [None]:
# Best parameter combination found by the SVC grid search. This cell used to
# show `gs`, the decision-tree search, instead of `gs_svc`.
gs_svc.best_params_