Skip to content

Commit

Permalink
Added reporting and metrics to the decoupled and easy to execute via …
Browse files Browse the repository at this point in the history
…cli approach.
  • Loading branch information
federicozaiter committed Oct 31, 2019
1 parent b3d8a13 commit a8e17e0
Show file tree
Hide file tree
Showing 10 changed files with 122 additions and 49 deletions.
6 changes: 2 additions & 4 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
__all__ = [
"utils",
"logclass",
]
__all__ = ["utils", "logclass"]

from .preprocess import *
from .feature_engineering import *
from .models import *
from .reporting import *
2 changes: 1 addition & 1 deletion feature_engineering/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def binary_train_gtruth(y):
return np.where(y == -1.0, -1.0, 1.0)


def multi_class_gtruth(x, y):
def multi_features(x, y):
anomalous = (y != -1)
x_multi, y_multi =\
x[anomalous], y[anomalous]
Expand Down
74 changes: 30 additions & 44 deletions logclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,26 @@
import shutil
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from .puLearning.puAdapter import PUAdapter
from sklearn import metrics
from sklearn.metrics import f1_score
from .feature_engineering.vectorizer import (
build_vocabulary,
log_to_vector,
)
from .utils import TestingParameters, save_params, load_params
import pickle
from .preprocess import registry as preprocess_registry
from .preprocess.utils import load_logs
from .feature_engineering import registry as feature_registry
from .feature_engineering.utils import (
save_vocabulary,
load_vocabulary,
binary_train_gtruth,
multi_class_gtruth,
multi_features,
)
from tqdm import tqdm
import time
from .models import binary_registry as binary_classifier_registry
from .models import multi_registry as multi_classifier_registry
from .reporting import bb_registry as black_box_report_registry
from .reporting import wb_registry as white_box_report_registry


def init_flags():
Expand Down Expand Up @@ -172,29 +169,6 @@ def print_params(params):
print("-" * 80)


def get_feature_names(vocabulary, add_length=True):
    """Return the vocabulary keys as an array ordered by their mapped values.

    Args:
        vocabulary: Mapping whose keys are feature names and whose values
            define the ordering (presumably column indices - confirm
            against the vectorizer).
        add_length: When true, the literal name ``'LENGTH'`` is appended
            after the vocabulary entries.

    Returns:
        A ``numpy`` array of feature names.
    """
    by_value = sorted(vocabulary.items(), key=lambda pair: pair[1])
    ordered = [name for name, _ in by_value]
    if add_length:
        ordered = ordered + ['LENGTH']
    return np.array(ordered)


def get_top_k_SVM_features(svm_clf: LinearSVC, vocabulary,
                           target_names, top_features=3):
    """Return the highest-weighted feature names per label of a linear SVM.

    Args:
        svm_clf: Fitted classifier exposing a ``coef_`` array with one row
            of feature weights per label.
        vocabulary: Mapping passed to ``get_feature_names`` to recover the
            feature names.
        target_names: Labels, in the same order as the rows of ``coef_``.
        top_features: How many features to report per label. Values of
            zero or less yield an empty list for every label.

    Returns:
        Dict mapping each processed label to a list of feature names,
        ordered from the largest coefficient downwards.
    """
    # A slice of the form [-0:] would return *every* feature, so clamp the
    # count and take the first k entries of the descending order instead.
    top_features = max(top_features, 0)
    top_k_label = {}
    feature_names = get_feature_names(vocabulary)
    for i, label in enumerate(target_names):
        if len(target_names) < 3 and i == 1:
            break  # coef is unidimensional when there are only two labels
        coef = svm_clf.coef_[i]
        descending = np.argsort(coef)[::-1]
        top_k_label[label] = list(feature_names[descending[:top_features]])
    return top_k_label


def file_handling(params):
if params['train']:
if os.path.exists(params["base_dir"]) and not params["force"]:
Expand Down Expand Up @@ -258,18 +232,20 @@ def inference(params, x_data, y_data):
binary_clf.load()
# Anomaly detection
y_pred_pu = binary_clf.predict(x_test)
pu_f1_score = f1_score(y_test, y_pred_pu)
get_accuracy = black_box_report_registry.get_bb_report('acc')
binary_acc = get_accuracy(y_test, y_pred_pu)
# MultiClass remove healthy logs
x_infer_multi, y_infer_multi = multi_class_gtruth(x_test, y_data)
x_infer_multi, y_infer_multi = multi_features(x_test, y_data)
# Load MultiClass
multi_classifier_getter =\
multi_classifier_registry.get_multi_model('svm')
multi_classifier = multi_classifier_getter(params)
multi_classifier.load()
# Anomaly Classification
pred = multi_classifier.predict(x_infer_multi)
score = metrics.accuracy_score(y_infer_multi, pred)
print(pu_f1_score, score)
get_multi_acc = black_box_report_registry.get_bb_report('multi_acc')
score = get_multi_acc(y_infer_multi, pred)
print(binary_acc, score)


def train(params, x_data, y_data, target_names):
Expand All @@ -292,34 +268,44 @@ def train(params, x_data, y_data, target_names):
binary_clf = binary_clf_getter(params)
binary_clf.fit(x_train, y_train_pu)
y_pred_pu = binary_clf.predict(x_test)
pu_f1_score = f1_score(y_test_pu, y_pred_pu)
get_accuracy = black_box_report_registry.get_bb_report('acc')
binary_acc = get_accuracy(y_test_pu, y_pred_pu)
# Multi-class training features
x_train_multi, y_train_multi =\
multi_class_gtruth(x_train, y_train)
x_test_multi, y_test_multi = multi_class_gtruth(x_test, y_test)
multi_features(x_train, y_train)
x_test_multi, y_test_multi = multi_features(x_test, y_test)
# MultiClass
multi_classifier_getter =\
multi_classifier_registry.get_multi_model('svm')
multi_classifier = multi_classifier_getter(params)
multi_classifier.fit(x_train_multi, y_train_multi)
pred = multi_classifier.predict(x_test_multi)
score = metrics.accuracy_score(y_test_multi, pred)
get_multi_acc = black_box_report_registry.get_bb_report('multi_acc')
score = get_multi_acc(y_test_multi, pred)
better_results = (
pu_f1_score > best_pu_fs
or (pu_f1_score == best_pu_fs and score > best_multi)
binary_acc > best_pu_fs
or (binary_acc == best_pu_fs and score > best_multi)
)
# TODO: FOR LOOP WITH ALL THE REPORTING & METRICS EVALUATION
# Both for black and white box reporting
if better_results:
if pu_f1_score > best_pu_fs:
best_pu_fs = pu_f1_score
if binary_acc > best_pu_fs:
best_pu_fs = binary_acc
save_params(params)
if score > best_multi:
best_multi = score
binary_clf.save()
multi_classifier.save()
print(pu_f1_score, score)
print(binary_acc, score)
if params['top10']:
print(get_top_k_SVM_features(
multi_classifier, vocabulary, target_names))
get_top_k = white_box_report_registry.get_wb_report('top_k_svm')
print(get_top_k(
params,
multi_classifier.model,
vocabulary,
target_names=target_names,
top_features=5
))


def main():
Expand Down
1 change: 1 addition & 0 deletions reporting/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__all__ = ["accuracy", "confusion_matrix", "multi_class_acc", "top_k_svm"]
7 changes: 7 additions & 0 deletions reporting/accuracy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .bb_registry import register
from sklearn.metrics import f1_score


@register('acc')
def model_accuracy(y, pred):
    """Score predictions ``pred`` against the ground truth ``y``.

    NOTE: although this metric is registered under the key ``'acc'``, the
    value returned is the result of ``f1_score``, not plain accuracy.
    """
    score = f1_score(y, pred)
    return score
18 changes: 18 additions & 0 deletions reporting/bb_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Registry for black box reports or metrics."""

# Maps a report name to the function registered under it.
_BB_REPORTS = {}


def register(name):
    """Registers a new black box report or metric function.

    Intended to be used as a decorator: ``@register('acc')``. The decorated
    function is stored under ``name`` and returned unchanged. Registering
    the same name twice replaces the earlier entry.
    """

    def add_to_dict(func):
        _BB_REPORTS[name] = func
        return func

    return add_to_dict


def get_bb_report(model):
    """Fetches the black box report or metric function.

    Args:
        model: Name the report was registered under (the parameter name is
            kept for backward compatibility with keyword callers).

    Returns:
        The function registered under that name.

    Raises:
        KeyError: If nothing is registered under ``model``. The message
            lists the names that are available.
    """
    try:
        return _BB_REPORTS[model]
    except KeyError:
        available = ", ".join(sorted(map(str, _BB_REPORTS))) or "<none>"
        raise KeyError(
            "Unknown black box report {!r}; registered reports: {}".format(
                model, available
            )
        ) from None
7 changes: 7 additions & 0 deletions reporting/confusion_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .bb_registry import register
from sklearn.metrics import confusion_matrix


@register('confusion_matrix')
def report(y, pred):
    """Build the confusion matrix of predictions ``pred`` versus labels ``y``.

    Thin wrapper that forwards both arguments to ``confusion_matrix`` and
    returns its result untouched.
    """
    matrix = confusion_matrix(y, pred)
    return matrix
7 changes: 7 additions & 0 deletions reporting/multi_class_acc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .bb_registry import register
from sklearn.metrics import accuracy_score


@register('multi_acc')
def model_accuracy(y, pred):
    """Compute the accuracy of predictions ``pred`` against labels ``y``.

    Registered as the ``'multi_acc'`` black box report; delegates to
    ``accuracy_score`` and returns its result as is.
    """
    accuracy = accuracy_score(y, pred)
    return accuracy
31 changes: 31 additions & 0 deletions reporting/top_k_svm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from .wb_registry import register
import numpy as np


def get_feature_names(params, vocabulary, add_length=True):
    """Return the vocabulary keys as an array ordered by their mapped values.

    Args:
        params: Configuration dict; ``params['features']`` is checked for
            the ``'length'`` entry.
        vocabulary: Mapping whose keys are feature names and whose values
            define the ordering (presumably column indices - confirm
            against the vectorizer).
        add_length: Set to ``False`` to suppress the ``'LENGTH'`` entry even
            when the length feature is enabled in ``params``. Previously
            this argument was accepted but ignored.

    Returns:
        A ``numpy`` array of feature names, with ``'LENGTH'`` appended when
        ``add_length`` is true and ``'length'`` is in ``params['features']``.
    """
    feature_names = sorted(vocabulary, key=vocabulary.get)
    if add_length and 'length' in params['features']:
        feature_names.append('LENGTH')
    return np.array(feature_names)


@register('top_k_svm')
def get_top_k_SVM_features(params, model, vocabulary, **kwargs):
    """Return the highest-weighted feature names per label of a linear model.

    Args:
        params: Configuration dict forwarded to ``get_feature_names``.
        model: Fitted model exposing a ``coef_`` array with one row of
            feature weights per label.
        vocabulary: Mapping forwarded to ``get_feature_names`` to recover
            the feature names.
        **kwargs: Optional overrides:
            ``target_names`` (default ``[]``) - labels, in the same order
            as the rows of ``coef_``;
            ``top_features`` (default ``5``) - number of features reported
            per label; zero or less yields an empty list for every label.

    Returns:
        Dict mapping each processed label to a list of feature names,
        ordered from the largest coefficient downwards.
    """
    hparms = {
        'target_names': [],
        'top_features': 5,
    }
    hparms.update(kwargs)
    target_names = hparms['target_names']
    # A slice of the form [-0:] would return *every* feature, so clamp the
    # count and take the first k entries of the descending order instead.
    top_features = max(hparms['top_features'], 0)

    top_k_label = {}
    feature_names = get_feature_names(params, vocabulary)
    for i, label in enumerate(target_names):
        if len(target_names) < 3 and i == 1:
            break  # coef is unidimensional when there are only two labels
        coef = model.coef_[i]
        descending = np.argsort(coef)[::-1]
        top_k_label[label] = list(feature_names[descending[:top_features]])
    return top_k_label
18 changes: 18 additions & 0 deletions reporting/wb_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Registry for white box reports or metrics."""

# Maps a report name to the function registered under it.
_WB_REPORTS = {}


def register(name):
    """Registers a new white box report or metric function.

    Intended to be used as a decorator: ``@register('top_k_svm')``. The
    decorated function is stored under ``name`` and returned unchanged.
    Registering the same name twice replaces the earlier entry.
    """

    def add_to_dict(func):
        _WB_REPORTS[name] = func
        return func

    return add_to_dict


def get_wb_report(model):
    """Fetches the white box report or metric function.

    Args:
        model: Name the report was registered under (the parameter name is
            kept for backward compatibility with keyword callers).

    Returns:
        The function registered under that name.

    Raises:
        KeyError: If nothing is registered under ``model``. The message
            lists the names that are available.
    """
    try:
        return _WB_REPORTS[model]
    except KeyError:
        available = ", ".join(sorted(map(str, _WB_REPORTS))) or "<none>"
        raise KeyError(
            "Unknown white box report {!r}; registered reports: {}".format(
                model, available
            )
        ) from None

0 comments on commit a8e17e0

Please sign in to comment.