In [None]:
import eif as iso # enhanced isolation tree library
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the data set from the Feather file into a DataFrame.
# Later cells read the Temp1..Temp6 / SM1..SM6 columns, their "<name>_label"
# companions, and the "box" and "datetime" columns from this frame.
sel_data = pd.read_feather("birdie.feather")

In [None]:
# Names of the feature columns: Temp1..Temp6 followed by SM1..SM6.
feature_columns = [
    f"{sensor}{channel}"
    for sensor in ("Temp", "SM")
    for channel in range(1, 7)
]

In [None]:
# A row counts as "all_ok" only if every per-channel label column
# (Temp1_label..Temp6_label, SM1_label..SM6_label) equals "OK".
# The per-column boolean flags are stacked side by side (one column per
# channel) and reduced across the channel axis.
sel_data["all_ok"] = np.stack(
    [
        sel_data[f"{sensor}{channel}_label"] == "OK"
        for sensor in ("Temp", "SM")
        for channel in range(1, 7)
    ],
    axis=1,
).all(axis=1)

How many of the selected points are OK on every channel?

In [None]:
# Tally how many rows have all_ok True versus False.
sel_data["all_ok"].value_counts()

In [None]:
# Plain 2-D array holding only the feature columns, one row per sample.
feature_matrix = sel_data[feature_columns].to_numpy()
# Row mask: True where at least one feature value is NaN.
# Later cells index with ~hasnan to work on complete rows only.
hasnan = np.isnan(feature_matrix).any(axis=1)

## Isolation forests

In [None]:
# Build the forest on the rows without NaNs, then score those same rows.
# NOTE(review): no random seed is set in this notebook, so the scores may
# differ from run to run — confirm whether the installed eif version
# accepts a seed.
complete_rows = feature_matrix[~hasnan, :]
forest = iso.iForest(complete_rows, ntrees=20, sample_size=256, ExtensionLevel=2)
paths = forest.compute_paths(X_in=complete_rows)

In [None]:
# Distribution of the forest scores over all complete rows, in 200 bins.
fig, ax = plt.subplots()
ax.hist(paths, bins=200);
ax.set_xlabel("outlier score")

## Plotting with datashader

In [None]:
import holoviews as hv
import datashader as ds
import holoviews.operation.datashader as hd
from holoviews.operation.datashader import datashade

In [None]:
# Frame used for plotting: the NaN-free rows of sel_data together with
# their forest scores, ordered by box and then by time.
# `.assign` returns a new DataFrame, so the score column is added without
# writing into a selection of sel_data (which would trigger pandas'
# SettingWithCopyWarning).
sel_viz = (
    sel_data.iloc[~hasnan, :]
    .assign(outlier=paths)
    .sort_values(["box", "datetime"])
)
# Bucket the score with four equally spaced edges between 0.33 and 0.45;
# np.digitize returns the integer bin index 0..4 for each row.
sel_viz["outlier_cat"] = np.digitize(sel_viz["outlier"], np.linspace(0.33, 0.45, 4))

In [None]:
# Load the Bokeh plotting extension for HoloViews and select it as the
# backend used for rendered output.
hv.extension("bokeh")
hv.output(backend="bokeh")

Time series with outliers labeled

In [None]:
# Select the Bokeh backend for this cell's output.
hv.output(backend="bokeh")
# Curve over the rows belonging to box 39, declared with 'datetime' and
# 'SM2' as key dimensions and 'box' / 'outlier_cat' as value dimensions.
# NOTE(review): a Curve is usually declared with a single key dimension
# (x) and the plotted quantity as a value dimension — confirm that listing
# 'SM2' under kdims is intended.
lines = hv.Curve(sel_viz[sel_viz.box==39],
                  kdims=['datetime','SM2'], vdims=["box", "outlier_cat"]).opts(color="outlier_cat")
# Rasterise the curve with datashader, aggregating counts per
# 'outlier_cat' category, and display it at 800x500.
hd.datashade(lines, aggregator=ds.count_cat('outlier_cat')).opts(height=500,width=800)

## ROC

In [None]:
# Sweep the decision threshold over the observed score range and record,
# for every threshold, the sensitivity and specificity of the rule
# "score > threshold means anomalous", using all_ok == False as the
# ground-truth positive class.
n_samples = 200
sensitivities = np.zeros(n_samples)
specificities = np.zeros(n_samples)
is_ok_data = sel_data.loc[~hasnan, "all_ok"]

# Loop-invariant inputs as plain arrays, so every comparison is positional.
scores = np.asarray(paths)
is_ok = is_ok_data.to_numpy(dtype=bool)
n_not_ok = np.count_nonzero(~is_ok)  # actual positives = TP + FN
n_ok = np.count_nonzero(is_ok)       # actual negatives = TN + FP

for i, threshold in enumerate(np.linspace(np.min(scores), np.max(scores), n_samples)):
    # Every sample is either flagged (score > threshold) or not flagged
    # (score <= threshold). Samples sitting exactly on the threshold are
    # therefore counted as negatives instead of being dropped from all four
    # confusion-matrix cells, so the denominators are the fixed class totals.
    flagged = scores > threshold
    n_true_positives = np.count_nonzero(flagged & ~is_ok)
    n_true_negatives = np.count_nonzero(~flagged & is_ok)
    # An empty class leaves the rate undefined; store NaN explicitly rather
    # than relying on a 0/0 division.
    sensitivities[i] = n_true_positives / n_not_ok if n_not_ok else np.nan
    specificities[i] = n_true_negatives / n_ok if n_ok else np.nan

In [None]:
# Sensitivity against specificity for the threshold sweep, with a thin gray
# reference line running from (1, 0) to (0, 1).
fig, ax = plt.subplots()
ax.plot(sensitivities, specificities)
ax.plot([1, 0], [0, 1], color="gray", lw=0.5)
ax.set_xlabel("sensitivity")
ax.set_ylabel("specificity")

## Principal components (PCs)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Two-component PCA; it is fitted in a later cell.
pcs = PCA(n_components=2)

In [None]:
# Score cut-off used for the PCA section.
threshold = 0.4
# Fit the components only on complete rows whose score lies below the
# cut-off (presumably so that high-scoring points do not shape the axes —
# confirm intent).
below_threshold = paths < threshold
sms = pcs.fit(feature_matrix[~hasnan, :][below_threshold, :])

In [None]:
# Project every complete row onto the two fitted principal components.
transformed = pd.DataFrame(pcs.transform(feature_matrix[~hasnan, :]), columns=["PC1", "PC2"])
# A point is an outlier when its score is NOT below the cut-off, i.e. the
# complement of the `paths < threshold` set the PCA was fitted on. Higher
# scores mean "more anomalous" elsewhere in this notebook (the ROC cell
# treats `paths > threshold` as a detection), so `paths < threshold` would
# mark the inliers instead.
transformed["is_outlier"] = np.asarray(paths) >= threshold
# `transformed` has a fresh 0..n-1 index while the selection from sel_data
# keeps sel_data's index labels. Assigning the Series directly would align
# on those labels and misplace values (or insert NaN) whenever rows were
# dropped, so hand over the bare values for a positional assignment.
transformed["is_all_ok"] = sel_data.loc[~hasnan, "all_ok"].to_numpy()
# Two-bit code: bit 0 = labelled OK, bit 1 = flagged as outlier
# (0: not OK & not flagged, 1: OK & not flagged,
#  2: not OK & flagged,     3: OK & flagged).
transformed["combined"] = transformed["is_all_ok"]*1 + transformed["is_outlier"]*2

In [None]:
# RGB triples used as category colours in the datashaded scatter below.
green = (10, 200,90)
red = (230,30,0)
blue = (0,120,250)
# Select the Bokeh backend for this cell's output.
hv.output(backend="bokeh")
# Scatter of all projected points in the PC1/PC2 plane; the flag columns
# travel along as value dimensions so they can be used for aggregation.
lines = hv.Points(transformed,
                  kdims=['PC1','PC2'], vdims=["is_outlier", "is_all_ok", "combined"])
# Rasterise with datashader, counting points per value of "combined"
# (= is_all_ok*1 + is_outlier*2) and colouring 0 red, 1 and 2 green, 3 blue.
hd.datashade(lines, aggregator=ds.count_cat("combined"),
             color_key={0:red, 1:green, 2:green, 3:blue}).opts(height=500,width=800)