In [None]:
import datetime as dt

import numpy as np
import pandas as pd

from river import metrics
from river import utils
from river.proba import Gaussian
from scipy.stats import kstest

from functions.anomaly import GaussianScorer
from functions.proba import MultivariateGaussian

from functions.plot import plot_limits_3d, plot_limits_grid

In [None]:
# Load the BESS log, index it by the "time" column and keep the last 30000 of
# the first 40000 rows (rows 10000..39999 when the file is at least that long).
df = (
    pd.read_csv("data/data_BESS.csv", index_col=0)
    .set_index("time")
    .head(40000)
    .tail(30000)
)
# Parse the index as timezone-aware (UTC) timestamps.
df.index = pd.to_datetime(df.index, utc=True)
# Derived signal: difference between the string power and its active setpoint.
df = df.assign(
    **{'Power Setpoint Deviation': df['String Power'] - df['Active Power Setpoint']}
)
# Keep only the monitored signals, in this order.
# NOTE(review): "Max. Cell Temperatue" presumably mirrors the spelling of the
# CSV header - confirm before "correcting" it.
df = df.loc[:, ["Avg. Cell Temperature",
                "Max. Cell Temperatue",
                "Power Setpoint Deviation",
                "Min. Cell Temperature"]]

In [None]:
# CONSTANTS
# Length of the sliding window, in days.
days = 4
WINDOW = dt.timedelta(days=days)
# Whole minutes contained in the window.
minutes = WINDOW // dt.timedelta(minutes=1)
# Three quarters of the window length (kept as a float).
GRACE_PERIOD = 0.75 * minutes
# Thresholds handed to the scorer; THRESHOLD is also used as the two-sided
# bound on the sampling-interval score in the streaming loop.
LOG_THRESHOLD = -25
THRESHOLD = 0.99735

# System-level detector: a multivariate Gaussian wrapped in a time-based
# rolling window of length WINDOW, scored by GaussianScorer.
rolling_gaussian = utils.TimeRolling(MultivariateGaussian(), period=WINDOW)
# Alternative kept for reference (fixed-size window instead of a time-based one):
#utils.Rolling(MultivariateGaussian(), window_size=24*days*60),
model = GaussianScorer(
    rolling_gaussian,
    grace_period=GRACE_PERIOD,
    threshold=THRESHOLD,
    log_threshold=LOG_THRESHOLD,
)

# Detector for the sampling interval: a univariate Gaussian with no grace period.
sampling_model = GaussianScorer(Gaussian(), grace_period=0)

# Per-sample accumulators filled by the streaming loop below.
# Anomaly flags: whole system, per signal, and sampling interval.
system_anomaly, signal_anomaly, sampling_anomaly = [], [], []
# Change-point flag per sample.
change_point = []
# Upper / lower limits returned by the model for each sample.
list_thresh_pos, list_thresh_neg = [], []
# Model mean and number of samples seen, recorded at each step.
mus, samples = [], []
# Not filled anywhere in the visible code.
scores, logcdf = [], []
# Share of anomalous samples in the recent window, one value per step.
d = []

# Stream the samples through the detectors one row at a time.
#
# For every row the loop
#   1. asks `model` whether the sample is a system anomaly,
#   2. records the current limits and flags each signal that falls outside
#      the diagonal entries of those limits,
#   3. scores the time elapsed since the previous row with `sampling_model`,
#   4. flags a change point when the recent share of anomalies is too high,
#   5. updates `model` unless the sample is anomalous (change points always
#      update the model).

# Number of trailing samples inspected for change detection: minutes per day
# (one day of data if the sampling period is one minute - TODO confirm).
CHANGE_WINDOW = int(minutes/days)
# Share of anomalous samples in that window above which a change is declared.
CHANGE_RATIO = 0.12

for i, (t, x) in enumerate(df.iterrows()):
    t = t.tz_localize(None)  # drop the timezone, keep the wall-clock value
    x = x.to_dict()

    # Check anomaly in system
    is_anomaly = model.predict_log_one(x)
    system_anomaly.append(is_anomaly)

    # Get signal thresholds
    thresh_high, thresh_low = model.limit_one()
    list_thresh_pos.append(thresh_high)
    list_thresh_neg.append(thresh_low)

    # Check anomaly in signals
    if i != 0:
        # Only the diagonal entries of the limits are used, one per signal.
        highs = [thresh_high[j][j] for j in range(len(thresh_high))]
        lows = [thresh_low[j][j] for j in range(len(thresh_low))]
        signal_anomaly.append({k: not ((lows[j] < v) and (v < highs[j]))
                               for j, (k, v) in enumerate(x.items())})
    else:
        # The first sample is never flagged.
        signal_anomaly.append({k: 0 for k in x})

    # Check anomaly in sampling
    if i != 0:
        # total_seconds() keeps the days component and the sign of the gap;
        # `.seconds` would wrap at 24 hours and report a multi-day gap as a
        # short one.
        elapsed = (t - t_prev).total_seconds()
        score_ = sampling_model.score_one(elapsed)
        # NOTE(review): the grace period checked here is the one of the system
        # model, not of `sampling_model` - confirm this is intended.
        if model.gaussian.n_samples > model.grace_period:
            sample_a = 1 if ((1-THRESHOLD) > score_) or (score_ > THRESHOLD) else 0
        else:
            sample_a = 0
        sampling_anomaly.append(sample_a)
        # Anomalous intervals are learned with a reduced weight.
        w = 1-score_ if sample_a else 1
        sampling_model.learn_one(elapsed, w=w)
    else:
        sampling_anomaly.append(0)
    t_prev = t

    mus.append(model.gaussian.mu)
    samples.append(model.gaussian.n_samples)
    if i != 0:
        # Trailing window of system flags; the slice stops before the current
        # sample, so the flag just appended is not counted.
        recent = system_anomaly[-CHANGE_WINDOW:-1]
        ratio = sum(recent) / len(recent)
        d.append(ratio)
        is_change = ratio > CHANGE_RATIO
    else:
        is_change = 0
    change_point.append(is_change)

    if not is_anomaly or is_change:
        # assumes learn_one returns the updated model itself - TODO confirm
        model = model.learn_one(x, t=t)

# Collect the per-sample results of the streaming loop into one frame that
# shares the index of the input data.
result_columns = {
    "level_high": list_thresh_pos,
    "level_low": list_thresh_neg,
    "anomaly": system_anomaly,
    "signal_anomaly": signal_anomaly,
    "sampling_anomaly": sampling_anomaly,
    "change_point": change_point,
}
df_out = pd.DataFrame(result_columns, index=df.index)

In [None]:
# Summary of the detection run.
# An "event" is a 0 -> 1 transition of the system anomaly flag. The flags are
# cast to int first: pandas diffs boolean Series with XOR, which would count
# the end of every event as well if the model returns booleans.
anomaly_flags = pd.Series(system_anomaly).astype(int)
n_anomalous_events = int((anomaly_flags.diff() == 1).sum())

text = (f"Sliding window: {WINDOW}\n"
        f"Proportion of anomalous samples: "
        f"{sum(system_anomaly)/len(system_anomaly)*100:.02f}%\n"
        f"Total number of anomalous events: "
        f"{n_anomalous_events}\n"
        f"Total number of change points: "
        f"{sum(change_point)}\n"
        f"Total number of sampling anomalies: "
        f"{sum(sampling_anomaly)}")

print(text)

In [None]:
# Grid plot of the signals with their limits, anomalies, change points and
# sampling anomalies. The first row of the limits is skipped.
plot_limits_grid(
    df,
    df_out['anomaly'],
    ser_high=df_out['level_high'].iloc[1:],
    ser_low=df_out['level_low'].iloc[1:],
    changepoints=df_out['change_point'],
    samplings=df_out['sampling_anomaly'],
    save=True,
)

In [None]:
# 3D view of two selected signals with their limits and per-signal anomaly
# flags. The first row of the limits is skipped.
plot_limits_3d(
    df,
    df_out['anomaly'],
    ser_high=df_out['level_high'].iloc[1:],
    ser_low=df_out['level_low'].iloc[1:],
    signal_anomalies=df_out['signal_anomaly'],
    y="Power Setpoint Deviation",
    z="Avg. Cell Temperature",
    save=False,
)

# Kolmogorov-Smirnov Tests of Normality

In [None]:
# Residuals of "Avg. Cell Temperature" against the running model mean.
s_mean = pd.Series(mus, index=df.index)
# First component of each recorded mean; steps with an empty mean become NaN.
# assumes component 0 corresponds to "Avg. Cell Temperature" - TODO confirm
first_component = s_mean.apply(lambda x: float('nan') if len(x) == 0 else x[0])
residuals = (df['Avg. Cell Temperature'] - first_component).dropna()
# TODO: test various
# Standardise before comparing against the standard normal distribution.
# NOTE(review): the mean and std are estimated from the same residuals, so the
# plain KS p-value is presumably optimistic - consider a Lilliefors-type test.
standardized = (residuals - residuals.mean()) / residuals.std()
print(f"Does the normality assumption hold true: "
      f"{kstest(standardized, 'norm')}")
residuals.plot.kde()

# Jupyter Dash

In [None]:
# Jupyter Dash
import plotly.express as px
from dash import dcc, html
from dash.dependencies import Input, Output, State
from jupyter_dash import JupyterDash

# Build App
app = JupyterDash(__name__)


def _column_dropdown(caption, component_id, default_column):
    """Return a labelled dropdown offering every column of `df`.

    caption        -- text shown in front of the dropdown
    component_id   -- Dash id of the dropdown component
    default_column -- column selected initially
    """
    return html.Label([
        caption,
        dcc.Dropdown(
            id=component_id,
            value=default_column,
            options=[{'label': c, 'value': c} for c in df.columns],
        ),
    ])


# Page layout: title, one dropdown per plotted axis, and the graph itself
# wrapped in a loading indicator.
app.layout = html.Div([
    html.H1("JupyterDashboard"),
    _column_dropdown("y", 'y-dropdown', df.columns[1]),
    _column_dropdown("z", 'z-dropdown', df.columns[2]),
    dcc.Loading(dcc.Graph(id='graph')),
])

# Define callback to update graph
@app.callback(
    Output('graph', 'figure'),
    [Input("y-dropdown", "value"), Input("z-dropdown", "value")],
)
def update_figure(y, z):
    """Rebuild the 3D limits plot for the columns picked in the dropdowns.

    y, z -- column names currently selected in 'y-dropdown' and 'z-dropdown'.

    Returns whatever `plot_limits_3d` returns; it is used as the `figure`
    of the 'graph' component (presumably a plotly figure - TODO confirm).
    """
    limits = df_out.iloc[1:, :]  # the first row of the limits is skipped
    return plot_limits_3d(
        df,
        df_out.anomaly,
        ser_high=limits['level_high'],
        ser_low=limits['level_low'],
        y=y,
        z=z,
    )


# Run app and display result inline in the notebook
# (the 'inline' mode is requested from JupyterDash's run_server; it presumably
# renders the app in this cell's output - TODO confirm for the installed version)
app.run_server(mode='inline')