In [None]:
import utils.data as ud
import tbtools.dev as tbdev
import tbtools.panda as tbpd
import tbtools.iter as tbiter

import numpy as np
import pandas as pd

import statsmodels.api as sm

import seaborn as sns
%matplotlib inline

In [None]:
import utils.evaluation.modelling.statsmodels_linreg as slr
import utils.evaluation.modelling.fit as uemf

In [None]:
# Load the three design matrices (bound to train, val and test) for the
# settings below; an intercept column is requested via add_intercept=True.
# NOTE(review): the meaning of the individual settings is defined in
# utils.data.design_matrices and is not visible here — confirm there.
train, val, test = ud.design_matrices.get_by_settings(
                        sample_step='2 min', split='all', 
                        line=2,
                        lag='5 min', dcwindow='2 min', 
                        rn=10, boawindow='2 min',
                        add_intercept=True)

#### Small experiment with data transformation to the [0, 1] range.

Do not use. It gives worse results.

In [None]:
def get_01_transform(data):
    """Build a column-wise min-max scaler fitted on ``data``.

    The per-column minimum and range (max - min) of ``data`` are captured
    once. Columns with a zero range get a divisor of 1, so they map to 0
    instead of triggering a division by zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column minima and maxima define the scaling.

    Returns
    -------
    callable
        ``transform(x)``: takes a DataFrame whose columns all occur in
        ``data`` and returns ``(x - min) / range`` for those columns.
    """
    lowest = data.min(axis=0)
    span = data.max(axis=0) - lowest
    # A constant column has zero range; divide by 1 there instead.
    span = span.mask(span == 0, 1)

    def transform(x):
        cols = x.columns
        return x.sub(lowest[cols]).div(span[cols])

    return transform

In [None]:
# Build the [0, 1] scaling function from the per-column min/max of the
# training design matrix.
x_transform = get_01_transform(train)

## Extract knowledge

#### What percentage of reinspection entries is handled by the main entrances?

In [None]:
# Fetch the 'bbh' data set through ud.enhanced.
# NOTE(review): its contents are not visible from this notebook — see utils.data.
bbh = ud.enhanced.get('bbh')

In [None]:
# Restrict to rows with reinspection_change == 1 and count them per Tx value.
bbh[bbh.reinspection_change == 1].Tx.value_counts()

In [None]:
# The literals below are counts typed in by hand.
# NOTE(review): presumably copied from the Tx value_counts() output, with the
# two large counts belonging to the main entrances — confirm, and re-check
# them whenever the underlying data changes.
n = sum((22165, 22151))
n_other = sum((853, 374))

# Share of the first group in the overall total.
n / (n_other + n)

#### Generate table of top-10 alarms

In [None]:
# Fetch the raw 'almhist' data (presumably the alarm history — confirm in utils.data).
alm = ud.raw.get('almhist')

In [None]:
# Ten most frequent alarm numbers as a one-column frame ('Count'),
# with the index labelled 'AlmNr'.
s = (
    alm.AlmNr
    .value_counts()
    .head(10)
    .rename('Count')
    .rename_axis('AlmNr')
    .to_frame()
)

# Emit the table as LaTeX source.
print(s.to_latex())

#### Plot alarm type count distribution

In [None]:
import utils.plotting as up

In [None]:
# Occurrence count per alarm number, ordered from most to least frequent and
# plotted against its rank. The count column is named explicitly and the alarm
# numbers are dropped from the index, because the column names produced by
# value_counts().reset_index() changed in pandas 2.0: there the 'AlmNr' column
# holds the alarm numbers rather than the counts, so y='AlmNr' would plot the
# wrong quantity.
ax = (
    alm.AlmNr
    .value_counts()
    .rename('AlmNr')
    .reset_index(drop=True)
    .to_frame()
    .plot(y='AlmNr', marker='.', linewidth=.75)
)
ax.set_xlabel('index')
ax.set_ylabel('Occurrence count')
up.save_fig('w19/almcount_powerlaw.png', target='week')

## Code: reduce

In [None]:
# Development helper: reload the fit module and rebind its alias
# (presumably so that edits to the module take effect without restarting the
# kernel — confirm against tbtools.dev.reload).
uemf = tbdev.reload(uemf)

In [None]:
# Aggregator that the reduce() helper below appends fitted models to; it is
# given all three design matrices and the name of the target column.
# NOTE(review): np.square as prediction_modifier presumably undoes a
# square-root transform of the target — confirm in uemf.ResultsAggregator.
results = uemf.ResultsAggregator({'train':train, 'val':val, 'test':test},
                                 target_name='y',
                                 prediction_modifier=np.square)

def reduce(subset, name, plot_fit=True):
    """Fit and reduce a model starting from ``subset`` and record it as ``name``.

    Relies on the notebook-level ``train`` and ``results`` objects.

    Parameters
    ----------
    subset : starting features handed to ``uemf.smlr_fit_reduce``.
    name : key under which the outcome is appended to ``results``.
    plot_fit : when true, the residuals for ``name`` are plotted as well.
    """
    fitted_model, kept_features = uemf.smlr_fit_reduce(subset, train)
    results.append(name, model=fitted_model, feature_subset=kept_features)

    if not plot_fit:
        return
    results.plot_residuals(name)

## Most sensible model

Start reduction only with alarms that occur at least once.

In [None]:
# NOTE(review): tbdev.Notify presumably signals when the block has finished — confirm.
with tbdev.Notify():
    # Start from the columns that have at least one non-zero entry,
    # leaving out the target column 'y'.
    subset = [col for col in train.columns[(train != 0).any()] if col != 'y']
    reduce(subset, 'L2 final')

In [None]:
# Training-set residuals stored for the 'L2 final' model.
resid = results.results['L2 final']['residuals']['train']

# Histogram of the residuals, followed by a Q-Q plot.
pd.Series(resid).hist(bins=36)
# line='q' adds a reference line fitted through the quartiles; the trailing
# ';' keeps the figure object from being echoed as cell output.
sm.qqplot(resid, line='q');

In [None]:
# Aggregated results as a DataFrame, rounded to two decimals.
results.to_df().round(2)

In [None]:
# Show the summary of the model stored under 'L2 final'.
results.results['L2 final']['model'].summary()

In [None]:
# Show the 'subset' entry stored for 'L2 final'
# (presumably the features kept after the reduction — confirm in uemf.ResultsAggregator).
results.results['L2 final']['subset']