In [None]:
import utils.data as ud
import tbtools.dev as tbdev
import tbtools.panda as tbpd
import tbtools.iter as tbiter

import numpy as np
import pandas as pd

import statsmodels.api as sm

import seaborn as sns
%matplotlib inline

In [None]:
import utils.evaluation.modelling.statsmodels_linreg as slr
import utils.evaluation.modelling.fit as uemf

In [None]:
# Load the three design matrices (train / val / test) for one configuration.
# The keyword values are forwarded unchanged; their exact meaning is defined
# by ud.design_matrices.get_by_settings, which is not visible in this notebook.
design_settings = dict(
    sample_step='2 min',
    split='all',
    line=1,
    lag='5 min',
    dcwindow='2 min',
    rn=10,
    boawindow='2 min',
    add_intercept=True,
)
train, val, test = ud.design_matrices.get_by_settings(**design_settings)

### What happens if I prune features with very few nonzero elements?

In [None]:
# Pruning sweep: for every distinct per-column nonzero count found in `train`,
# keep only the features that have at least that many nonzero entries, fit an
# OLS model on sqrt(y), and record how well it fits the training data.

# Loop-invariant pieces, computed once instead of on every iteration.
features = train.drop('y', axis=1)
nonzero_counts = (features != 0).sum()   # nonzero entries per feature column
sqrt_y = np.sqrt(train['y'])             # the models are fit on sqrt(y)

records = []
for n_nonzeros in sorted((train != 0).sum().unique()):
    # Features that reach the current threshold.
    x = features[features.columns[nonzero_counts >= n_nonzeros]]
    res = sm.OLS(sqrt_y, x).fit()
    # Predictions are on the sqrt scale; square them so the RMSE is measured
    # against the untransformed target.
    _rmse = np.sqrt(np.mean(np.square(train['y'] - np.square(res.predict(x)))))
    records.append(pd.Series({'n_nonzeros': n_nonzeros,
                              'rsquared_adj': res.rsquared_adj,
                              'sqrt_mse_total': np.sqrt(res.mse_total),
                              'rmse': _rmse,
                              'n_features': x.shape[1]}, name=n_nonzeros))

# One row per threshold, indexed by the threshold value.
df = pd.concat(records, axis=1).T


In [None]:
# Training RMSE against the number of features kept (markers only, no line).
df.plot.line(x='n_features', y='rmse', marker='.', linewidth=0)

In [None]:
# Training RMSE against the nonzero-count threshold used for pruning.
df.plot(kind='scatter', x='n_nonzeros', y='rmse')

In [None]:
# Zoom in on the two ends of the sweep: thresholds below 40 and above 4000.
low_thresholds = df[df.n_nonzeros < 40]
high_thresholds = df[df.n_nonzeros > 4000]
low_thresholds.plot(x='n_nonzeros', y='rmse', marker='.', linewidth=0)
high_thresholds.plot(x='n_nonzeros', y='rmse', marker='.', linewidth=0)

In [None]:
# `tbdev` is already imported in the notebook's first cell, so no re-import is
# needed here. Presumably notify() signals that the long-running cells above
# have finished — confirm against tbtools.dev.
tbdev.notify()

### Code: Remove least significant term

In [None]:
# Single aggregator that collects every fitted model together with the three
# data splits it is evaluated on. np.square is handed over as the prediction
# modifier — presumably because the models predict sqrt(y); confirm against
# uemf.ResultsAggregator.
datasets = {'train': train, 'val': val, 'test': test}
results = uemf.ResultsAggregator(
    datasets,
    target_name='y',
    prediction_modifier=np.square,
)

def reduce(subset, name, plot_fit=True):
    """Fit-and-reduce a model from `subset` and record it in `results`.

    Calls uemf.smlr_fit_reduce with `subset` and the global `train` frame,
    then appends the returned model and feature list to the global `results`
    aggregator under `name`.

    Parameters
    ----------
    subset : starting features handed to uemf.smlr_fit_reduce
        (presumably column names of `train` — confirm).
    name : key under which the run is stored in `results`.
    plot_fit : when true (the default), also plot the residuals of the run.
    """
    fitted_model, kept_features = uemf.smlr_fit_reduce(subset, train)
    results.append(name, model=fitted_model, feature_subset=kept_features)
    if not plot_fit:
        return
    results.plot_residuals(name)

### Predicting the mean

In [None]:
# Baseline: a model whose only regressor is the intercept column.
subset = ['intercept']
reduce(subset=subset, name='only intercept')

#### Predicting the last $C$

In [None]:
# Naive baseline: use the 'C L=5 min' column directly as the prediction of y
# and store its RMSE on each split. There is no fitted model, so 'columns' is
# empty and 'model' is None.
# NOTE(review): `results` is a uemf.ResultsAggregator — confirm that it
# supports item assignment with a plain dict like this.
baseline_column = 'C L=5 min'
baseline_entry = {
    'RMSE ' + split_name: slr.rmse(frame['y'] - frame[baseline_column])
    for split_name, frame in (('test', test), ('train', train), ('val', val))
}
baseline_entry['columns'] = []
baseline_entry['model'] = None
results['predicting C L=5'] = baseline_entry

### Without alarms

In [None]:
# Start from every column except the target, 'line', and the alarm columns
# (those whose name starts with 'alm').
subset = [
    col for col in train.columns
    if col not in ('y', 'line') and not col.startswith('alm')
]
reduce(subset, 'without alarms')

### With alarms

In [None]:
# Start from every column except the target and 'line', alarm columns included.
independents = [col for col in train.columns if col not in ('y', 'line')]
# `smlr` is not defined anywhere in this notebook; `reduce` is the helper that
# fits, reduces and records a run in `results`.
reduce(independents, 'with alarms')

In [None]:
# Display the entry stored in `results` under the key 'with alarms'.
results['with alarms']

In [None]:
# Recompute the training RMSE of the 'with alarms' run by hand.
# The model and its features are read back from `results` rather than from
# leftover kernel variables: `m` is never defined in this notebook, and `x`
# would be whatever the pruning sweep happened to leave behind.
# NOTE(review): assumes the stored 'subset' is a list of `train` column names
# that the stored model's predict() accepts — confirm against ResultsAggregator.
alarm_entry = results.results['with alarms']
alarm_model = alarm_entry['model']
alarm_features = train[alarm_entry['subset']]
# Predictions are squared before being compared with y, as elsewhere in this notebook.
resid = train['y'] - np.square(alarm_model.predict(alarm_features))
slr.rmse(resid)

### With alarms, minus alarms occurring fewer than $n$ times in the training data

$n \in \{1, 5, 10, 25, 50, 75, 100, 1000\}$

In [None]:
# Repeat the reduction for several thresholds, each time starting only from
# the columns that have at least `n` strictly positive entries in `train`.
# NOTE(review): the filter is applied to every column (not only the 'alm*'
# ones) and, unlike the 'with alarms' / 'without alarms' cells, does not
# exclude 'line' — confirm that this is intended.
positive_counts = (train > 0).sum()   # per-column count, computed once
for n in (1,5,10,25,50,75,100,1000):
    print(n)  # progress marker
    subset = [col for col in train.columns
              if col != 'y' and positive_counts[col] >= n]
    # `smlr` is not defined anywhere in this notebook; `reduce` is the helper
    # that fits, reduces and records a run in `results`.
    reduce(subset, 'with {:04}+ alarms'.format(n))

In [None]:
# Call the tbtools.dev notify hook; presumably signals that the fits above
# have finished — confirm against tbtools.dev.
tbdev.notify()

# Evaluation of the statsmodels fits

We get the best performance when we do reduction on a set where all the alarms occur at least once.


In [None]:
# One row per recorded run, without the 'columns' and 'model' entries, sorted
# by training RMSE (best first).
# NOTE(review): `results` is a uemf.ResultsAggregator — confirm that
# pd.DataFrame accepts it directly.
df2 = (
    pd.DataFrame(results)
    .T
    .drop(['columns', 'model'], axis=1)
    .sort_values('RMSE train')
)
df2

In [None]:
# Line plot of every column of the sorted comparison table.
df2.plot.line()

# Most sensible model

Start reduction only with alarms that occur at least once.

In [None]:
# Start the reduction from every column (except the target) that has at least
# one nonzero entry in the training data.
has_nonzero = (train != 0).sum() > 0
subset = [col for col in train.columns[has_nonzero] if col != 'y']
reduce(subset, 'L1 final')

In [None]:
# Residual diagnostics for the 'L1 final' run on the training split:
# a 36-bin histogram followed by a Q-Q plot.
final_entry = results.results['L1 final']
resid = final_entry['residuals']['train']

resid_series = pd.Series(resid)
resid_series.hist(bins=36)
sm.qqplot(resid, line='q');

In [None]:
# Show the aggregator's to_df() output, rounded to two decimals.
results.to_df().round(2)

In [None]:
# Summary of the model stored for the 'L1 final' run.
results.results['L1 final']['model'].summary()

In [None]:
# Feature subset stored for the 'L1 final' run (presumably the features kept
# after reduction — confirm against ResultsAggregator).
results.results['L1 final']['subset']