In [None]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline


import tbtools.dev as tbdev

import utils.prediction as upr
import utils.data as ud
import utils.plotting as up

## Fetch data, clean it up

In [None]:
# Open the persisted results stored at <cache path>/v3results.
results = upr.PersistentResultsAggregator(path=os.path.join(ud.paths.Paths.cache, 'v3results'))

In [None]:
# Tabulate the stored results; the index labels are parsed for model/line/lag below.
df = results.to_df()

In [None]:
def extract_int(df, tomatch):
    """Return the integer that follows `tomatch` in each index label of `df`.

    `tomatch` is interpolated into a regular expression as-is, so it may itself
    contain regex syntax. Only the first match per label is used.

    Raises ValueError if any label has no `tomatch<digits>` occurrence.
    """
    # Raw string: `\d` is not a valid escape in a normal string literal.
    pattern = r'{}(\d+)'.format(tomatch)
    matches = df.index.str.findall(pattern)
    unmatched = df.index[matches.map(len) == 0]
    if len(unmatched):
        raise ValueError('no match for {!r} in index labels: {}'.format(
            pattern, list(unmatched)))
    return matches.map(lambda found: found[0]).astype(int)

def extract_str(df, tomatch):
    """Return the first substring of each index label of `df` matching `tomatch`.

    `tomatch` is wrapped in a capture group and used as a regular expression.
    """
    def first(found):
        return found[0]

    pattern = '({})'.format(tomatch)
    hits = df.index.str.findall(pattern)
    return hits.map(first).astype(str)

In [None]:
# Parse the result key (the index label) into model code, line number and lag.
# NOTE: this cell is not re-runnable on its own, because the last statement
# discards the string index that the extractors read.
df['model'] = extract_str(df, r'^\w\w')  # raw string: `\w` is a regex class, not a str escape
df['line'] = extract_int(df, 'Line')
df['lag'] = extract_int(df, 'Lag=')
# The key is now fully captured in the three columns above; drop it from the index.
df = df.reset_index(drop=True)

## Use simple comparisons to find which model/baseline is better for what lag

#### RMSE

:/

In [None]:
# For every (line, lag): take the model with the lowest train-CV mean RMSE as
# "best", then list every model whose mean - 1.96*SEM lies below the best
# model's mean + 1.96*SEM (i.e. whose interval overlaps the best one's).
metric = 'rmse'
r = df[[(metric, 'train_cv_mean'), (metric, 'train_cv_sem'), ('model', ''), ('line', ''), ('lag', '')]]
for k, gr in r.groupby(['line', 'lag']):
    gr = gr.set_index('model')
    mean = gr[(metric, 'train_cv_mean')]
    sem = gr[(metric, 'train_cv_sem')]
    lo = mean - 1.96*sem
    hi = mean + 1.96*sem
    # idxmin() returns the index *label* (the model); argmin() returns a
    # position on current pandas and must not be used for label lookup.
    best = mean.idxmin()
    contenders = gr.index[(lo < hi[best]).values]
    print(k)
    for i in contenders:
        print('\t', i, '{:.2} {:.2} {:.2}'.format(lo[i], mean[i], hi[i]))
        


In [None]:
# Same comparison as for RMSE, applied to the 'within_80%' metric: per
# (line, lag), list the models whose mean - 1.96*SEM lies below the best
# model's mean + 1.96*SEM.
# NOTE(review): "best" is the *lowest* mean, as for RMSE - confirm that lower
# is better for within_80%.
metric = 'within_80%'
r = df[[(metric, 'train_cv_mean'), (metric, 'train_cv_sem'), ('model', ''), ('line', ''), ('lag', '')]]
for k, gr in r.groupby(['line', 'lag']):
    gr = gr.set_index('model')
    mean = gr[(metric, 'train_cv_mean')]
    sem = gr[(metric, 'train_cv_sem')]
    lo = mean - 1.96*sem
    hi = mean + 1.96*sem
    # idxmin() returns the index *label* (the model); argmin() returns a
    # position on current pandas and must not be used for label lookup.
    best = mean.idxmin()
    contenders = gr.index[(lo < hi[best]).values]
    print(k)
    for i in contenders:
        print('\t', i, '{:.2} {:.2} {:.2}'.format(lo[i], mean[i], hi[i]))
        


## Train CV Standard error of the mean plots

#### code for semplot

In [None]:
def semplot(line, vals, figsize=(6,6), save=True):
    """Plot train-CV mean of metric `vals` against lag for one line, with SEM bars.

    One curve per model (read from the notebook-level `df`), each shifted
    sideways by a small per-model offset so the mean +/- 1.96*SEM bars do not
    overlap.

    Parameters
    ----------
    line : value compared against `df.line` to select the rows to plot.
    vals : first-level column name of the metric, e.g. 'rmse' or 'within_80%'.
    figsize : figure size forwarded to the pandas plot call.
    save : when true, the figure is written through `up.save_fig` twice
        (once under 'w21/' with target='week', once under 'results/').

    Raises
    ------
    ValueError if `df` has no rows for `line`.
    """
    mean_col = (vals, 'train_cv_mean')
    sem_col = (vals, 'train_cv_sem')

    subset = df[df.line == line]
    if subset.empty:
        raise ValueError('no results for line {!r}'.format(line))

    col = iter(sns.color_palette('colorblind'))
    ax = None
    jit = -.75
    # groupby('model') (not ['model']) keeps the group key a plain label.
    for k, gr in subset.groupby('model'):
        c = next(col)
        gr = gr.copy()
        gr['lag'] += jit
        gr = gr.sort_values('lag')
        with sns.axes_style('whitegrid'):
            ax = gr.plot('lag', mean_col, marker='o',
                         label=k, color=c, ax=ax, figsize=figsize)
            # Bars use this group's own (already jittered) lags, so x lines up
            # element-wise with ymin/ymax.
            ax.vlines(x=gr['lag'].values,
                      ymin=(gr[mean_col] - 1.96*gr[sem_col]).values,
                      ymax=(gr[mean_col] + 1.96*gr[sem_col]).values,
                      color=c)
        jit += .5

    # The y-range is taken over the whole of df (all lines and models), so the
    # figures produced for different lines share the same scale.
    ax.set_ylim((np.floor((df[mean_col] - 1.96*df[sem_col]).min()),
                 np.ceil((df[mean_col] + 1.96*df[sem_col]).max())))
    ax.set_xlim((-2, 64))
    ax.legend(loc='upper left')
    name = 'RMSE' if vals=='rmse' else '80th percentile'
    ax.set_ylabel('{} Line {}'.format(name, line))
    ax.set_xticks(df.lag.unique())
    if save:
        # '%' is kept out of the file name: 'within_80%' is saved as 'w80'.
        tag = 'w80' if ('%' in vals and '80' in vals) else vals
        p = 'results/{}_sem_l{}.png'.format(tag, line)
        up.save_fig('w21/' + p, target='week')
        up.save_fig(p)

#### rmse

In [None]:
# Line 1, RMSE; save=True by default, so the figure is also written to disk.
semplot(1, 'rmse')

In [None]:
# Line 2, RMSE; save=True by default, so the figure is also written to disk.
semplot(2, 'rmse')

#### 80th

In [None]:
# Line 1, 'within_80%' metric (saved under the 'w80' file tag).
semplot(1, 'within_80%')

In [None]:
# Line 2, 'within_80%' metric (saved under the 'w80' file tag).
semplot(2, 'within_80%')

## traincv/train/val plots

#### code

In [None]:
# Long format: one row per (result, split). stack() moves the second column
# level into the index; bfill() then copies model/line/lag (which exist only
# on the ''-split row of each result) onto that result's other rows.
# NOTE(review): this relies on the '' row being stacked *after* the other
# splits of the same result - confirm if the column layout changes.
df2 = df.stack().bfill().reset_index(level=1)
df2 = df2.rename(columns={'level_1': 'split'})

# Keep only the splits plotted here. This also drops the '' and
# 'train_cv_sem' rows; .copy() makes the assignment below write to an
# independent frame rather than a filtered view.
df2 = df2[df2['split'].isin(('train', 'val', 'train_cv_mean'))].copy()

# __getitem__ (rather than the dict itself) makes an unknown model code raise
# KeyError instead of silently becoming NaN.
df2['model'] = df2['model'].map({'LR': 'Linear regression',
                                 'RF': 'Random forest',
                                 'BM': 'Baseline: Mean',
                                 'BC': 'Baseline: Last C'}.__getitem__)

In [None]:
def plotme(metric):
    """Facet `metric` by split (columns) and line (rows) from `df2`, one hue per model.

    On the 'train_cv_mean' facets, vertical bars of mean +/- 1.96*SEM are
    overlaid, computed from the wide frame `df`.

    Raises ValueError if `df2` contains no 'train_cv_mean' split.
    """
    with sns.plotting_context("notebook", font_scale=1.25):
        fg = sns.factorplot(x='lag', y=metric, hue='model',
                       col='split', row='line',
                       data=df2, palette='colorblind',
                       margin_titles=True, ylim=(0,None), scale=.5,
                       legend_out=True)

        jitscale = 12

        # Look up where the CV facet ended up instead of assuming column 1;
        # the column order follows the data, not this code.
        cv_col = list(fg.col_names).index('train_cv_mean')

        for row, line in enumerate(fg.row_names):
            ax = fg.facet_axis(row, cv_col)
            # Start at -0.125 and step by 1/12: four bars end up centred on
            # each x tick.
            jit = -2/jitscale + 1/(2*jitscale)
            # NOTE(review): bar colours follow the sorted model codes in df,
            # while the hues above come from df2's display names - confirm
            # both orders agree.
            colors = iter(sns.color_palette('colorblind'))
            for model, gr in df[df.line==line].groupby('model'):
                c = next(colors)
                gr = gr.sort_values('lag')
                sem = gr[(metric, 'train_cv_sem')]
                mea = gr[(metric, 'train_cv_mean')]
                mi = mea - 1.96*sem
                ma = mea + 1.96*sem

                # The x axis is categorical: lags sit at positions 0..n-1.
                ax.vlines(x=np.arange(len(gr)) + jit,
                          ymin=mi,
                          ymax=ma,
                          color=c)
                jit += 1/jitscale
    

#### rmse

In [None]:
# Draw the RMSE facets, then save the figure via up.save_fig under two paths.
plotme('rmse')
up.save_fig('w21/results/rmse_train_cv_val.png', target='week')
up.save_fig('results/rmse_train_cv_val.png')

#### 80th


In [None]:
# Draw the 'within_80%' facets, then save the figure via up.save_fig under two paths.
plotme('within_80%')

up.save_fig('w21/results/w80_train_cv_val.png', target='week')
up.save_fig('results/w80_train_cv_val.png')

## test/special plots

#### code

In [None]:
# Long format: one row per (result, split). stack() moves the second column
# level into the index; bfill() then copies model/line/lag (which exist only
# on the ''-split row of each result) onto that result's other rows.
# NOTE(review): this relies on the '' row being stacked *after* the other
# splits of the same result - confirm if the column layout changes.
df3 = df.stack().bfill().reset_index(level=1)
df3 = df3.rename(columns={'level_1': 'split'})

# Keep only the held-out splits. This also drops the '' and 'train_cv_sem'
# rows; .copy() makes the assignment below write to an independent frame
# rather than a filtered view.
df3 = df3[df3['split'].isin(('test', 'special'))].copy()

# __getitem__ (rather than the dict itself) makes an unknown model code raise
# KeyError instead of silently becoming NaN.
df3['model'] = df3['model'].map({'LR': 'Linear regression',
                                 'RF': 'Random forest',
                                 'BM': 'Baseline: Mean',
                                 'BC': 'Baseline: Last C'}.__getitem__)

In [None]:
def plotme2(metric):
    """Facet `metric` from `df3` by split (columns) and line (rows), one hue per model."""
    facet_options = dict(x='lag', y=metric, hue='model',
                         col='split', row='line',
                         data=df3, palette='colorblind',
                         margin_titles=True, ylim=(0,None), scale=.5)
    with sns.plotting_context("notebook", font_scale=1.25):
        grid = sns.factorplot(**facet_options)
        # Re-apply the zero lower bound on every facet after drawing.
        grid.set(ylim=(0,None))


#### rmse and w80

In [None]:
# RMSE on the 'test' and 'special' splits; saved via up.save_fig under two paths.
plotme2('rmse')
up.save_fig('w21/results/rmse_test_special.png', target='week')
up.save_fig('results/rmse_test_special.png')

In [None]:
# 'within_80%' on the 'test' and 'special' splits; saved via up.save_fig under two paths.
plotme2('within_80%')
up.save_fig('w21/results/w80_test_special.png', target='week')
up.save_fig('results/w80_test_special.png')

# What did the models include?

Oh, I didn't save the models.

In [None]:
# Inspect one stored entry: model 'RF', line 1, 10-minute lag.
results.results['RF Line1 Lag=10 min']

# What do the residuals look like?

In [None]:
# Index of every stored result: the raw key plus the model code, line and lag
# parsed out of it. Raw strings keep `\w` / `\d` as regex classes instead of
# (invalid) string escapes.
keys = pd.Series(list(results.results.keys()))
dd = pd.DataFrame({'key':keys,
              'model':keys.str.findall(r'^(\w\w)').map(lambda x: x[0]),
              'line':keys.str.findall(r'Line([12])').map(lambda x: x[0]).astype(int),
              'lag':keys.str.findall(r'Lag=(\d+) min').map(lambda x: x[0]).astype(int)})

In [None]:
from scipy import stats

In [None]:
# Shapiro-Wilk p-value of the residuals for every model / line / lag / split.
# res[model][line] is an array of shape (number of lags, 4): rows follow
# dd.lag.unique(), columns follow the key order of the 'residuals' mapping.
# NOTE(review): the 4 assumes at most four splits per result - confirm.
res = {}
lags = dd.lag.unique()
# unique(): dd has one row per result key, so iterating dd.model directly
# would recompute every model once per (line, lag) combination.
for model in dd.model.unique():
    res[model] = {}
    for line in (1,2):
        norm = np.zeros((len(lags), 4), dtype=float)
        for i, lag in enumerate(lags):
            # `rr` is deliberately left bound after the loop: the next cell
            # reads the split names from rr.keys().
            rr = results.results['{} Line{} Lag={} min'.format(model, line, lag)]['residuals']
            for j, split in enumerate(rr.keys()):
                norm[i,j] = stats.shapiro(rr[split].values)[1]
        res[model][line] = norm

In [None]:
# Collect the p-value arrays into one table indexed by (line, model, lag) with
# one column per split. pd.Panel4D no longer exists in pandas; a MultiIndex
# frame is the replacement and holds the same values.
split_names = list(rr.keys())  # `rr` is left over from the Shapiro loop in the previous cell
lag_values = dd.lag.unique()   # same order as the rows of each res[model][line]
frames = {}
for line in (1,2):
    for model in res.keys():
        ff = pd.DataFrame(res[model][line],
                          index=pd.Index(lag_values, name='lag'),
                          columns=pd.Index(split_names, name='split'))
        frames[(line, model)] = ff.sort_index()
pp = pd.concat(frames, names=['line', 'model'])
pp

In [None]:
def _plotwithstd(ax, xlabel, ylabel, x, y, zeroline=True):
    s = (pd.Series(y.values, index=x).sort_index()
         .rolling(len(y)//10, center=True)) # len(y)//10
    mean = s.mean().dropna()
    std = (s.std().dropna()
           .rolling(10, center=True, min_periods=3)
           .mean().dropna())

    ax.scatter(x, y, alpha=.1, marker=',')
    ax.plot(mean.index.values, mean.values, color='r', label='mean')
    ax.plot(std.index.values, std.values, color='k', linewidth=3.5)
    ax.plot(std.index.values, std.values, color='w', label='std', linewidth=2)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True, color='#DDDDDD')
    ax.set_ylim((-15, 20))

def plotstuff(model, line, 
              splits=('train', 'val', 'test', 'special'),
              lags = (5,10,15,20,30,45,60),
              figsize=(16, (4+4)*16)):
    """Draw a grid of residual diagnostics for one model and line.

    Columns are `splits`. Each lag in `lags` that has a stored result
    contributes eight rows:
      0 residual histogram, 1 Q-Q plot, 2 autocorrelation,
      3 residuals vs. predicted value, 4 residuals vs. hour of day,
      5-7 residuals vs. the first design-matrix columns whose names start
      with 'C ', the delta symbol and 'R '.

    Parameters
    ----------
    model : model code as stored in `dd.model` (e.g. 'LR').
    line : line number as stored in `dd.line`.
    splits : keys of the stored 'residuals' mapping to plot, one column each.
    lags : lags to include; lags without a stored result leave empty axes.
    figsize : size of the whole figure.
    """
    with tbdev.Notify('Yipeee!'):

        plotpermodel = 4+4

        # NOTE(review): `sns.plt` and hist(normed=...) are not available in
        # current seaborn / matplotlib - confirm the pinned versions before
        # upgrading.
        with sns.axes_style('white'):
            # squeeze=False keeps `axs` two-dimensional even for one split.
            fig, axs = sns.plt.subplots(len(lags)*plotpermodel, len(splits),
                                        figsize=figsize, squeeze=False)
        # rows: one block of `plotpermodel` axes per stored lag
        dg = dd[(dd.model==model) & (dd.line==line) & (dd.lag.isin(lags))].sort_values('lag')
        for block, (k, lag) in enumerate(zip(dg.key, dg.lag)):
            rr = results.results[k]['residuals']
            top = block * plotpermodel
            dms = ud.design_matrices.get_by_settings('2 min', line, lag, '2 min', 10, '2 min')
            # columns: splits
            for j, split in enumerate(splits):
                resid = rr[split]
                # residual is stored as (observed - predicted), per this subtraction
                pred = dms[split]['y'] - resid

                ax = resid.hist(ax=axs[top,j], normed=True, range=(-15,15), bins=30)
                sm.qqplot(resid, ax=axs[top+1,j], line='q')
                ax.set_xlabel('')
                ax.set_ylabel('')
                ax.set_title('{}{} | split = {} | lag = {} | n={}'.format(
                        model, line, split, lag, len(resid)))
                ax = axs[top+1,j]
                ax.set_xlabel('')
                ax.set_ylabel('')

                ax = axs[top+2,j]
                # maxlags is used for array slicing and must be an integer.
                ax.acorr(resid, usevlines=True, maxlags=int(10 + (lag - 5)*.7))

                with sns.axes_style('whitegrid'):

                    _plotwithstd(axs[top+3,j], 'Predicted value', 'Residuals', pred, resid)

                    # hours elapsed since the start of each timestamp's day
                    h = (resid.index.values - resid.index.to_series().dt.floor('D')).dt.total_seconds() / 3600
                    _plotwithstd(axs[top+4,j], 'Hours since midnight', 'Residuals', h, resid)

                    x = dms[split]['x']
                    ccol = [c for c in x if c.startswith('C ')][0]
                    dccol = [c for c in x if c.startswith('Δ')][0]
                    rcol = [c for c in x if c.startswith('R ')][0]
                    for m, col in enumerate((ccol, dccol, rcol)):
                        _plotwithstd(axs[top+5+m,j], col, 'Residuals', x[col], resid)

        fig.tight_layout()

### LR

#### line 1

In [None]:
# Residual diagnostics for model 'LR' on line 1, default splits and lags.
plotstuff('LR', 1)

#### line 2

In [None]:
# Residual diagnostics for model 'LR' on line 2, default splits and lags.
plotstuff('LR', 2)

### RF

#### line 1

In [None]:
# Residual diagnostics for model 'RF' on line 1, default splits and lags.
plotstuff('RF', 1)

#### line 2

In [None]:
# Residual diagnostics for model 'RF' on line 2, default splits and lags.
plotstuff('RF', 2)

### BM

#### Line 1

In [None]:
# Residual diagnostics for model 'BM' on line 1, default splits and lags.
plotstuff('BM', 1)

### For report

In [None]:
# Compact version for the report: model 'LR', line 1, train/val only, 45-minute lag.
plotstuff('LR', 1, splits=('train', 'val'), lags=(45,),
          figsize=(8,12.5))

up.save_fig('results/linregeval.png', pad=.5)

## Try to build a hist/qqplot for each model, across all the data

In [None]:
# One row per model: histogram (left) and Q-Q plot (right) of the residuals
# pooled over every stored result of that model and over the chosen splits.
models = ('LR', 'RF', 'BC', 'BM')
splits = (
#             'train', 
            'val',
            'test',
            'special',
)

with sns.axes_style('white'):
    # The row count follows `models` instead of a hard-coded 4.
    fig, axs = sns.plt.subplots(len(models), 2, figsize=(8,10), squeeze=False)

# Separate names: earlier cells read the notebook-level `rr` and `keys`, so
# rebinding them here would break re-running those cells.
all_results = results.results

for i, model in enumerate(models):
    model_keys = [key for key in all_results.keys() if key.startswith(model)]
    dat = pd.concat([pd.Series(all_results[k]['residuals'][split])
                     for k in model_keys for split in splits])
    ax = axs[i,0]
    dat.hist(ax=ax, range=(-15,15), bins=30)
    sm.qqplot(dat, ax=axs[i,1], line='q')
    ax.set_title('model = {} (splits {})'.format(model, splits))

fig.tight_layout()
                
# fig.text(0.5, 0.04, 'Theoretical quantiles', ha='center', va='center')
# fig.text(0.06, 0.5, 'Fraction', ha='center', va='center', rotation='vertical')


# 