In [None]:
import utils.data as ud
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

In [None]:
import tbtools.dev as tbdev
import tbtools.panda as tbpd

## Assign train, validate, test labels

In [None]:
from sklearn.cross_validation import train_test_split

import utils.features as uf

In [None]:
%%time
x,y = uf.get_x_y( c_lags=('10 min',
                          '20 min',
                          '30 min'),
                  dc_settings=(
                    ('10 min', '2 min'),
                    ('20 min', '2 min'),
                    ('30 min', '2 min'),
                    ('10 min', '5 min'),
                    ('20 min', '5 min'),
                    ('30 min', '5 min'),
                ))

In [None]:
# Seconds elapsed since the earliest index entry, thinned to every 50th row.
elapsed_s = (x.index - x.index.min()).total_seconds()
fig = sns.plt.figure()
sns.plt.plot(elapsed_s[::50])

In [None]:
# Attach the target as column 'C' so features and target are shuffled and split together.
# Note that this adds the column to `x` in place.
x['C'] = y

# Hold out 20% of the rows for testing, then 25% of the remainder for validation
# (0.25 * 0.8 = 20% of all rows), leaving 60% for training. Fixed seeds keep the split repeatable.
# NOTE(review): the index looks time-like (it is resampled by hour further down); a random row
# split over features built from lagged values may leak information between splits — confirm.
_train, _test = train_test_split(x, test_size=0.2, random_state=0)
_train, _validate = train_test_split(_train, test_size=.25, random_state=1)

In [None]:
def xy(v):
    """Split a frame into ``(features, target)``, where the target is column 'C'."""
    features = v.drop('C', axis=1)
    target = v['C']
    return features, target

# Separate each split into its feature frame and target series.
(x_train, y_train), (x_validate, y_validate), (x_test, y_test) = (
    xy(part) for part in (_train, _validate, _test)
)

In [None]:
# (rows, columns) of the train, validation and test feature frames, in that order.
tuple(part.shape for part in (x_train, x_validate, x_test))

---

# Plot variables against target

In [None]:
# 2-D histograms of the first three feature columns against the target.
for col in range(3):
    if col:
        # the first histogram draws into the current figure; later ones get a fresh figure
        sns.plt.figure()
    tbpd.hist2d(x_train.iloc[:, col], y_train, integer_aligned_bins=True, square=True)


---

In [None]:
# Global results table used by score(); re-running this cell resets it to an empty frame.
perfs = pd.DataFrame()

In [None]:
# Recorded results ordered by ascending RMSE (best first).
# Guarded because an empty `perfs` has no 'rmse' column and sort_values would raise KeyError.
perfs.sort_values('rmse') if 'rmse' in perfs.columns else perfs

## Model validation

In [None]:
import sklearn.metrics as skmet

def score(model):
    """Evaluate a fitted model on the validation split and record the result.

    model: fitted estimator or Pipeline exposing ``predict``.

    Returns a Series with the keys 'rmse', 'mae', 'evs' and 'r2', named after
    ``str()`` of the estimator (for a Pipeline, of its final step). The same
    Series is added as a new row to the global ``perfs`` table.

    Reads the notebook globals ``x_validate`` and ``y_validate``.
    """
    global perfs

    pred = model.predict(x_validate)

    # Label the row with the final estimator rather than the whole pipeline.
    if isinstance(model, Pipeline):
        model = model.steps[-1][1]
    name = str(model)

    d = {
        'rmse': np.sqrt(skmet.mean_squared_error(y_validate, pred)),
        'mae': skmet.mean_absolute_error(y_validate, pred),
        'evs': skmet.explained_variance_score(y_validate, pred),
        'r2': skmet.r2_score(y_validate, pred),
    }
    s = pd.Series(d, name=name)

    # The previous `perfs.append(s)` returned a new frame that was thrown away, so nothing
    # was ever recorded. Rebind the global instead; concat also works where
    # DataFrame.append is no longer available.
    row = s.to_frame().T
    perfs = row if perfs.empty else pd.concat([perfs, row])

    return s

In [None]:
def plot_residuals(model):
    """Plot training-set residuals against every feature, one figure per column.

    model: fitted estimator or pipeline whose ``predict`` accepts ``x_train``.

    Reads the notebook globals ``x_train`` and ``y_train``. Residuals are
    ``y_train - model.predict(x_train)``.
    """
    res = y_train - model.predict(x_train)
    for i, label in enumerate(x_train.columns):
        sns.plt.figure(figsize=(6, 5))
        x = x_train.iloc[:, i]
        # Series.max()/min() are vectorised and skip NaN, unlike the builtin max()/min().
        # NOTE(review): confirm that tbpd.hist2d's vlabel/hlabel match the axes the
        # feature and the residuals are actually drawn on.
        tbpd.hist2d(x, res,
                    bins=((x.max() - x.min()), 20),
                    vlabel=label,
                    hlabel='Residuals',
                   )

In [None]:
# Repeats the tbtools imports from the top of the notebook, then rebinds `tbpd`
# to whatever `tbdev.reload` returns.
import tbtools.dev as tbdev
import tbtools.panda as tbpd
# presumably picks up local edits to tbtools.panda without a kernel restart — confirm
tbpd = tbdev.reload(tbpd)

**TODO:** consider adding a Q–Q plot of the residuals to check whether they are approximately normally distributed.

In [None]:
# Residual plots for the pipeline currently bound to `p`.
# NOTE(review): in the visible notebook `p` is first assigned in a later cell (the
# linear-regression pipeline), so this cell fails on a fresh top-to-bottom run.
plot_residuals(p)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# IPython introspection (`??` shows the source); interactive leftover, safe to delete.
MinMaxScaler??

In [None]:
class FeaturePicker:
    """Pipeline step that keeps only a chosen subset of columns.

    Stateless: fitting learns nothing, and transforming simply indexes the
    input with ``keep``.
    """

    def __init__(self, keep):
        """Store the selection.

        keep: key(s) handed to ``x[...]`` in ``transform``; the notebook passes
            a list of column names.
        """
        self.keep = keep

    def transform(self, x):
        """Return ``x[self.keep]``, i.e. ``x`` restricted to the kept columns."""
        return x[self.keep]

    def fit(self, *args, **kwargs):
        """No-op fit; returns ``self``.

        Any positional or keyword arguments are accepted and ignored, so both
        ``fit(X, y)`` and ``fit(X, y=None)`` work.
        """
        return self
    

# Models

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

## Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Unfitted estimator instances shared by the linear-regression pipelines below.
lr, mms = LinearRegression(), MinMaxScaler()

In [None]:
# Baseline: min-max scale every feature, then ordinary least squares.
# Steps are given as a list, the type scikit-learn documents for `steps`.
p = Pipeline([('MinMaxScaler', mms), ('LinearRegression', lr)])
p.fit(x_train, y_train)
score(p)

In [None]:
# Hourly means of the first feature column against hourly means of the target.
name = x_train.columns[0]
a = x_train[name].resample('H').mean()
b = y_train.resample('H').mean()

sns.plt.figure(figsize=(15,8))
sns.plt.scatter(a, b)
sns.plt.ylabel('y')
# x-axis label was missing here, unlike the matching plots for the other columns
sns.plt.xlabel(name)

In [None]:
# Hourly means of the second feature column against hourly means of the target.
name = x_train.columns[1]
a, b = (series.resample('H').mean() for series in (x_train[name], y_train))

sns.plt.figure(figsize=(15, 8))
sns.plt.scatter(a, b)
sns.plt.ylabel('y')
sns.plt.xlabel(name)

In [None]:
# Hourly means of the third feature column against hourly means of the target.
name = x_train.columns[2]
a, b = (series.resample('H').mean() for series in (x_train[name], y_train))

sns.plt.figure(figsize=(15, 8))
sns.plt.scatter(a, b)
sns.plt.ylabel('y')
sns.plt.xlabel(name)

In [None]:
# Hourly mean of the training target over time.
hourly_target = y_train.resample('H').mean()
sns.plt.figure(figsize=(15, 8))
hourly_target.plot(linewidth=0.5)
sns.plt.ylabel('Reinspection count')

In [None]:
# Hourly mean of the first feature column over time.
name = x_train.columns[0]
hourly_feature = x_train[name].resample('H').mean()
sns.plt.figure(figsize=(15, 8))
hourly_feature.plot(linewidth=0.5)
sns.plt.ylabel(name)

In [None]:
# Same scaler + linear model, restricted to two named feature columns.
# NOTE(review): the feature-building cell near the top requests lags of 10/20/30 min only —
# confirm these '5 min' columns exist in x_train, otherwise the picker raises KeyError.
# Steps are given as a list, the type scikit-learn documents for `steps`.
p = Pipeline([
        ('Only 5 min', FeaturePicker(['ΔC L=5 min W=2 min', 'C L=5 min'])),
        ('MinMaxScaler', mms),
        ('LinearRegression', lr),
    ])
p.fit(x_train, y_train)
score(p)

## Elastic net

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
# Elastic net with the L1/L2 mix chosen by cross-validation from the candidates listed.
en = ElasticNetCV(l1_ratio=(0.1, 0.3, 0.5, 0.7, 0.8, 0.9, 0.95, 0.99))
# Fresh scaler so this pipeline does not share one with the linear-regression pipelines.
mms = MinMaxScaler()
# Steps are given as a list, the type scikit-learn documents for `steps`.
p = Pipeline([('MinMaxScaler', mms), ('ElasticNetCV', en)])
p.fit(x_train, y_train)
score(p)

## Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# IPython introspection (`?` shows the docstring); interactive leftover, safe to delete.
DecisionTreeRegressor?

In [None]:
# Single-step pipeline: the tree is fitted on the unscaled features.
# A fixed random_state makes the fitted tree, and therefore its score, repeatable.
p = Pipeline([
        ('DT', DecisionTreeRegressor(random_state=0)),
    ])
p.fit(x_train, y_train)
score(p)

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Single-step pipeline: the forest is fitted on the unscaled features.
# A fixed random_state makes the fitted forest, and therefore its score and
# feature importances, repeatable.
p = Pipeline([
        ('RF', RandomForestRegressor(random_state=0)),
    ])
p.fit(x_train, y_train)
score(p)

In [None]:
# Feature names, for reading the coefficient and importance arrays shown below.
x_train.columns

Judging by the coefficients and feature importances below, the most important feature appears to be C from 5 minutes ago.

In [None]:
# Coefficients of the elastic net fitted above (on min-max-scaled features).
en.coef_

In [None]:
# Final step of the pipeline currently bound to `p` — the random forest when the
# cells are run in order.
rf = p.steps[-1][1]
# Importance scores of the fitted forest
rf.feature_importances_

---

## Straight analysis stuff

The following figure depicts the autocorrelation for C, excluding nights and weekends. It is not impressive.

In [None]:
# Autocorrelation of the hourly mean of C, keeping weekdays between 06:00 and 19:59 only.
# NOTE(review): `c` is not assigned anywhere in the visible notebook; presumably it is the
# raw C series — define or load it before this cell.
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:  # older pandas only provides pandas.tools.plotting
    from pandas.tools.plotting import autocorrelation_plot

FIRST_HOUR, LAST_HOUR = 6, 19                # hours of the day that are kept (inclusive)
HOURS_PER_DAY = LAST_HOUR - FIRST_HOUR + 1   # 14 samples survive per day
DAYS_PER_WEEK = 5                            # Monday..Friday

sns.plt.figure(figsize=(15,8))
res = c.resample('H').mean().fillna(0)
res = res[res.index.dayofweek < DAYS_PER_WEEK]
res = res[(res.index.hour >= FIRST_HOUR) & (res.index.hour <= LAST_HOUR)]
autocorrelation_plot(res.values)

# Lags count retained samples, so after filtering one day spans HOURS_PER_DAY lags and one
# working week DAYS_PER_WEEK * HOURS_PER_DAY lags (previously 24 and 5*24, which ignored
# the removed night hours). White lines mark days, black lines mark weeks.
lag_max = sns.plt.xlim()[1]
sns.plt.vlines(np.arange(0, lag_max, HOURS_PER_DAY), -1, 1, color='w', alpha=0.75)
sns.plt.vlines(np.arange(0, lag_max, DAYS_PER_WEEK * HOURS_PER_DAY), -1, 1, color='k', alpha=0.75)
