In [None]:
import utils.data as ud
import seaborn as sns
%matplotlib inline

import pandas as pd
import numpy as np
import sklearn.ensemble as ske

import tbtools.dev as tbdev

import utils.plotting as up

import utils.evaluation.modelling.fit as mf

# Line 1


100 trees

    tr 0.92
    val 3.12
    te 2.39
    
80%

    tr 1.06
    val 3.86
    te 2.89
    

In [None]:
# Load the train / validation / test design matrices for production line 1.
# The keyword order is kept exactly as before in case the loader is
# order-sensitive (it is a project helper whose internals are not visible here).
train, val, test = ud.design_matrices.get_by_settings(
    sample_step='2 min',
    split='all',
    line=1,
    lag='5 min',
    dcwindow='2 min',
    rn=10,
    boawindow='2 min',
)

## Finding the minimum number of trees

Minimum number of trees: 50.

In [None]:
# Sweep over forest sizes, repeated 20 times, to see how many trees are needed.
# Each repetition gets its own ResultsAggregator, so `ras` ends up holding one
# aggregator per repetition, each containing one fitted model per forest size.
#
# NOTE(review): no random_state is passed to the forests, so the RMSEs differ
# from run to run; consider seeding each repetition if the numbers need to be
# reproducible.

# Predictor columns: everything yielded by iterating `train` except the target
# 'y'. This does not change between repetitions, so it is computed once.
features = [x for x in train if x!='y']

# Forest sizes to compare; larger sizes (500, 1000) are intentionally left out.
tree_counts = (1, 2, 5, 10, 50, 75, 100, 250)

ras = []

for i in range(20):
    ra = mf.ResultsAggregator({'train':train, 'val':val, 'test':test},
                              target_name='y')

    for k in tree_counts:
        rf = ske.RandomForestRegressor(n_estimators=k, n_jobs=-1)
        rf = rf.fit(train[features], train['y'])
        # Zero-padded size keeps the model names in a sortable order.
        ra.append('rf_{:04}'.format(k), rf)

    ras.append(ra)

In [None]:
# Stack the RMSE tables of all repetitions and reshape them to long format:
# one row per (split, model, repetition) with its RMSE value.
# Presumably `to_df()['rmse']` has one column per data split and the model
# names as index -- confirm against ResultsAggregator.
df = (
    pd.concat([agg.to_df()['rmse'].reset_index() for agg in ras],
              ignore_index=True)
      .set_index('index')
      .unstack()
      .reset_index()
)
df.columns = ['split', 'model', 'rmse']

In [None]:
# RMSE versus forest size, one hue per data split. The 'whitegrid' style is
# applied only while this figure is being drawn.
with sns.axes_style('whitegrid'):
    sns.factorplot(
        data=df, x='model', y='rmse', hue='split',
        palette='colorblind', size=7, legend_out=False,
        facet_kws={'gridspec_kws': {}},
    )

# Store the figure in the weekly report folder and in the analysis folder.
up.save_fig('w19/rf_line1_rmses_20reps.png', target='week')
up.save_fig('analysis/rf_line1_rmses_20reps.png')

In [None]:
# This figure is saved under a "5reps" name, so it must only show the first
# five repetitions of the sweep. Plotting `df` directly would store every
# repetition held in `ras` under a misleading file name, so a five-repetition
# long-format table is rebuilt here with the same reshaping used for `df`.
df_5reps = (
    pd.concat([r.to_df()['rmse'].reset_index() for r in ras[:5]],
              ignore_index=True)
      .set_index('index')
      .unstack()
      .reset_index()
)
df_5reps.columns = ['split', 'model', 'rmse']

with sns.axes_style('whitegrid'):
    sns.factorplot(x='model', y='rmse', hue='split', data=df_5reps,
                  palette='colorblind', size=7, legend_out=False,
                  facet_kws={'gridspec_kws':{}})

# Store the figure in the weekly report folder and in the analysis folder.
up.save_fig('w19/rf_line1_rmses_5reps.png', target='week')
up.save_fig('analysis/rf_line1_rmses_5reps.png')

## Building and evaluating a good RF model


In [None]:
# Fit 20 independent forests of k trees on the line-1 training data, register
# each one with the aggregator, and keep a running sum of their feature
# importances in `fi` (a later cell divides by 20 to get the average).
# After the loop, `rf` is the last forest fitted.
features = [col for col in train if col != 'y']

ra = mf.ResultsAggregator({'train': train, 'val': val, 'test': test},
                          target_name='y')

k = 100
fi = None
for i in range(20):
    rf = ske.RandomForestRegressor(n_estimators=k, n_jobs=-1).fit(
        train[features], train['y'])
    ra.append('rf_L1_{:04}_{}'.format(k, i), rf)
    # The first fit starts the sum; later fits are added element-wise.
    fi = rf.feature_importances_ if fi is None else fi + rf.feature_importances_

df = ra.to_df()

#### Eval

In [None]:
# Column 0: mean of every column of `df` over its rows (presumably one row per
# fitted forest). Column 1: how far the maximum lies above that mean.
df2 = pd.concat(
    [df.mean(), df.max().sub(df.mean())],
    axis=1,
)
df2.round(2)

In [None]:
# Pair every feature with its accumulated importance `fi` (a sum over the
# fitted forests, not yet averaged) and rank from most to least important.
fidf = pd.DataFrame({'feature': features, 'importance': fi})
fidf = fidf.sort_values(by='importance', ascending=False)
fidf.head(10)

In [None]:
# Top-10 features with the summed importance turned into a per-forest average.
# NOTE(review): the divisor 20 must match the number of forests fitted in the
# model-building cell; it is hard-coded in both places.
b = fidf.head(10).assign(importance=lambda top: top['importance'] / 20)
print(b.to_latex())

#### Heatmap plots

In [None]:
# Compare targets with predictions on each split. `rf` is whichever forest was
# fitted last in the model-building loop.
for split in (train, val, test):
    mf.plot_fit(split['y'], rf.predict(split[features]))

#  Line 2

100 trees

    tr 0.90
    val 2.54
    te 2.38
    
80%

    tr 1.07
    val 3.09
    te 2.92


In [None]:
# Load the train / validation / test design matrices for production line 2.
# This rebinds `train`, `val` and `test`, so every cell below works on line 2.
# The keyword order is kept exactly as before in case the loader is
# order-sensitive (it is a project helper whose internals are not visible here).
train, val, test = ud.design_matrices.get_by_settings(
    sample_step='2 min',
    split='all',
    line=2,
    lag='5 min',
    dcwindow='2 min',
    rn=10,
    boawindow='2 min',
)

## Finding the minimum number of trees

Minimum number of trees: also 50, the same as for Line 1.

In [None]:
# Sweep over forest sizes, repeated 5 times, to see how many trees are needed
# for line 2. Each repetition gets its own ResultsAggregator, so `ras` ends up
# holding one aggregator per repetition, each containing one fitted model per
# forest size.
#
# NOTE(review): no random_state is passed to the forests, so the RMSEs differ
# from run to run; consider seeding each repetition if the numbers need to be
# reproducible.

# Predictor columns: everything yielded by iterating `train` except the target
# 'y'. This does not change between repetitions, so it is computed once.
features = [x for x in train if x!='y']

# Forest sizes to compare; 2000 is intentionally left out.
tree_counts = (1, 2, 5, 10, 50, 75, 100, 250, 500, 1000)

ras = []

for i in range(5):
    ra = mf.ResultsAggregator({'train':train, 'val':val, 'test':test},
                              target_name='y')

    for k in tree_counts:
        rf = ske.RandomForestRegressor(n_estimators=k, n_jobs=-1)
        rf = rf.fit(train[features], train['y'])
        # Zero-padded size keeps the model names in a sortable order.
        ra.append('rf_{:04}'.format(k), rf)

    ras.append(ra)

In [None]:
# Stack the RMSE tables of all repetitions and reshape them to long format:
# one row per (split, model, repetition) with its RMSE value.
# Presumably `to_df()['rmse']` has one column per data split and the model
# names as index -- confirm against ResultsAggregator.
df = (
    pd.concat([agg.to_df()['rmse'].reset_index() for agg in ras],
              ignore_index=True)
      .set_index('index')
      .unstack()
      .reset_index()
)
df.columns = ['split', 'model', 'rmse']

In [None]:
# RMSE versus forest size for line 2, one hue per data split. The 'whitegrid'
# style is applied only while this figure is being drawn.
with sns.axes_style('whitegrid'):
    sns.factorplot(
        data=df, x='model', y='rmse', hue='split',
        palette='colorblind', size=7, legend_out=False,
        facet_kws={'gridspec_kws': {}},
    )

# Store the figure in the weekly report folder.
up.save_fig('w19/rf_line2_rmses_5reps.png', target='week')

## Building and evaluating a good RF model


In [None]:
# Fit 20 independent forests of k trees on the line-2 training data, register
# each one with the aggregator, and keep a running sum of their feature
# importances in `fi` (a later cell divides by 20 to get the average).
# After the loop, `rf` is the last forest fitted.
features = [col for col in train if col != 'y']

ra = mf.ResultsAggregator({'train': train, 'val': val, 'test': test},
                          target_name='y')

k = 100
fi = None
for i in range(20):
    rf = ske.RandomForestRegressor(n_estimators=k, n_jobs=-1).fit(
        train[features], train['y'])
    ra.append('rf_L2_{:04}_{}'.format(k, i), rf)
    # The first fit starts the sum; later fits are added element-wise.
    fi = rf.feature_importances_ if fi is None else fi + rf.feature_importances_

df = ra.to_df()

# A disabled single-fit variant used to follow here: fit one forest with
# n_estimators=100, append it as 'rf_L2_{:04}'.format(k), and show ra.to_df().

In [None]:
# Column 0: mean of every column of `df` over its rows (presumably one row per
# fitted forest). Column 1: how far the maximum lies above that mean.
df2 = pd.concat(
    [df.mean(), df.max().sub(df.mean())],
    axis=1,
)
df2.round(2)

In [None]:
# Pair every feature with its accumulated importance `fi` (a sum over the
# fitted forests, not yet averaged) and rank from most to least important.
fidf = pd.DataFrame({'feature': features, 'importance': fi})
fidf = fidf.sort_values(by='importance', ascending=False)
fidf.head(10)

In [None]:
# Top-10 features with the summed importance turned into a per-forest average.
# NOTE(review): the divisor 20 must match the number of forests fitted in the
# model-building cell; it is hard-coded in both places.
b = fidf.head(10).assign(importance=lambda top: top['importance'] / 20)
print(b.to_latex())

In [None]:
# Feature importances of the single forest currently bound to `rf` (the last
# one fitted), ranked from most to least important.
# NOTE(review): this rebinds `fi`, which previously held the importances
# summed over all fitted forests. Re-running the cell that builds `fidf`
# after this one would use this DataFrame instead -- consider a separate name.
fi = pd.DataFrame({'feature': features,
                   'importance': rf.feature_importances_})
fi = fi.sort_values(by='importance', ascending=False)
fi.head(10)

#### Pred/res/target plots

In [None]:
# Compare targets with predictions on each split. `rf` is whichever forest was
# fitted last in the model-building loop.
for split in (train, val, test):
    mf.plot_fit(split['y'], rf.predict(split[features]))

# 