### import statements

In [None]:
%matplotlib inline
import sys
import os
import pandas as pd
sys.path.append('../src')
# from utils import load_weather
import datetime
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import gaussian_kde

### constants

In [None]:
# Relative path to the merged HDF5 data file read by pd.read_hdf below.
F_INPUT = '../../data/data.merged.pandas.h5'

In [None]:
# Input columns fed to the regressor, one per line for easier diffing.
feats = [
    'pressure',
    'relative_humidity',
    'temperature',
    'wind_speed',
    'month',
    'Cn2',
    'solar_zenith_angle',
]
# Column the regressor is trained to predict.
label = 'r0'

### read data

In [None]:
# Load the table stored under the 'resampled/median' key of the HDF5 file.
df = pd.read_hdf(F_INPUT, key='resampled/median')

### add new features

In [None]:
# Month number of each sample, taken from the DataFrame's datetime index.
# The index is read directly: going through df['datenum'] first only returned
# the same index while adding a needless dependency on that column.
df['month'] = df.index.month

### restricting data to usable, relatively dense subset

In [None]:
# Keep only samples from 2018-10-01 (inclusive) up to 2019-01-01 (exclusive).
df_subset = df.loc[
    (df.index >= '2018-10-01')
    & (df.index < '2019-01-01')
]

In [None]:
# Summary statistics of every column in the restricted date window
# (shown as the cell output).
df_subset.describe()

### finding rows without NaN values

In [None]:
# Row mask: True only where every column holds a value. Note that this checks
# all columns of df_subset, not just the features and the label.
valid = df_subset.notnull().all(axis=1)

In [None]:
# Per-column count of the rows that survive the completeness mask.
df_subset.loc[valid].count()

### splitting into train and test

In [None]:
# Chronological hold-out: samples up to and including SPLIT_DATE train the
# model, everything after it is kept for testing. The boundary is defined once
# so the two masks cannot drift apart.
# NOTE(review): a date-only string is presumably compared as midnight of that
# day, so samples later on 2018-12-10 land in the test set - confirm intended.
SPLIT_DATE = '2018-12-10'
train = df_subset.index <= SPLIT_DATE
test = df_subset.index > SPLIT_DATE

In [None]:
# Non-null count per feature among rows that are both in the training period
# and complete (shown as the cell output).
df_subset.loc[train & valid,feats].count()

In [None]:
# Non-null count per feature among rows that are both in the test period
# and complete (shown as the cell output).
df_subset.loc[test & valid,feats].count()

### initializing and fitting the RF regressor

In [None]:
# Random forest with 100 trees and a fixed seed.
regr = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)
# Fit on the rows that are both in the training period and complete.
regr.fit(
    X=df_subset.loc[train & valid, feats],
    y=df_subset.loc[train & valid, label],
)

### predict on test set

In [None]:
# Predictions for the complete rows of the test period.
preds = regr.predict(
    X=df_subset.loc[test & valid, feats],
)

### evaluation

In [None]:
# Score the fitted model on the complete rows of the test period.
r2 = regr.score(
    X=df_subset.loc[test & valid, feats],
    y=df_subset.loc[test & valid, label],
)

In [None]:
# Show the value returned by regr.score for the test rows.
r2

In [None]:
def error_diff(targ, pred):
    """Return the signed residual between targets and predictions.

    Args:
        targ: Actual value(s).
        pred: Predicted value(s); must support being subtracted from ``targ``.

    Returns:
        ``targ - pred``; positive where the prediction is below the target.
    """
    residual = targ - pred
    return residual
def error_perc(targ, pred):
    """Return the signed error relative to the target.

    The result is a fraction (0.25 means 25 %); it is not multiplied by 100.
    Zero targets are not guarded against.

    Args:
        targ: Actual value(s); used as the divisor.
        pred: Predicted value(s).

    Returns:
        ``(targ - pred) / targ``.
    """
    residual = targ - pred
    return residual / targ

### plots

In [None]:
# Bin edges for r0: 0, 0.5, ..., 24.5.
r0_bins = np.arange(0, 25, 0.5)
# 2-D coordinate grids built from the same edges on both axes
# ('xy' is numpy's default indexing, spelled out here).
act_pred_scatter_xbins, act_pred_scatter_ybins = np.meshgrid(
    r0_bins, r0_bins, indexing='xy'
)

In [None]:
# Side-by-side diagnostics for the complete test rows: predicted vs. actual r0
# on the left, relative error vs. actual r0 on the right.
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
s = 25   # marker size
a = 0.4  # marker opacity

# Look the test targets up once instead of re-slicing for every call.
actual = df_subset.loc[test & valid, label]

ax[0].scatter(actual, preds,
              edgecolor='k', c="cornflowerblue", s=s, alpha=a)
# Identity line: points lying on it are perfect predictions.
x = np.linspace(actual.min(), actual.max(), 1000)
ax[0].plot(x, x, 'r-')
ax[0].set_xlabel("Actual r0")
ax[0].set_ylabel("Predicted r0")
ax[0].set_xticks(np.arange(0, 25))
ax[0].set_yticks(np.arange(0, 25))

ax[1].scatter(actual, error_perc(actual, preds),
              edgecolor='k', c="forestgreen", s=s, alpha=a)
# Zero-error reference line.
ax[1].plot(x, np.zeros(x.shape), 'r-')
ax[1].set_xlabel("Actual r0")
# The plotted quantity comes from error_perc (error relative to the target),
# so the axis is labelled accordingly rather than as a plain "Error".
ax[1].set_ylabel("Perc Error r0")
ax[1].set_xticks(np.arange(0, 25))


plt.show()


In [None]:
# Side-by-side diagnostics for the complete test rows: predicted vs. actual r0
# on the left, relative error vs. actual r0 on the right.
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
s = 25   # marker size
a = 0.4  # marker opacity

# Look the test targets up once instead of re-slicing for every call.
actual = df_subset.loc[test & valid, label]

ax[0].scatter(actual, preds,
              edgecolor='k', c="cornflowerblue", s=s, alpha=a)
# Identity line: points lying on it are perfect predictions.
x = np.linspace(actual.min(), actual.max(), 1000)
ax[0].plot(x, x, 'r-')
ax[0].set_xlabel("Actual r0")
ax[0].set_ylabel("Predicted r0")
ax[0].set_xticks(np.arange(0, 25))
ax[0].set_yticks(np.arange(0, 25))

ax[1].scatter(actual, error_perc(actual, preds),
              edgecolor='k', c="forestgreen", s=s, alpha=a)
# Zero-error reference line.
ax[1].plot(x, np.zeros(x.shape), 'r-')
ax[1].set_xlabel("Actual r0")
# The plotted quantity comes from error_perc (error relative to the target),
# so the axis is labelled accordingly rather than as a plain "Error".
ax[1].set_ylabel("Perc Error r0")
ax[1].set_xticks(np.arange(0, 25))


plt.show()


### evaluation

In [None]:
# Score the fitted model on the complete rows of the test period.
r2 = regr.score(
    X=df_subset.loc[test & valid, feats],
    y=df_subset.loc[test & valid, label],
)

In [None]:
# Show the value returned by regr.score for the test rows.
r2

In [None]:
def error_diff(targ, pred):
    """Return the signed residual between targets and predictions.

    Args:
        targ: Actual value(s).
        pred: Predicted value(s); must support being subtracted from ``targ``.

    Returns:
        ``targ - pred``; positive where the prediction is below the target.
    """
    residual = targ - pred
    return residual
def error_perc(targ, pred):
    """Return the signed error relative to the target.

    The result is a fraction (0.25 means 25 %); it is not multiplied by 100.
    Zero targets are not guarded against.

    Args:
        targ: Actual value(s); used as the divisor.
        pred: Predicted value(s).

    Returns:
        ``(targ - pred) / targ``.
    """
    residual = targ - pred
    return residual / targ

### plots

In [None]:
# Bin edges for r0: 0, 0.5, ..., 24.5.
r0_bins = np.arange(0, 25, 0.5)
# 2-D coordinate grids built from the same edges on both axes
# ('xy' is numpy's default indexing, spelled out here).
act_pred_scatter_xbins, act_pred_scatter_ybins = np.meshgrid(
    r0_bins, r0_bins, indexing='xy'
)

In [None]:
# Two panels for the complete test rows: predicted vs. actual r0 (left) and
# relative error vs. actual r0 (right).
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
s = 25   # marker size
a = 0.4  # marker opacity

# Test targets and the shared integer tick positions, computed once.
y_true = df_subset.loc[test & valid, label]
r0_ticks = np.arange(0, 25)

# --- left panel: prediction against truth -------------------------------
ax[0].scatter(y_true, preds, edgecolor='k', c="cornflowerblue", s=s, alpha=a)
# ax[0].hist2d(df_subset.loc[test&valid,label],
#            preds,act_pred_scatter_bins, cmap='jet')
# NOTE(review): act_pred_scatter_bins is not defined in the visible cells.
x = np.linspace(y_true.min(), y_true.max(), 1000)
ax[0].plot(x, x, 'r-')  # identity line = perfect prediction
ax[0].set_xlabel("Actual r0")
ax[0].set_ylabel("Predicted r0")
ax[0].set_xticks(r0_ticks)
ax[0].set_yticks(r0_ticks)

# --- right panel: relative error against truth --------------------------
rel_err = error_perc(y_true, preds)
ax[1].scatter(y_true, rel_err, edgecolor='k', c="forestgreen", s=s, alpha=a)
ax[1].plot(x, np.zeros(x.shape), 'r-')  # zero-error reference
ax[1].set_xlabel("Actual r0")
ax[1].set_ylabel("Perc Error r0")
ax[1].set_xticks(r0_ticks)


plt.show()
