### import statements

In [None]:
%matplotlib inline
import sys
import os
import pandas as pd
sys.path.append('../src')
# from utils import load_weather
import datetime
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import gaussian_kde
import seaborn as sns
import scipy.stats as stats

### constants

In [None]:
# Relative path of the input file handed to pd.read_hdf in the read-data cell.
F_INPUT = '../../data/data.merged.pandas.h5'

In [None]:
# Columns used as model inputs.
feats = [
    'pressure',
    'relative_humidity',
    'temperature',
    'wind_speed',
    'month',
    'Cn2',
    'solar_zenith_angle',
    'day',
]
# Column the model is trained to predict.
label = 'r0'

### read data

In [None]:
# Load the table stored under the 'resampled/median' key of the input file.
df = pd.read_hdf(F_INPUT, key='resampled/median')

### add new features

In [None]:
# Calendar features derived from the frame's datetime-like index.
# A column's index is the frame's own index, so read it from df.index directly
# instead of detouring through the 'datenum' column (which was never used for
# its values and only made this cell depend on that column existing).
df['month'] = df.index.month
df['day'] = df.index.dayofyear  # day of the year, not day of the month

### restricting data to usable, relatively dense subset

In [None]:
# Keep rows from 2018-10-01 (inclusive) up to 2019-01-01 (exclusive).
df_subset = df.loc[(df.index >= '2018-10-01') & (df.index < '2019-01-01')]

In [None]:
# Summary statistics of the restricted subset, shown as the cell output.
df_subset.describe()

### finding non-nan values

In [None]:
# Row mask that is True only where every column of the row is populated.
# Note: all columns of df_subset are checked, not just `feats` and `label`.
valid = df_subset.notnull().all(axis=1)

In [None]:
# Per-column count of the fully populated rows.
df_subset.loc[valid].count()

### splitting into train and test

In [None]:
# Chronological hold-out: rows up to the cut-off train the model,
# strictly later rows are kept for testing.
split_date = '2018-12-10'
train = df_subset.index <= split_date
test = df_subset.index > split_date

In [None]:
# Per-feature count of the fully populated rows in the training split.
df_subset.loc[train & valid,feats].count()

In [None]:
# Per-feature count of the fully populated rows in the test split.
df_subset.loc[test & valid,feats].count()

### initializing and fitting the RF regressor

In [None]:
# Fit a 100-tree random forest on the fully populated training rows;
# the fixed random_state keeps the fit repeatable across runs.
train_rows = train & valid
regr = RandomForestRegressor(random_state=0, n_estimators=100)
regr.fit(df_subset.loc[train_rows, feats], df_subset.loc[train_rows, label])

### predict on test set

In [None]:
# Predict the label for every fully populated row of the test split.
test_rows = test & valid
preds = regr.predict(df_subset.loc[test_rows, feats])

### evaluation

In [None]:
# Score the fitted regressor on the fully populated test rows.
test_rows = test & valid
r2 = regr.score(df_subset.loc[test_rows, feats], df_subset.loc[test_rows, label])

In [None]:
# Display the test-set score computed in the previous cell.
r2

In [None]:
def error_diff(targ, pred):
    """Return the signed error ``targ - pred`` (positive when the prediction is too low)."""
    residual = targ - pred
    return residual


def error_perc(targ, pred):
    """Return the signed error relative to the target, ``(targ - pred) / targ``.

    The result is a fraction (0.25 means 25 %), not a value scaled by 100.
    """
    return error_diff(targ, pred) / targ

### plots

In [None]:
# Bin edges from 0 up to (not including) 25 in steps of 0.5, plus the
# matching 2-D grids of x and y bin positions.
r0_bins = np.arange(start=0, stop=25, step=0.5)
act_pred_scatter_xbins, act_pred_scatter_ybins = np.meshgrid(r0_bins, r0_bins, indexing='xy')

#### scatter plots of actual vs. predicted using error_diff

In [None]:
# Left panel: predicted vs. actual r0 with the identity line.
# Right panel: absolute error (actual - predicted) vs. actual r0 with a zero line.
actual = df_subset.loc[test & valid, label]
tick_positions = np.arange(0, 25)
s = 25   # marker size
a = 0.4  # marker transparency

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
pred_ax, err_ax = ax

pred_ax.scatter(actual, preds, edgecolor='k', c="cornflowerblue", s=s, alpha=a)
# ax[0].hist2d(df_subset.loc[test&valid,label],
#            preds,act_pred_scatter_bins, cmap='jet')
# Reference line spanning the observed range of actual values.
x = np.linspace(actual.min(), actual.max(), 1000)
pred_ax.plot(x, x, 'r-')
pred_ax.set(xlabel="Actual r0", ylabel="Predicted r0",
            xticks=tick_positions, yticks=tick_positions)

err_ax.scatter(actual, error_diff(actual, preds),
               edgecolor='k', c="forestgreen", s=s, alpha=a)
err_ax.plot(x, np.zeros_like(x), 'r-')
err_ax.set(xlabel="Actual r0", ylabel="Error r0", xticks=tick_positions)

plt.show()


#### scatter plots of actual vs. predicted using error_perc

In [None]:
# Left panel: predicted vs. actual r0 with the identity line.
# Right panel: relative error ((actual - predicted) / actual) vs. actual r0 with a zero line.
actual = df_subset.loc[test & valid, label]
tick_positions = np.arange(0, 25)
s = 25   # marker size
a = 0.4  # marker transparency

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
pred_ax, err_ax = ax

pred_ax.scatter(actual, preds, edgecolor='k', c="cornflowerblue", s=s, alpha=a)
# ax[0].hist2d(df_subset.loc[test&valid,label],
#            preds,act_pred_scatter_bins, cmap='jet')
# Reference line spanning the observed range of actual values.
x = np.linspace(actual.min(), actual.max(), 1000)
pred_ax.plot(x, x, 'r-')
pred_ax.set(xlabel="Actual r0", ylabel="Predicted r0",
            xticks=tick_positions, yticks=tick_positions)

err_ax.scatter(actual, error_perc(actual, preds),
               edgecolor='k', c="forestgreen", s=s, alpha=a)
err_ax.plot(x, np.zeros_like(x), 'r-')
err_ax.set(xlabel="Actual r0", ylabel="Perc Error r0", xticks=tick_positions)

plt.show()


#### time domain plot of errors

switching matplotlib to notebook mode to enable a zoom-in of different portions of the time axis

In [None]:
%matplotlib notebook 
# Three stacked panels over the test period: actual vs. predicted r0,
# the absolute error, and the relative error.
actual = df_subset.loc[test & valid, label]
timestamps = actual.index

fig, ax = plt.subplots(3, 1)
series_ax, diff_ax, perc_ax = ax

series_ax.plot(timestamps, actual, 'gx', label='actual')
series_ax.plot(timestamps, preds, 'ro', label='predicted')
series_ax.set(xlabel="Datetime", ylabel="r0")
series_ax.legend()

diff_ax.plot(timestamps, error_diff(actual, preds), 'bx')
diff_ax.set(xlabel="Datetime", ylabel="error r0")

perc_ax.plot(timestamps, error_perc(actual, preds), 'bx')
perc_ax.set(xlabel="Datetime", ylabel="perc error r0")
plt.show()

#### feature importance

feature importance from the model

In [None]:
%matplotlib inline
def plot_importance(forest, X, featnames):
    """Print and plot a forest's feature importances, highest first.

    Parameters
    ----------
    forest : fitted ensemble exposing ``feature_importances_`` and ``estimators_``.
    X : array-like; only ``X.shape[1]`` (the number of features) is read.
    featnames : sequence of feature names, indexed by column position.

    The error bars are the standard deviation of the per-tree importances.
    """
    n_features = X.shape[1]
    importances = forest.feature_importances_
    per_tree = [tree.feature_importances_ for tree in forest.estimators_]
    spread = np.std(per_tree, axis=0)
    # Column positions ordered from most to least important.
    order = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for pos in range(n_features):
        col = order[pos]
        print(f"{pos + 1}. {featnames[col]:20} ({importances[col]})")

    # Plot the impurity-based feature importances of the forest
    bar_positions = range(n_features)
    plt.figure()
    plt.title("Feature importances")
    plt.bar(bar_positions, importances[order],
            color="r", yerr=spread[order], align="center")
    plt.xticks(bar_positions, [featnames[i] for i in order], rotation='vertical')
    plt.xlim([-1, n_features])
    plt.show()

In [None]:
# Rank and plot the importances of the fitted forest; the training frame is
# passed only so the function can read its number of feature columns.
plot_importance(regr, df_subset.loc[train&valid,feats], feats)

#### debug why the feature importance of Cn2 is so low

1. What happens if I drop month and SZA (solar zenith angle)?
2. Let's look at correlation between the signals using [stats.pearsonr](https://towardsdatascience.com/four-ways-to-quantify-synchrony-between-time-series-data-b99136c4a9c9)

We calculate:
- overall synchrony between r0 and Cn2
- local synchrony between r0 and Cn2

#### Overall Synchrony

In [None]:
# Overall Pearson correlation between r0 and Cn2 over the valid training rows.
# Select exactly the two series being compared: taking .iloc[0, 1] of the
# correlation matrix over `feats` reported pressure vs. relative_humidity (the
# first two feature columns), and `feats` does not contain the label at all.
overall_pearson_r = df_subset.loc[train&valid, [label, 'Cn2']].corr().iloc[0, 1]
print(f"Pandas computed Pearson r: {overall_pearson_r}")

# Cross-check with scipy, which also returns the p-value.
r, p = stats.pearsonr(df_subset.loc[train&valid,label], df_subset.loc[train&valid,'Cn2'])
print(f"Scipy computed Pearson r: {r} and p-value: {p}")
# # out: Scipy computed Pearson r: 0.20587745135619354 and p-value: 3.7902989479463397e-51

# Plot both series on a shared time axis for visual comparison
# (the rolling-window correlation is computed in the Local Synchrony cell).
f,ax=plt.subplots(2, 1, figsize=(7,3), sharex=True)
ax[0].plot(df_subset.loc[train&valid,label], label=label)
ax[1].plot(df_subset.loc[train&valid,['Cn2']], label='Cn2')
ax[0].set(ylabel='r0')
ax[1].set(ylabel='Cn2')
# ax[1].set(title=f"Overall Pearson r = {np.round(r,2)}");

#### Local Synchrony

In [None]:
# Set window size (in rows) to compute moving window synchrony.
r_window_size = 120
# The two series being compared, restricted to the valid training rows.
r0_train = df_subset.loc[train&valid, label]
cn2_train = df_subset.loc[train&valid, 'Cn2']
# Compute rolling window synchrony: Pearson r inside a centered window.
rolling_r = r0_train.rolling(window=r_window_size, center=True).corr(cn2_train)
f,ax=plt.subplots(3,1,figsize=(14,6),sharex=True)
ax[0].plot(r0_train, label=label)
ax[1].plot(cn2_train, label='Cn2')
rolling_r.plot(ax=ax[2])
ax[0].set(ylabel='r0')
ax[1].set(ylabel='Cn2')
ax[2].set(ylabel='Pearson r')
# The previous title ("Smiling data ...") was carried over from the tutorial
# this cell is based on and did not describe the plotted series.
plt.suptitle("r0, Cn2 and rolling window correlation")