### import statements

In [None]:
%matplotlib inline
import sys
import os
import pandas as pd
sys.path.append('../src')
import datetime
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pickle

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from plot_utils import *

### constants

In [None]:
# Path of the HDF5 input file.
F_INPUT = '../../data/v2/data.h5'

# Names of the r0 source columns ("ds" stands for data source):
# all samples, day only, night only.
ds_label_all, ds_label_day, ds_label_ngt = 'r0', 'r0_day', 'r0_night'

In [None]:
# Date bounds of the relatively dense part of the data set; rows outside
# this window are dropped before resampling and training.
DENSE_SUBSET_START, DENSE_SUBSET_END = '2018-05-03', '2020-12-30'

In [None]:
# Smoothing window lengths, in minutes.
smooth_mins = [1, 2, 5, 10, 15, 20]

# Short key -> prefix of the smoothed r0 columns (adn = all, day, ngt).
r0_map = {key: 'r0_' + key for key in ('all', 'day', 'ngt')}

In [None]:
# Input columns that do not include logCn2.
feats_mnus_cn2 = [
    'pressure',
    'relative_humidity',
    'temperature',
    'wind_speed',
    'solar_zenith_angle',
    'dayofyear',
    'hour',
]
# The same inputs with logCn2 appended as the last feature.
feats_plus_cn2 = [*feats_mnus_cn2, 'logCn2']

# Feature sets to evaluate, keyed by name.  The variant that also runs the
# logCn2-free set is kept here for reference but currently disabled:
# feats_map = {'feats_plus_cn2': feats_plus_cn2, 'feats_mnus_cn2': feats_mnus_cn2}
feats_map = dict(feats_plus_cn2=feats_plus_cn2)

In [None]:
# xmin, ymin = 0, 0
# xmax, ymax = 20, 20

### read data

In [None]:
# Load the object stored under the 'merged' key of the HDF5 input file.
df = pd.read_hdf(F_INPUT, key='merged')

### add new features

In [None]:
# Calendar features are read off the index, so df must carry a DatetimeIndex.
sample_times = df.index
df['dayofyear'] = sample_times.dayofyear
df['hour'] = sample_times.hour

# Base-10 logarithm of Cn2, used as a model input in place of the raw value.
# NOTE(review): a zero Cn2 would yield -inf, which isnull() does not flag --
# confirm the input never contains non-positive Cn2.
df['logCn2'] = np.log10(df['Cn2'])

### applying smoothing

In [None]:
# Source column -> prefix of the smoothed columns derived from it.  The
# prefixes are taken from r0_map so that the columns created here are
# exactly the ones that are later looked up through r0_map.
smoothed_prefix = {
    ds_label_all: r0_map['all'],
    ds_label_day: r0_map['day'],
    ds_label_ngt: r0_map['ngt'],
}

for m in smooth_mins:
    for ds_label, prefix in smoothed_prefix.items():
        # Column names keep the '<prefix>_<m>T' form (e.g. 'r0_all_5T').
        label = '{}_{}T'.format(prefix, m)
        if m == 1:
            # The 1-minute variant is stored as an unsmoothed copy of the source.
            df[label] = df[ds_label]
        else:
            # Time-based rolling mean over an m-minute window.  The window is
            # spelled with 'min' because the 'T' offset alias is deprecated
            # in recent pandas releases; both denote minutes.
            df[label] = df[ds_label].rolling('{}min'.format(m)).mean()

In [None]:
# Display the column index of df.
df.columns

In [None]:
# Display summary statistics of df.
df.describe()

### restricting data to usable, relatively dense subset

In [None]:
# Keep only rows strictly between the two dense-subset bounds.
after_start = df.index > DENSE_SUBSET_START
before_end = df.index < DENSE_SUBSET_END
df_subset = df[after_start & before_end]
df_subset.describe()

### resampling back down to 5 mins

In [None]:
# Group the rows into 5-minute bins and take the per-column median of each bin.
five_min_bins = df_subset.resample('5 min')
df_subset_resamp5 = five_min_bins.median()
df_subset_resamp5.describe()

In [None]:
# Persist the resampled frame.  The key is passed by keyword: positional
# arguments other than the path are deprecated for DataFrame.to_hdf.
df_subset_resamp5.to_hdf('df_subset_resamp5.r0.h5', key='df_subset_resamp5')

# From here on df_subset refers to the 5-minute resampled data.
df_subset = df_subset_resamp5

### splitting into train and test sets

In [None]:
# Boolean row masks over df_subset: everything up to split_date is used for
# training, everything after it for testing.
# NOTE(review): a bare date string compares as midnight, so samples later on
# 2019-12-31 fall into the test mask -- confirm that is intended.
split_date = '2019-12-31'
row_times = df_subset.index
train, test = row_times <= split_date, row_times > split_date

### creating non-nan masks

In [None]:
# valid_masks[feature-set][r0 key][window] is a boolean Series over df_subset
# that is True where every feature column and the target column are non-null.
valid_masks = {}
for feat_key, feat_cols in feats_map.items():
    by_r0 = valid_masks[feat_key] = {}
    for r0_key, r0_prefix in r0_map.items():
        by_window = by_r0[r0_key] = {}
        for m in smooth_mins:
            label = '{}_{}T'.format(r0_prefix, str(m))
            needed_cols = feat_cols + [label]
            row_ok = df_subset[needed_cols].notnull().all(axis=1)
            by_window[m] = row_ok
            # Report: total rows, usable training rows, usable test rows.
            print("{} {} {:2} {:7} {:7} {:7}".format(
                feat_key, r0_key, m,
                row_ok.shape[0],
                np.sum(train & row_ok),
                np.sum(test & row_ok)))

In [None]:
# Save the nested mask dictionary so it can be reloaded without recomputing.
with open('valid_masks.r0.pkl', 'wb') as mask_file:
    pickle.Pickler(mask_file).dump(valid_masks)

### train and test subroutine

In [None]:
# Shared estimator used by train_and_test when no model is supplied.  The
# fixed random_state keeps repeated runs reproducible.
regr = RandomForestRegressor(n_estimators=100, random_state=0)


def train_and_test(train_df, test_df, feats, label, model=None):
    """Fit a regressor on the training rows and score it on the test rows.

    Parameters
    ----------
    train_df, test_df : DataFrame
        Frames containing the ``feats`` columns and the ``label`` column.
    feats : list of str
        Names of the input columns.
    label : str
        Name of the target column.
    model : estimator, optional
        Object with ``fit`` and ``predict`` methods.  When omitted, the
        module-level ``regr`` is used; it is refit in place on every call,
        so afterwards it holds the most recently trained model.

    Returns
    -------
    dict
        ``'preds'``: predictions for ``test_df``;
        ``'r2'``: coefficient of determination;
        ``'sq_err'``: mean squared error;
        ``'perc_err'``: mean absolute percentage error as computed by
        scikit-learn (a fraction, not scaled to 0-100).
    """
    estimator = regr if model is None else model
    estimator.fit(train_df[feats], train_df[label])

    truth = test_df[label]
    preds = estimator.predict(test_df[feats])
    return {
        'preds': preds,
        'r2': r2_score(truth, preds),
        'sq_err': mean_squared_error(truth, preds),
        'perc_err': mean_absolute_percentage_error(truth, preds),
    }


### Get All Results

In [None]:
# results[feature-set][r0 key][window] holds the dict returned by
# train_and_test for that combination.
results = {}
for feat_key, feats in feats_map.items():
    results[feat_key] = {}
    for r0_key, r0_prefix in r0_map.items():
        results[feat_key][r0_key] = {}
        for m in smooth_mins:
            print("Progress: {} {} {}".format(feat_key, r0_key, m))
            label = '{}_{}T'.format(r0_prefix, str(m))
            valid = valid_masks[feat_key][r0_key][m]
            train_rows = df_subset.loc[train & valid]
            test_rows = df_subset.loc[test & valid]
            results[feat_key][r0_key][m] = train_and_test(train_rows, test_rows, feats, label)
            # Rewrite the results file after every fit so that finished
            # combinations are on disk even if a later one is interrupted.
            with open('results.r0.pkl', 'wb') as results_file:
                pickle.dump(results, results_file)

### Compare all Results

In [None]:
# for r in [ results_all_1T, results_all_2T, results_all_5T, results_all_10T, results_all_15T, results_all_20T ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# for r in [ results_day_1T, results_day_2T, results_day_5T, results_day_10T, results_day_15T, results_day_20T ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# for r in [ results_night_1T, results_night_2T, results_night_5T, results_night_10T, results_night_15T, results_night_20T ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# scatter_with_errors(test_truth_day, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

In [None]:
# scatter_with_errors(test_truth_day, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

In [None]:
# error_by_r0_histograms(test_truth_all, error_perc(test_truth_all, test_preds_all), 0, 80)
# error_by_r0_histograms(test_truth_day, error_perc(test_truth_day, test_preds_day), 0, 80)
# error_by_r0_histograms(test_truth_night, error_perc(test_truth_night, test_preds_night), 0, 80)

In [None]:
# test_truth_night = df_subset.loc[test&valid_night,label_night]
# test_truth_day = df_subset.loc[test&valid_day,label_day]
# test_truth_all = df_subset.loc[test&valid,label]

In [None]:
# test_truth_night_15T = df_subset.loc[test&valid_night,'r0_night_15T']
# test_pred_night_15T = results_night_10T['preds']
# error_by_r0_histograms(test_truth_night_15T, error_perc(test_truth_night_15T, test_pred_night_15T), 0, 80)
# scatter_with_errors(test_truth_night_15T, test_pred_night_15T, error_perc, xmin, xmax, ymin, ymax)

In [None]:
# test_truth_day_15T = df_subset.loc[test&valid_day,'r0_day_15T']
# test_pred_day_15T = results_day_15T['preds']
# error_by_r0_histograms(test_truth_day_15T, error_perc(test_truth_day_15T, test_pred_day_15T), 0, 90)
# scatter_with_errors(test_truth_day_15T, test_pred_day_15T, error_perc, xmin, xmax, ymin, ymax)

### Plotting

#### scatter plots of actual vs. predicted values using error_diff

In [None]:
# %matplotlib inline
# scatter_with_errors(test_truth_all, test_preds_all, error_perc, xmin, xmax, ymin, ymax)

# scatter_with_errors(test_truth_night, test_preds_night, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_night,feats], feats)

# scatter_with_errors(test_truth_night, test_preds_night, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_night,feats], feats)

#### interactive time domain plot of errors

Switching matplotlib to notebook mode makes it possible to zoom in on different portions of the time axis.

In [None]:
# %matplotlib notebook 
# plot_errors_in_time(test_truth_all, test_preds_all)

#### feature importance

feature importance from the model

In [None]:
# %matplotlib inline

In [None]:
# plot_importance(regr, df_subset.loc[train&valid,feats], feats)

## debug why CN2 is so low

#### What happens if I drop month and SZA

Answer: turns out we had to take the log of CN2

In [None]:
# feats_no_sza = ['pressure', 'relative_humidity', 'temperature', 'wind_speed', 'logCn2']
# preds_all_no_sza, r2_all_no_sza = train_and_test(df_subset.loc[train & valid], df_subset.loc[test & valid], feats_no_sza, label)
# scatter_with_errors(test_truth_all, preds_all_no_sza, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid,feats_no_sza], feats_no_sza)
# r2_all_no_sza

#### correlation between the signals using [stats.pearsonr](https://towardsdatascience.com/four-ways-to-quantify-synchrony-between-time-series-data-b99136c4a9c9)

We calculate:
- overall synchrony between r0 and Cn2
- local synchrony between r0 and Cn2

#### Overall Synchrony

In [None]:
    # r, p = print_pearsonr(df_subset.loc[train&valid,label], df_subset.loc[train&valid,'logCn2'])
# plot_overall_synchrony(df_subset.loc[train&valid,label], df_subset.loc[train&valid,['logCn2']], label, 'logCn2', r)

#### Local Synchrony

In [None]:
# plot_local_synchrony(df_subset.loc[train&valid,label], df_subset.loc[train&valid,['logCn2']], label, 'Cn2')

## Synchrony using only R0 daytime data

#### Overall Synchrony

In [None]:
# r, p = print_pearsonr(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,'logCn2'])
# r

In [None]:
# plot_overall_synchrony(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,['logCn2']], label_day, 'logCn2', r)

#### Local Synchrony

In [None]:
# plot_local_synchrony(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,['logCn2']], label_day, 'logCn2')

## Try comparing results without CN2

In [None]:
# results_all_15T_nocn2 = train_and_test(df_subset.loc[train & valid], df_subset.loc[test & valid], feats_minus_cn2, 'r0_15T')

In [None]:
# results_day_15T_nocn2 = train_and_test(df_subset.loc[train & valid_day], df_subset.loc[test & valid_day], feats_minus_cn2, 'r0_day_15T')

In [None]:
# results_night_15T_nocn2 = train_and_test(df_subset.loc[train & valid_night], df_subset.loc[test & valid_night], feats_minus_cn2, 'r0_night_15T')

In [None]:
# for r in [ results_all_15T, results_all_15T_nocn2 ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# for r in [ results_day_15T, results_day_15T_nocn2 ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# for r in [ results_night_15T, results_night_15T_nocn2 ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# plot_importance(results_all_15T['forest'], df_subset.loc[train & valid, feats], feats)

In [None]:
# test_truth_night_15T = df_subset.loc[test&valid_night,'r0_night_15T']
# test_pred_night_15T = results_night_10T['preds']
# error_by_r0_histograms(test_truth_night_15T, error_perc(test_truth_night_15T, test_pred_night_15T), 0, 80)
# scatter_with_errors(test_truth_night_15T, test_pred_night_15T, error_perc, xmin, xmax, ymin, ymax)

## Why is R2 so weird?

In [None]:
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night'])))
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night_2T'])))
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night_5T'])))
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night_10T'])))
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night_15T'])))
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night_20T'])))

## Plot One Day

In [None]:
# def plot_one_day(preds, smooth_df, orig_df, startdate, enddate):

#     preds_df = pd.DataFrame(preds, index=smooth_df.index) 

#     daymask = (orig_df.index > startdate ) & (orig_df.index < enddate)

#     plt.figure(figsize=(20, 5))
#     plt.plot(orig_df[daymask], 'g.', label='r0')
#     plt.plot(smooth_df[daymask], 'b.', label='r0 smoothed (15min)')
#     plt.plot(preds_df[daymask], 'r.', label='preds')
#     plt.ylabel('r0')
#     plt.legend()
#     return

In [None]:
# plot_one_day(results_all_15T['preds'], df_subset.loc[test & valid, 'r0_15T'], df_subset.loc[test & valid, 'r0'], '2020-05-30 04:00', '2020-05-31')


In [None]:
# plot_one_day(results_all_15T['preds'], df_subset.loc[test & valid, 'r0_15T'], df_subset.loc[test & valid, 'r0'], '2020-08-30 04:00', '2020-08-31 00:00')

In [None]:
# plot_one_day(results_all_15T['preds'], df_subset.loc[test & valid, 'r0_15T'], df_subset.loc[test & valid, 'r0'], '2020-02-15 04:00', '2020-02-16 00:00')

In [None]:
# plot_one_day(results_all_15T['preds'], df_subset.loc[test & valid, 'r0_15T'], df_subset.loc[test & valid, 'r0'], '2020-11-15 04:00', '2020-11-16 00:00')