### import statements

In [1]:
%matplotlib inline
import sys
import os
import pandas as pd
sys.path.append('../src')
import datetime
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pickle

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from plot_utils import *

### constants

In [3]:
# HDF5 file that the notebook reads its input table from.
F_INPUT = '../../data/v2/data.h5'

# "ds" stands for data source: names of the raw r0 columns for all / day / night.
ds_label_all, ds_label_day, ds_label_ngt = 'r0', 'r0_day', 'r0_night'

In [4]:
# Date bounds (as strings) used to cut the data down to its relatively dense portion.
DENSE_SUBSET_START, DENSE_SUBSET_END = '2018-05-03', '2020-12-30'

In [5]:
# Smoothing window lengths, in minutes, applied to each r0 series.
smooth_mins = [1, 2, 5, 10, 15, 20]

# adn = all, day, ngt: short subset key -> prefix of the smoothed r0 column names.
r0_map = {subset: 'r0_' + subset for subset in ('all', 'day', 'ngt')}

In [6]:
# Feature columns without the Cn2-derived feature.
feats_mnus_cn2 = [
    'pressure',
    'relative_humidity',
    'temperature',
    'wind_speed',
    'solar_zenith_angle',
    'dayofyear',
    'hour',
]
# Same features with log10(Cn2) appended.
feats_plus_cn2 = [*feats_mnus_cn2, 'logCn2']
# Feature-set name -> list of feature columns; iterated in this order by later cells.
feats_map = dict(feats_plus_cn2=feats_plus_cn2, feats_mnus_cn2=feats_mnus_cn2)

In [7]:
# xmin, ymin = 0, 0
# xmax, ymax = 20, 20

### read data

In [8]:
# Load the table stored under the 'merged' key of the input HDF5 file.
df = pd.read_hdf(F_INPUT, key='merged')

### add new features

In [9]:
# Derive calendar features from the index and a log-scaled Cn2 column.
# NOTE(review): assumes df.index is a DatetimeIndex (it exposes .dayofyear/.hour) — confirm.
df = df.assign(
    dayofyear=df.index.dayofyear,
    hour=df.index.hour,
    logCn2=np.log10(df['Cn2']),
)

### applying smoothing

In [10]:
# Add one smoothed copy of each raw r0 column per window length.
# Output columns are named '<prefix>_<m>T' (e.g. 'r0_all_5T', 'r0_ngt_15T'); the
# mapping below ties each raw source column to the prefix of its smoothed columns,
# replacing string comparisons against hard-coded source names.
smoothed_prefix = {
    ds_label_all: 'r0_all',
    ds_label_day: 'r0_day',
    ds_label_ngt: 'r0_ngt',
}
for m in smooth_mins:
    for ds_label, prefix in smoothed_prefix.items():
        label = '{}_{}T'.format(prefix, m)
        if m == 1:
            # A 1-minute window is stored as a plain copy of the raw series.
            df[label] = df[ds_label]
        else:
            # Time-based rolling mean over the last m minutes. The window is spelled
            # with the 'min' alias (as in the resample cell) because the 'T' alias is
            # deprecated in recent pandas; only the column label keeps the 'T' suffix.
            df[label] = df[ds_label].rolling('{}min'.format(m)).mean()

In [11]:
# List the columns to check that the smoothed r0_* columns were added.
df.columns

Index(['o(I)_I', 'r0', 'solar_zenith_angle', 'r0_day', 'r0_night', 'pressure',
       'relative_humidity', 'temperature', 'wind_direction', 'wind_speed',
       'Cn2', 'dayofyear', 'hour', 'logCn2', 'r0_all_1T', 'r0_day_1T',
       'r0_ngt_1T', 'r0_all_2T', 'r0_day_2T', 'r0_ngt_2T', 'r0_all_5T',
       'r0_day_5T', 'r0_ngt_5T', 'r0_all_10T', 'r0_day_10T', 'r0_ngt_10T',
       'r0_all_15T', 'r0_day_15T', 'r0_ngt_15T', 'r0_all_20T', 'r0_day_20T',
       'r0_ngt_20T'],
      dtype='object')

In [12]:
# Summary statistics for every column of the full table, smoothed r0_* series included.
df.describe()

Unnamed: 0,o(I)_I,r0,solar_zenith_angle,r0_day,r0_night,pressure,relative_humidity,temperature,wind_direction,wind_speed,...,r0_ngt_5T,r0_all_10T,r0_day_10T,r0_ngt_10T,r0_all_15T,r0_day_15T,r0_ngt_15T,r0_all_20T,r0_day_20T,r0_ngt_20T
count,342443.0,483758.0,483758.0,342443.0,141315.0,1740394.0,1653058.0,1653063.0,1783564.0,1783640.0,...,258373.0,688752.0,417620.0,271132.0,715555.0,436672.0,278883.0,736339.0,451326.0,285013.0
mean,0.001379,4.729702,71.75721,3.651714,7.341948,774.6396,35.42895,11.03738,190.8795,2.313792,...,7.335459,5.099589,3.656218,7.322788,5.086838,3.664591,7.313778,5.081379,3.675858,7.307061
std,0.00045,2.933261,41.077,2.356312,2.521823,4.565213,24.07449,7.99899,82.32246,1.7204,...,2.474673,3.120671,2.624671,2.444669,3.124712,2.65492,2.426685,3.12282,2.670015,2.413242
min,0.000102,0.821452,10.943276,0.821452,2.3674,751.6,1.0,-13.0,1.45,0.035,...,2.3674,0.821452,0.821452,2.3674,0.821452,0.821452,2.3674,0.821452,0.821452,2.3674
25%,0.00106,2.571765,42.658571,2.299625,5.5002,772.1,19.0,5.4,140.1579,1.185,...,5.532833,2.712008,2.322928,5.543217,2.702915,2.322861,5.551622,2.700245,2.324379,5.560275
50%,0.001353,3.801627,57.746335,3.012749,7.0577,775.3,29.0,11.1,187.9474,1.845,...,7.072533,4.245727,2.968483,7.074229,4.221276,2.96606,7.076489,4.212579,2.96829,7.076489
75%,0.001649,6.273073,112.630556,4.262205,8.9089,778.0,45.0,17.4,255.6842,2.865,...,8.888133,6.9242,4.122536,8.87152,6.9153,4.123222,8.856551,6.913749,4.133728,8.85105
max,0.0035,86.039498,169.053912,86.039498,24.8577,785.6,110.0,31.8,360.0,31.96667,...,24.8577,86.039498,86.039498,22.07645,86.039498,86.039498,22.07645,86.039498,86.039498,20.808942


### restricting data to usable, relatively dense subset

In [13]:
# Keep only rows strictly between the two dense-subset bounds (both comparisons are exclusive).
after_start = df.index > DENSE_SUBSET_START
before_end = df.index < DENSE_SUBSET_END
df_subset = df[after_start & before_end]
df_subset.describe()

Unnamed: 0,o(I)_I,r0,solar_zenith_angle,r0_day,r0_night,pressure,relative_humidity,temperature,wind_direction,wind_speed,...,r0_ngt_5T,r0_all_10T,r0_day_10T,r0_ngt_10T,r0_all_15T,r0_day_15T,r0_ngt_15T,r0_all_20T,r0_day_20T,r0_ngt_20T
count,273728.0,413091.0,413091.0,273728.0,139363.0,1252046.0,1164787.0,1164792.0,1280385.0,1280461.0,...,254844.0,598586.0,331070.0,267516.0,620687.0,345468.0,275219.0,637842.0,356538.0,281304.0
mean,0.001382,4.87189,74.69819,3.607425,7.355473,774.4021,35.32743,11.04439,174.7172,2.327763,...,7.350375,5.288679,3.633298,7.33733,5.277392,3.643771,7.327991,5.272362,3.655958,7.321068
std,0.000448,2.995811,43.19205,2.357307,2.525811,4.687234,24.06511,7.993629,67.63609,1.770945,...,2.478042,3.153837,2.647643,2.448015,3.154897,2.675779,2.430055,3.152587,2.692277,2.416628
min,0.000102,0.821452,10.943276,0.821452,2.3674,751.6,1.0,-13.0,2.05,0.035,...,2.3674,0.821452,0.821452,2.3674,0.821452,0.821452,2.3674,0.821452,0.821452,2.3674
25%,0.001066,2.599046,42.089154,2.273231,5.5091,771.8,19.0,5.2,116.3684,1.163158,...,5.5447,2.77462,2.302938,5.557414,2.76612,2.30335,5.56428,2.763078,2.305506,5.5714
50%,0.00136,3.9694,58.682654,2.96627,7.0666,775.1,29.0,11.1,183.55,1.826316,...,7.0844,4.590917,2.927821,7.087367,4.576505,2.924869,7.089344,4.567925,2.926963,7.08885
75%,0.001653,6.553345,118.005308,4.196203,8.9267,777.9,45.0,17.5,209.0526,2.88,...,8.905933,7.21256,4.079838,8.889617,7.207022,4.084673,8.876267,7.209,4.099386,8.869192
max,0.0035,86.039498,169.053912,86.039498,24.8577,783.8,110.0,31.8,360.0,31.96667,...,24.8577,86.039498,86.039498,22.07645,86.039498,86.039498,22.07645,86.039498,86.039498,20.808942


### resampling back down to 5 mins

In [14]:
# Downsample the dense subset into 5-minute bins, taking the per-bin median of each column.
# NOTE(review): the later cells build their masks and training sets from df_subset,
# not from this resampled frame — confirm whether df_subset_resamp5 was meant to be used.
five_minute_bins = df_subset.resample('5 min')
df_subset_resamp5 = five_minute_bins.median()
df_subset_resamp5.describe()

Unnamed: 0,o(I)_I,r0,solar_zenith_angle,r0_day,r0_night,pressure,relative_humidity,temperature,wind_direction,wind_speed,...,r0_ngt_5T,r0_all_10T,r0_day_10T,r0_ngt_10T,r0_all_15T,r0_day_15T,r0_ngt_15T,r0_all_20T,r0_day_20T,r0_ngt_20T
count,61816.0,112767.0,112767.0,61816.0,50951.0,256360.0,239076.0,239077.0,256372.0,256386.0,...,53106.0,123307.0,68556.0,54751.0,126881.0,70871.0,56010.0,129839.0,72747.0,57092.0
mean,0.001417,5.28266,84.396171,3.577739,7.351144,774.40145,35.501078,11.022659,174.671009,2.320158,...,7.339237,5.274953,3.634919,7.328508,5.270614,3.649666,7.321646,5.270526,3.665283,7.315937
std,0.00048,3.156839,44.883292,2.572347,2.494635,4.685693,24.264907,8.002353,65.553669,1.746939,...,2.480251,3.183171,2.713971,2.451988,3.181179,2.735333,2.434412,3.175887,2.74655,2.420576
min,0.000103,0.821452,10.945537,0.821452,2.3674,752.1,1.0,-13.0,6.0,0.1,...,2.3674,0.821452,0.821452,2.3674,0.821452,0.821452,2.3674,0.821452,0.821452,2.3674
25%,0.0011,2.742663,46.747923,2.249879,5.5358,771.8,19.0,5.2,117.1,1.17,...,5.54025,2.756131,2.286729,5.547667,2.754904,2.291331,5.55805,2.755582,2.297297,5.564725
50%,0.001391,4.598131,67.380905,2.89097,7.0844,775.1,29.0,11.1,183.7,1.82,...,7.081433,4.5657,2.916605,7.082917,4.561744,2.918465,7.084845,4.56389,2.922754,7.085142
75%,0.001665,7.21345,124.278989,4.056879,8.91335,777.8,45.0,17.5,207.526316,2.866667,...,8.9,7.204868,4.077276,8.885463,7.206033,4.088772,8.8733,7.207808,4.106606,8.865285
max,0.0035,81.041189,169.04853,81.041189,23.229,783.8,110.0,31.5,353.789474,20.285,...,22.07645,77.232305,77.232305,22.07645,77.232305,77.232305,22.07645,77.232305,77.232305,20.613085


### taking the train and test

In [15]:
# Chronological split: rows at or before split_date train, later rows test.
# NOTE(review): the date string presumably compares as 2019-12-31 00:00, which would put
# rows later on that day into `test` — confirm this boundary is intended.
split_date = '2019-12-31'
subset_times = df_subset.index
train = subset_times <= split_date
test = subset_times > split_date

### creating non-nan masks

In [16]:
# valid_masks[feature_set][subset][minutes] -> boolean mask over df_subset that is True
# where every feature column and the matching smoothed r0 target are all non-null.
valid_masks = {}
for feat_key, feat_cols in feats_map.items():
    masks_for_feats = valid_masks[feat_key] = {}
    for r0_key, r0_prefix in r0_map.items():
        masks_for_r0 = masks_for_feats[r0_key] = {}
        for mins in smooth_mins:
            target_col = '{}_{}T'.format(r0_prefix, str(mins))
            required_cols = feat_cols + [target_col]
            complete = df_subset[required_cols].notna().all(axis=1)
            masks_for_r0[mins] = complete
            # Report how many usable rows fall on each side of the train/test split.
            n_train = np.sum(train & complete)
            n_test = np.sum(test & complete)
            print("{} {} {:2} {:7} {:7}".format(feat_key, r0_key, mins, n_train, n_test))

feats_plus_cn2 all  1  251681   98859
feats_plus_cn2 all  2  251681   98859
feats_plus_cn2 all  5  251681   98859
feats_plus_cn2 all 10  251681   98859
feats_plus_cn2 all 15  251681   98859
feats_plus_cn2 all 20  251681   98859
feats_plus_cn2 day  1  174044   53074
feats_plus_cn2 day  2  174044   53074
feats_plus_cn2 day  5  174044   53074
feats_plus_cn2 day 10  174044   53074
feats_plus_cn2 day 15  174044   53074
feats_plus_cn2 day 20  174044   53074
feats_plus_cn2 ngt  1   77637   45785
feats_plus_cn2 ngt  2   77637   45785
feats_plus_cn2 ngt  5   77637   45785
feats_plus_cn2 ngt 10   77637   45785
feats_plus_cn2 ngt 15   77637   45785
feats_plus_cn2 ngt 20   77637   45785
feats_mnus_cn2 all  1  277105  103478
feats_mnus_cn2 all  2  277105  103478
feats_mnus_cn2 all  5  277105  103478
feats_mnus_cn2 all 10  277105  103478
feats_mnus_cn2 all 15  277105  103478
feats_mnus_cn2 all 20  277105  103478
feats_mnus_cn2 day  1  197939   56874
feats_mnus_cn2 day  2  197939   56874
feats_mnus_c

### train and test subroutine

In [17]:
def train_and_test(train_df, test_df, feats, label, n_estimators=100, random_state=0, n_jobs=None):
    """Fit a random-forest regressor on ``train_df`` and evaluate it on ``test_df``.

    Parameters
    ----------
    train_df, test_df
        Tables indexed as ``frame[feats]`` (features) and ``frame[label]`` (target).
    feats : list
        Names of the feature columns.
    label : str
        Name of the target column.
    n_estimators : int, default 100
        Number of trees, forwarded to ``RandomForestRegressor``.
    random_state : int, default 0
        Seed forwarded to ``RandomForestRegressor``.
    n_jobs : int or None, default None
        Parallelism forwarded to ``RandomForestRegressor``; the default keeps the
        estimator's own default.

    Returns
    -------
    dict
        ``'forest'``: the object returned by ``fit``; ``'preds'``: predictions for
        ``test_df``; ``'r2'``, ``'sq_err'``, ``'perc_err'``: ``r2_score``,
        ``mean_squared_error`` and ``mean_absolute_percentage_error`` of those
        predictions against ``test_df[label]``.
    """
    regr = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, n_jobs=n_jobs)
    forest = regr.fit(train_df[feats], train_df[label])
    truth = test_df[label]
    preds = regr.predict(test_df[feats])
    return {
        'forest': forest,
        'preds': preds,
        'r2': r2_score(truth, preds),
        'sq_err': mean_squared_error(truth, preds),
        'perc_err': mean_absolute_percentage_error(truth, preds),
    }

### Get All Results

In [None]:
# Train and evaluate one model per (feature set, r0 subset, smoothing window):
# results[feature_set][subset][minutes] -> dict returned by train_and_test.
results_path = 'data.merged.smoothed.resampled.pkl'
results = {}
for f in feats_map.keys():
    results[f] = {}
    for r in r0_map.keys():
        results[f][r] = {}
        for m in smooth_mins:
            print("Progress: {} {} {}".format(f, r, m))
            label = '{}_{}T'.format(r0_map[r], str(m))
            feats = feats_map[f]
            valid = valid_masks[f][r][m]
            results[f][r][m] = train_and_test(df_subset.loc[train & valid], df_subset.loc[test & valid], feats, label)
            # Checkpoint after every model so an interrupted run keeps what it finished.
            # Write to a temporary file and rename it over the target, so an interruption
            # in the middle of a dump cannot corrupt the previous checkpoint.
            tmp_path = results_path + '.tmp'
            with open(tmp_path, 'wb') as fh:
                pickle.dump(results, fh)
            os.replace(tmp_path, results_path)

Progress: feats_plus_cn2 all 1


### Save File

In [None]:
# Write the complete results dict to disk (same file the training loop checkpoints to).
with open('data.merged.smoothed.resampled.pkl', 'wb') as fh:
    pickle.dump(results, fh)

### Compare all Results

In [None]:
# for r in [ results_all_1T, results_all_2T, results_all_5T, results_all_10T, results_all_15T, results_all_20T ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# for r in [ results_day_1T, results_day_2T, results_day_5T, results_day_10T, results_day_15T, results_day_20T ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# for r in [ results_night_1T, results_night_2T, results_night_5T, results_night_10T, results_night_15T, results_night_20T ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# scatter_with_errors(test_truth_day, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

In [None]:
# scatter_with_errors(test_truth_day, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

In [None]:
# error_by_r0_histograms(test_truth_all, error_perc(test_truth_all, test_preds_all), 0, 80)
# error_by_r0_histograms(test_truth_day, error_perc(test_truth_day, test_preds_day), 0, 80)
# error_by_r0_histograms(test_truth_night, error_perc(test_truth_night, test_preds_night), 0, 80)

In [None]:
# test_truth_night = df_subset.loc[test&valid_night,label_night]
# test_truth_day = df_subset.loc[test&valid_day,label_day]
# test_truth_all = df_subset.loc[test&valid,label]

In [None]:
# test_truth_night_15T = df_subset.loc[test&valid_night,'r0_night_15T']
# test_pred_night_15T = results_night_10T['preds']
# error_by_r0_histograms(test_truth_night_15T, error_perc(test_truth_night_15T, test_pred_night_15T), 0, 80)
# scatter_with_errors(test_truth_night_15T, test_pred_night_15T, error_perc, xmin, xmax, ymin, ymax)

In [None]:
# test_truth_day_15T = df_subset.loc[test&valid_day,'r0_day_15T']
# test_pred_day_15T = results_day_15T['preds']
# error_by_r0_histograms(test_truth_day_15T, error_perc(test_truth_day_15T, test_pred_day_15T), 0, 90)
# scatter_with_errors(test_truth_day_15T, test_pred_day_15T, error_perc, xmin, xmax, ymin, ymax)

### Plotting

#### scatter plots of actual vs. predicted using error_diff

In [None]:
# %matplotlib inline
# scatter_with_errors(test_truth_all, test_preds_all, error_perc, xmin, xmax, ymin, ymax)

# scatter_with_errors(test_truth_night, test_preds_night, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_night,feats], feats)

# scatter_with_errors(test_truth_night, test_preds_night, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_night,feats], feats)

#### interactive time domain plot of errors

Switching matplotlib to notebook mode to enable zooming in on different portions of the time axis.

In [None]:
# %matplotlib notebook 
# plot_errors_in_time(test_truth_all, test_preds_all)

#### feature importance

feature importance from the model

In [None]:
# %matplotlib inline

In [None]:
# plot_importance(regr, df_subset.loc[train&valid,feats], feats)

## debug why CN2 is so low

#### What happens if I drop month and SZA?

Answer: it turns out we had to take the log of CN2.

In [None]:
# feats_no_sza = ['pressure', 'relative_humidity', 'temperature', 'wind_speed', 'logCn2']
# preds_all_no_sza, r2_all_no_sza = train_and_test(df_subset.loc[train & valid], df_subset.loc[test & valid], feats_no_sza, label)
# scatter_with_errors(test_truth_all, preds_all_no_sza, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid,feats_no_sza], feats_no_sza)
# r2_all_no_sza

#### correlation between the signals using [stats.pearsonr](https://towardsdatascience.com/four-ways-to-quantify-synchrony-between-time-series-data-b99136c4a9c9)

We calculate:
- overall synchrony between r0 and Cn2
- local synchrony between r0 and Cn2

#### Overall Synchrony

In [None]:
    # r, p = print_pearsonr(df_subset.loc[train&valid,label], df_subset.loc[train&valid,'logCn2'])
# plot_overall_synchrony(df_subset.loc[train&valid,label], df_subset.loc[train&valid,['logCn2']], label, 'logCn2', r)

#### Local Synchrony

In [None]:
# plot_local_synchrony(df_subset.loc[train&valid,label], df_subset.loc[train&valid,['logCn2']], label, 'Cn2')

## Synchrony using only R0 daytime data

#### Overall Synchrony

In [None]:
# r, p = print_pearsonr(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,'logCn2'])
# r

In [None]:
# plot_overall_synchrony(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,['logCn2']], label_day, 'logCn2', r)

#### Local Synchrony

In [None]:
# plot_local_synchrony(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,['logCn2']], label_day, 'logCn2')

## Try comparing results without CN2

In [None]:
# results_all_15T_nocn2 = train_and_test(df_subset.loc[train & valid], df_subset.loc[test & valid], feats_minus_cn2, 'r0_15T')

In [None]:
# results_day_15T_nocn2 = train_and_test(df_subset.loc[train & valid_day], df_subset.loc[test & valid_day], feats_minus_cn2, 'r0_day_15T')

In [None]:
# results_night_15T_nocn2 = train_and_test(df_subset.loc[train & valid_night], df_subset.loc[test & valid_night], feats_minus_cn2, 'r0_night_15T')

In [None]:
# for r in [ results_all_15T, results_all_15T_nocn2 ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# for r in [ results_day_15T, results_day_15T_nocn2 ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# for r in [ results_night_15T, results_night_15T_nocn2 ]:
#     print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# plot_importance(results_all_15T['forest'], df_subset.loc[train & valid, feats], feats)

In [None]:
# test_truth_night_15T = df_subset.loc[test&valid_night,'r0_night_15T']
# test_pred_night_15T = results_night_10T['preds']
# error_by_r0_histograms(test_truth_night_15T, error_perc(test_truth_night_15T, test_pred_night_15T), 0, 80)
# scatter_with_errors(test_truth_night_15T, test_pred_night_15T, error_perc, xmin, xmax, ymin, ymax)

## Why is R2 so weird?

In [None]:
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night'])))
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night_2T'])))
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night_5T'])))
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night_10T'])))
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night_15T'])))
# print("{:.5}".format(np.var(df_subset.loc[test&valid_night,'r0_night_20T'])))

## Plot One Day

In [None]:
# def plot_one_day(preds, smooth_df, orig_df, startdate, enddate):

#     preds_df = pd.DataFrame(preds, index=smooth_df.index) 

#     daymask = (orig_df.index > startdate ) & (orig_df.index < enddate)

#     plt.figure(figsize=(20, 5))
#     plt.plot(orig_df[daymask], 'g.', label='r0')
#     plt.plot(smooth_df[daymask], 'b.', label='r0 smoothed (15min)')
#     plt.plot(preds_df[daymask], 'r.', label='preds')
#     plt.ylabel('r0')
#     plt.legend()
#     return

In [None]:
# plot_one_day(results_all_15T['preds'], df_subset.loc[test & valid, 'r0_15T'], df_subset.loc[test & valid, 'r0'], '2020-05-30 04:00', '2020-05-31')


In [None]:
# plot_one_day(results_all_15T['preds'], df_subset.loc[test & valid, 'r0_15T'], df_subset.loc[test & valid, 'r0'], '2020-08-30 04:00', '2020-08-31 00:00')

In [None]:
# plot_one_day(results_all_15T['preds'], df_subset.loc[test & valid, 'r0_15T'], df_subset.loc[test & valid, 'r0'], '2020-02-15 04:00', '2020-02-16 00:00')

In [None]:
# plot_one_day(results_all_15T['preds'], df_subset.loc[test & valid, 'r0_15T'], df_subset.loc[test & valid, 'r0'], '2020-11-15 04:00', '2020-11-16 00:00')