### import statements

In [None]:
%matplotlib inline
import sys
import os
import pandas as pd
sys.path.append('../src')
import datetime
import matplotlib.pyplot as plt
import numpy as np
import sklearn

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from plot_utils import *

### constants

In [None]:
F_INPUT = '../../data/v2/data.h5'

### read data

In [None]:
df = pd.read_hdf(F_INPUT, 'merged')

### add new features

In [None]:
# Day of year taken from the index (this needs a datetime-like index).
df['day'] = df.index.dayofyear
# Base-10 log of the Cn2 column.
# NOTE(review): log10 gives -inf for 0 and NaN for negative inputs, and -inf is
# not flagged by the isnull() masks built further down — confirm that Cn2 is
# strictly positive in the data.
df['logCn2'] = np.log10(df['Cn2'])

In [None]:
# Time-based rolling mean over a 2-minute window for each r0 series.
# The window is spelled '2min' because the single-letter 'T' minute alias is
# deprecated in recent pandas releases; the output columns keep their '_2T'
# suffix so the cells that read them are unaffected.
df['r0_2T'] = df['r0'].rolling('2min').mean()
df['r0_day_2T'] = df['r0_day'].rolling('2min').mean()
df['r0_night_2T'] = df['r0_night'].rolling('2min').mean()

In [None]:
# Time-based rolling mean over a 3-minute window for each r0 series.
# The window is spelled '3min' because the single-letter 'T' minute alias is
# deprecated in recent pandas releases; the output columns keep their '_3T'
# suffix.
df['r0_3T'] = df['r0'].rolling('3min').mean()
df['r0_day_3T'] = df['r0_day'].rolling('3min').mean()
df['r0_night_3T'] = df['r0_night'].rolling('3min').mean()

In [None]:
# Time-based rolling mean over a 5-minute window for each r0 series.
# The window is spelled '5min' because the single-letter 'T' minute alias is
# deprecated in recent pandas releases; the output columns keep their '_5T'
# suffix so the cells that read them are unaffected.
df['r0_5T'] = df['r0'].rolling('5min').mean()
df['r0_day_5T'] = df['r0_day'].rolling('5min').mean()
df['r0_night_5T'] = df['r0_night'].rolling('5min').mean()

In [None]:
# Time-based rolling mean over a 10-minute window for each r0 series.
# The window is spelled '10min' because the single-letter 'T' minute alias is
# deprecated in recent pandas releases; the output columns keep their '_10T'
# suffix so the cells that read them are unaffected.
df['r0_10T'] = df['r0'].rolling('10min').mean()
df['r0_day_10T'] = df['r0_day'].rolling('10min').mean()
df['r0_night_10T'] = df['r0_night'].rolling('10min').mean()

In [None]:
# Input columns handed to the regressor.
feats = [
    'pressure',
    'relative_humidity',
    'temperature',
    'wind_speed',
    'logCn2',
    'solar_zenith_angle',
    'day',
]

# Target column names for the three r0 series.
label, label_day, label_night = 'r0', 'r0_day', 'r0_night'

# Inputs plus one target each; these lists drive the non-null row masks.
feats_plus_r0 = [*feats, label]
feats_plus_r0day = [*feats, label_day]
feats_plus_r0night = [*feats, label_night]

In [None]:
# Axis limits passed to the scatter plots; both axes share the same range.
xmin = ymin = 0
xmax = ymax = 20

### restricting data to usable, relatively dense subset

In [None]:
# Keep only rows whose index lies strictly between the two bounds
# (both comparisons are exclusive).
df_subset = df[(df.index > '2018-05-03') & (df.index < '2020-12-30')]

In [None]:
df_subset.describe()

### finding non-nan values

In [None]:
# Row mask: True where every input column and the r0 target are non-null.
valid = df_subset[feats_plus_r0].notnull().all(axis=1)

In [None]:
df_subset.loc[valid, feats_plus_r0].count()

In [None]:
# Row mask: True where every input column and the r0_day target are non-null.
valid_day = df_subset[feats_plus_r0day].notnull().all(axis=1)

In [None]:
df_subset.loc[valid_day, feats_plus_r0day].count()

In [None]:
# Row mask: True where every input column and the r0_night target are non-null.
valid_night = df_subset[feats_plus_r0night].notnull().all(axis=1)

In [None]:
df_subset.loc[valid_night,feats_plus_r0night].count()

In [None]:
df_subset.loc[valid_night,feats + ['r0_night_10T']].count()

In [None]:
df_subset.loc[valid_day,feats + ['r0_day_10T']].count()

In [None]:
df_subset.loc[valid,feats + ['r0_10T']].count()

### splitting into train and test

In [None]:
# Chronological split: rows up to split_date train the model, later rows test it.
# NOTE(review): the index is compared against a bare date string; if that is
# interpreted as 2019-12-31 00:00, samples later on that day fall into the
# test set — confirm this is intended.
split_date = '2019-12-31'
train = df_subset.index <= split_date
test  = df_subset.index > split_date

In [None]:
# Target series for the test rows of each variant, restricted to the rows
# that pass the matching non-null mask.
test_truth_night = df_subset.loc[test&valid_night,label_night]
test_truth_day = df_subset.loc[test&valid_day,label_day]
test_truth_all = df_subset.loc[test&valid,label]

In [None]:
df_subset.loc[train&valid,feats_plus_r0].count()

In [None]:
df_subset.loc[test&valid,feats_plus_r0].count()

In [None]:
df_subset.loc[train&valid_day,feats_plus_r0day].count()

In [None]:
df_subset.loc[test&valid_day,feats_plus_r0day].count()

In [None]:
df_subset.loc[train&valid_night,feats_plus_r0night].count()

In [None]:
df_subset.loc[test&valid_night,feats_plus_r0night].count()

### initializing the RF regressor

In [None]:
# One shared forest (100 estimators, fixed random_state). train_and_test calls
# fit() on this same object, so it always reflects the most recent fit only.
regr = RandomForestRegressor(n_estimators=100, random_state=0)

### train and test subroutine

In [None]:
def train_and_test(train_df, test_df, feats, label, model=None):
    """Fit a regressor on the training rows and score it on the test rows.

    Parameters
    ----------
    train_df, test_df : DataFrame-like
        Indexed with ``[feats]`` for the model inputs and ``[label]`` for the
        target.
    feats : list
        Column names used as model inputs.
    label : str
        Column name of the regression target.
    model : estimator, optional
        Object exposing ``fit(X, y)`` and ``predict(X)``. When omitted, the
        notebook-level ``regr`` is used; it is refit in place, so after the
        call ``regr`` holds this fit and any earlier fit is gone. Pass a
        separate estimator to keep several fitted models side by side.

    Returns
    -------
    dict
        ``preds`` (output of ``predict`` on the test inputs), ``r2``,
        ``sq_err`` (mean squared error) and ``perc_err`` (mean absolute
        percentage error), all computed against ``test_df[label]``.
    """
    estimator = regr if model is None else model
    estimator.fit(train_df[feats], train_df[label])

    truth = test_df[label]
    preds = estimator.predict(test_df[feats])
    return {
        'preds': preds,
        'r2': r2_score(truth, preds),
        'sq_err': mean_squared_error(truth, preds),
        'perc_err': mean_absolute_percentage_error(truth, preds),
    }

### Get All Results

In [None]:
results_all_1T = train_and_test(df_subset.loc[train & valid], df_subset.loc[test & valid], feats, label)

In [None]:
# scatter_with_errors(test_truth, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

In [None]:
results_day_1T = train_and_test(df_subset.loc[train & valid_day], df_subset.loc[test & valid_day], feats, label_day)

In [None]:
# scatter_with_errors(test_truth_day, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

In [None]:
results_night_1T = train_and_test(df_subset.loc[train & valid_night], df_subset.loc[test & valid_night], feats, label_night)

In [None]:
results_all_2T = train_and_test(df_subset.loc[train & valid], df_subset.loc[test & valid], feats, 'r0_2T')

In [None]:
results_day_2T = train_and_test(df_subset.loc[train & valid_day], df_subset.loc[test & valid_day], feats, 'r0_day_2T')

In [None]:
results_night_2T = train_and_test(df_subset.loc[train & valid_night], df_subset.loc[test & valid_night], feats, 'r0_night_2T')

In [None]:
results_all_5T = train_and_test(df_subset.loc[train & valid], df_subset.loc[test & valid], feats, 'r0_5T')

In [None]:
results_day_5T = train_and_test(df_subset.loc[train & valid_day], df_subset.loc[test & valid_day], feats, 'r0_day_5T')

In [None]:
results_night_5T = train_and_test(df_subset.loc[train & valid_night], df_subset.loc[test & valid_night], feats, 'r0_night_5T')

In [None]:
results_all_10T = train_and_test(df_subset.loc[train & valid], df_subset.loc[test & valid], feats, 'r0_10T')

In [None]:
results_day_10T = train_and_test(df_subset.loc[train & valid_day], df_subset.loc[test & valid_day], feats, 'r0_day_10T')

In [None]:
results_night_10T = train_and_test(df_subset.loc[train & valid_night], df_subset.loc[test & valid_night], feats, 'r0_night_10T')

### Compare all Results

In [None]:
# One output line per smoothing variant (1T, 2T, 5T, 10T) of the all-rows
# models: r2, sq_err, perc_err, comma-separated, each formatted with precision 5.
for r in [ results_all_1T, results_all_2T, results_all_5T, results_all_10T ]:
    print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# One output line per smoothing variant (1T, 2T, 5T, 10T) of the r0_day
# models: r2, sq_err, perc_err, comma-separated, each formatted with precision 5.
for r in [ results_day_1T, results_day_2T, results_day_5T, results_day_10T ]:
    print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# One output line per smoothing variant (1T, 2T, 5T, 10T) of the r0_night
# models: r2, sq_err, perc_err, comma-separated, each formatted with precision 5.
for r in [ results_night_1T, results_night_2T, results_night_5T, results_night_10T ]:
    print(f"{r['r2']:.5},{r['sq_err']:.5},{r['perc_err']:.5}")

In [None]:
# scatter_with_errors(test_truth_day, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

In [None]:
# scatter_with_errors(test_truth_day, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

In [None]:
# error_by_r0_histograms(test_truth_all, error_perc(test_truth_all, test_preds_all), 0, 80)
# error_by_r0_histograms(test_truth_day, error_perc(test_truth_day, test_preds_day), 0, 80)
# error_by_r0_histograms(test_truth_night, error_perc(test_truth_night, test_preds_night), 0, 80)

In [None]:
# Same three assignments as in the train/test split section earlier in the
# notebook; nothing they depend on is modified in between, so re-running them
# here leaves the values unchanged.
test_truth_night = df_subset.loc[test&valid_night,label_night]
test_truth_day = df_subset.loc[test&valid_day,label_day]
test_truth_all = df_subset.loc[test&valid,label]

In [None]:
# Truth and predictions for the r0_night_10T model; both use the
# test & valid_night rows, so they line up element by element.
test_truth_night_10T = df_subset.loc[test&valid_night,'r0_night_10T']
test_pred_night_10T = results_night_10T['preds']
# The histogram call receives already-computed errors, while the scatter call
# receives the error_perc callable itself.
# NOTE(review): both helpers are presumably from plot_utils (not visible here) —
# confirm that scatter_with_errors expects a function as its third argument.
error_by_r0_histograms(test_truth_night_10T, error_perc(test_truth_night_10T, test_pred_night_10T), 0, 80)
scatter_with_errors(test_truth_night_10T, test_pred_night_10T, error_perc, xmin, xmax, ymin, ymax)


In [None]:
# Truth and predictions for the r0_day_10T model; both use the
# test & valid_day rows, so they line up element by element.
test_truth_day_10T = df_subset.loc[test&valid_day,'r0_day_10T']
test_pred_day_10T = results_day_10T['preds']
# NOTE(review): the last histogram argument is 90 here but 80 in the night-time
# cell — confirm the difference is intentional.
error_by_r0_histograms(test_truth_day_10T, error_perc(test_truth_day_10T, test_pred_day_10T), 0, 90)
scatter_with_errors(test_truth_day_10T, test_pred_day_10T, error_perc, xmin, xmax, ymin, ymax)

### Plotting

#### Scatter plots of actual vs. predicted values using error_perc

In [None]:
# %matplotlib inline
# scatter_with_errors(test_truth_all, test_preds_all, error_perc, xmin, xmax, ymin, ymax)

# scatter_with_errors(test_truth_night, test_preds_night, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_night,feats], feats)

# scatter_with_errors(test_truth_night, test_preds_night, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_night,feats], feats)

#### interactive time domain plot of errors

Switching matplotlib to notebook mode makes the plot interactive, so different portions of the time axis can be zoomed into.

In [None]:
# %matplotlib notebook 
# plot_errors_in_time(test_truth_all, test_preds_all)

#### feature importance

feature importance from the model

In [None]:
# %matplotlib inline

In [None]:
# plot_importance(regr, df_subset.loc[train&valid,feats], feats)

## debug why CN2 is so low

#### What happens if I drop day-of-year and SZA?

Answer: turns out we had to take the log of CN2

In [None]:
# feats_no_sza = ['pressure', 'relative_humidity', 'temperature', 'wind_speed', 'logCn2']
# preds_all_no_sza, r2_all_no_sza = train_and_test(df_subset.loc[train & valid], df_subset.loc[test & valid], feats_no_sza, label)
# scatter_with_errors(test_truth_all, preds_all_no_sza, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid,feats_no_sza], feats_no_sza)
# r2_all_no_sza

#### correlation between the signals using [stats.pearsonr](https://towardsdatascience.com/four-ways-to-quantify-synchrony-between-time-series-data-b99136c4a9c9)

We calculate:
- overall synchrony between r0 and Cn2
- local synchrony between r0 and Cn2

#### Overall Synchrony

In [None]:
# r, p = print_pearsonr(df_subset.loc[train&valid,label], df_subset.loc[train&valid,'logCn2'])
# plot_overall_synchrony(df_subset.loc[train&valid,label], df_subset.loc[train&valid,['logCn2']], label, 'logCn2', r)

#### Local Synchrony

In [None]:
# plot_local_synchrony(df_subset.loc[train&valid,label], df_subset.loc[train&valid,['logCn2']], label, 'Cn2')

## Synchrony using only R0 daytime data

#### Overall Synchrony

In [None]:
# r, p = print_pearsonr(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,'logCn2'])
# r

In [None]:
# plot_overall_synchrony(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,['logCn2']], label_day, 'logCn2', r)

#### Local Synchrony

In [None]:
# plot_local_synchrony(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,['logCn2']], label_day, 'logCn2')

## Performance Histograms by Magnitude

## Smoothed r0 experiments