### import statements

In [1]:
%matplotlib inline
import sys
import os
import pandas as pd
sys.path.append('../src')
import datetime
import matplotlib.pyplot as plt
import numpy as np
import sklearn

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from scipy.stats import gaussian_kde
import seaborn as sns
import scipy.stats as stats
from mpl_toolkits.axes_grid1 import make_axes_locatable
from plot_utils import *

### constants

In [3]:
# Relative path of the HDF5 store the notebook loads its input table from.
F_INPUT = '../../data/v2/data.h5'

### read data

In [4]:
# Load the table stored under the 'merged' key of the input HDF5 file.
df = pd.read_hdf(F_INPUT, key='merged')

### add new features

In [5]:
# Seasonal feature: day of the year taken from the datetime index.
df['day'] = df.index.dayofyear
# Cn2 is used in log10 scale.
df['logCn2'] = df['Cn2'].pipe(np.log10)

In [6]:
# Time-based 2-minute rolling mean of each r0 series.
# The window is spelled '2min' because the 'T' minute alias is deprecated in
# recent pandas releases; the resulting column names keep the '_2T' suffix.
for col in ('r0', 'r0_day', 'r0_night'):
    df[f'{col}_2T'] = df[col].rolling('2min').mean()

In [7]:
# Time-based 3-minute rolling mean of each r0 series.
# The window is spelled '3min' because the 'T' minute alias is deprecated in
# recent pandas releases; the resulting column names keep the '_3T' suffix.
for col in ('r0', 'r0_day', 'r0_night'):
    df[f'{col}_3T'] = df[col].rolling('3min').mean()

In [8]:
# Time-based 5-minute rolling mean of each r0 series.
# The window is spelled '5min' because the 'T' minute alias is deprecated in
# recent pandas releases; the resulting column names keep the '_5T' suffix.
for col in ('r0', 'r0_day', 'r0_night'):
    df[f'{col}_5T'] = df[col].rolling('5min').mean()

In [70]:
# Time-based 10-minute rolling mean of each r0 series.
# The window is spelled '10min' because the 'T' minute alias is deprecated in
# recent pandas releases; the resulting column names keep the '_10T' suffix.
for col in ('r0', 'r0_day', 'r0_night'):
    df[f'{col}_10T'] = df[col].rolling('10min').mean()

In [72]:
# Input columns shared by every model variant.
feats = [
    'pressure', 'relative_humidity', 'temperature', 'wind_speed',
    'logCn2', 'solar_zenith_angle', 'day',
]

# Target columns: combined, daytime-only and night-only r0.
label = 'r0'
label_day = 'r0_day'
label_night = 'r0_night'

# Features plus one target each; used to select rows without missing values.
feats_plus_r0 = [*feats, label]
feats_plus_r0day = [*feats, label_day]
feats_plus_r0night = [*feats, label_night]

In [73]:
# Common lower and upper axis limits passed to the scatter-plot helpers.
xmin = ymin = 0
xmax = ymax = 20

### restricting data to usable, relatively dense subset

In [74]:
# Keep only rows whose timestamp lies strictly between the two date bounds.
df_subset = df.loc[
    (df.index > '2018-05-03')
    & (df.index < '2020-12-30')
]

In [75]:
# Summary statistics of the restricted frame (includes the derived rolling columns).
df_subset.describe()

Unnamed: 0,o(I)_I,r0,solar_zenith_angle,r0_day,r0_night,pressure,relative_humidity,temperature,wind_direction,wind_speed,...,r0_night_2T,r0_3T,r0_day_3T,r0_night_3T,r0_5T,r0_day_5T,r0_night_5T,r0_10T,r0_day_10T,r0_night_10T
count,273728.0,413091.0,413091.0,273728.0,139363.0,1252046.0,1164787.0,1164792.0,1280385.0,1280461.0,...,228135.0,537455.0,295598.0,241857.0,563929.0,309085.0,254844.0,598586.0,331070.0,267516.0
mean,0.001382,4.87189,74.69819,3.607425,7.355473,774.4021,35.32743,11.04439,174.7172,2.327763,...,7.357703,5.298633,3.614953,7.356429,5.30628,3.620901,7.350375,5.288679,3.633298,7.33733
std,0.000448,2.995811,43.19205,2.357307,2.525811,4.687234,24.06511,7.993629,67.63609,1.770945,...,2.516584,3.117358,2.500932,2.500309,3.135924,2.567808,2.478042,3.153837,2.647643,2.448015
min,0.000102,0.821452,10.943276,0.821452,2.3674,751.6,1.0,-13.0,2.05,0.035,...,2.3674,0.821452,0.821452,2.3674,0.821452,0.821452,2.3674,0.821452,0.821452,2.3674
25%,0.001066,2.599046,42.089154,2.273231,5.5091,771.8,19.0,5.2,116.3684,1.163158,...,5.5269,2.796983,2.297053,5.5358,2.79465,2.301819,5.5447,2.77462,2.302938,5.557414
50%,0.00136,3.9694,58.682654,2.96627,7.0666,775.1,29.0,11.1,183.55,1.826316,...,7.0755,4.6102,2.952146,7.07995,4.62227,2.943144,7.0844,4.590917,2.927821,7.087367
75%,0.001653,6.553345,118.005308,4.196203,8.9267,777.9,45.0,17.5,209.0526,2.88,...,8.9178,7.19565,4.118894,8.9178,7.2179,4.093722,8.905933,7.21256,4.079838,8.889617
max,0.0035,86.039498,169.053912,86.039498,24.8577,783.8,110.0,31.8,360.0,31.96667,...,24.8577,86.039498,86.039498,24.8577,86.039498,86.039498,24.8577,86.039498,86.039498,22.07645


### finding non-nan values

In [77]:
# True for rows where every feature and the combined r0 target are present.
valid = df_subset[feats_plus_r0].notnull().all(axis=1)

In [78]:
# Non-null count per column over the rows selected by `valid`.
df_subset.loc[valid, feats_plus_r0].count()

pressure              350540
relative_humidity     350540
temperature           350540
wind_speed            350540
logCn2                350540
solar_zenith_angle    350540
day                   350540
r0                    350540
dtype: int64

In [79]:
# True for rows where every feature and the daytime r0 target are present.
valid_day = df_subset[feats_plus_r0day].notnull().all(axis=1)

In [80]:
# Non-null count per column over the rows selected by `valid_day`.
df_subset.loc[valid_day, feats_plus_r0day].count()

pressure              227118
relative_humidity     227118
temperature           227118
wind_speed            227118
logCn2                227118
solar_zenith_angle    227118
day                   227118
r0_day                227118
dtype: int64

In [81]:
# True for rows where every feature and the night-time r0 target are present.
valid_night = df_subset[feats_plus_r0night].notnull().all(axis=1)

In [82]:
# Non-null count per column over the rows selected by `valid_night`.
df_subset.loc[valid_night,feats_plus_r0night].count()

pressure              123422
relative_humidity     123422
temperature           123422
wind_speed            123422
logCn2                123422
solar_zenith_angle    123422
day                   123422
r0_night              123422
dtype: int64

In [87]:
# Check that the 10-minute smoothed night label is non-null on the rows selected
# by `valid_night`, a mask that was built from the unsmoothed `r0_night` column.
df_subset.loc[valid_night,feats + ['r0_night_10T']].count()

pressure              123422
relative_humidity     123422
temperature           123422
wind_speed            123422
logCn2                123422
solar_zenith_angle    123422
day                   123422
r0_night_10T          123422
dtype: int64

In [88]:
# Check that the 10-minute smoothed day label is non-null on the rows selected
# by `valid_day`, a mask that was built from the unsmoothed `r0_day` column.
df_subset.loc[valid_day,feats + ['r0_day_10T']].count()

pressure              227118
relative_humidity     227118
temperature           227118
wind_speed            227118
logCn2                227118
solar_zenith_angle    227118
day                   227118
r0_day_10T            227118
dtype: int64

In [89]:
# Check that the 10-minute smoothed label is non-null on the rows selected
# by `valid`, a mask that was built from the unsmoothed `r0` column.
df_subset.loc[valid,feats + ['r0_10T']].count()

pressure              350540
relative_humidity     350540
temperature           350540
wind_speed            350540
logCn2                350540
solar_zenith_angle    350540
day                   350540
r0_10T                350540
dtype: int64

### splitting into train and test

In [90]:
# Chronological hold-out: rows at or before split_date train, later rows test.
# NOTE(review): the bare date string presumably compares as 2019-12-31 00:00, so
# samples later on Dec 31 would land in the test set — confirm this is intended.
split_date = '2019-12-31'
train = df_subset.index <= split_date
test  = df_subset.index > split_date

In [91]:
# Ground-truth targets on the test rows, one series per model variant.
test_truth_all = df_subset.loc[test & valid, label]
test_truth_day = df_subset.loc[test & valid_day, label_day]
test_truth_night = df_subset.loc[test & valid_night, label_night]

In [92]:
# Number of complete training rows for the combined-r0 model.
df_subset.loc[train&valid,feats_plus_r0].count()

pressure              251681
relative_humidity     251681
temperature           251681
wind_speed            251681
logCn2                251681
solar_zenith_angle    251681
day                   251681
r0                    251681
dtype: int64

In [93]:
# Number of complete test rows for the combined-r0 model.
df_subset.loc[test&valid,feats_plus_r0].count()

pressure              98859
relative_humidity     98859
temperature           98859
wind_speed            98859
logCn2                98859
solar_zenith_angle    98859
day                   98859
r0                    98859
dtype: int64

In [94]:
# Number of complete training rows for the daytime model.
df_subset.loc[train&valid_day,feats_plus_r0day].count()

pressure              174044
relative_humidity     174044
temperature           174044
wind_speed            174044
logCn2                174044
solar_zenith_angle    174044
day                   174044
r0_day                174044
dtype: int64

In [95]:
# Number of complete test rows for the daytime model.
df_subset.loc[test&valid_day,feats_plus_r0day].count()

pressure              53074
relative_humidity     53074
temperature           53074
wind_speed            53074
logCn2                53074
solar_zenith_angle    53074
day                   53074
r0_day                53074
dtype: int64

In [96]:
# Number of complete training rows for the night-time model.
df_subset.loc[train&valid_night,feats_plus_r0night].count()

pressure              77637
relative_humidity     77637
temperature           77637
wind_speed            77637
logCn2                77637
solar_zenith_angle    77637
day                   77637
r0_night              77637
dtype: int64

In [97]:
# Number of complete test rows for the night-time model.
df_subset.loc[test&valid_night,feats_plus_r0night].count()

pressure              45785
relative_humidity     45785
temperature           45785
wind_speed            45785
logCn2                45785
solar_zenith_angle    45785
day                   45785
r0_night              45785
dtype: int64

### initializing the RF regressor

In [26]:
# Shared random forest (100 trees, fixed seed). train_and_test calls regr.fit on
# this same object every time, so it only ever holds the most recent fit.
regr = RandomForestRegressor(n_estimators=100, random_state=0)

### train and test subroutine

In [41]:
def train_and_test(train_df, test_df, feats, label, model=None):
    """Fit a regressor on the training rows and score it on the test rows.

    Parameters
    ----------
    train_df, test_df : frames indexable by column name
        Rows used for fitting and for evaluation respectively.
    feats : list of str
        Names of the input columns.
    label : str
        Name of the target column.
    model : estimator with ``fit``/``predict``, optional
        Estimator to train. When omitted, the notebook-level ``regr`` is used
        and refit in place, which is the original behaviour.

    Returns
    -------
    dict
        ``'preds'``: predictions for ``test_df``;
        ``'r2'``: ``r2_score`` of the predictions;
        ``'sq_err'``: ``mean_squared_error``;
        ``'perc_err'``: ``mean_absolute_percentage_error``.
    """
    # Fall back to the shared global estimator so existing calls are unchanged.
    estimator = regr if model is None else model
    estimator.fit(train_df[feats], train_df[label])

    truth = test_df[label]
    preds = estimator.predict(test_df[feats])
    return {
        'preds': preds,
        'r2': r2_score(truth, preds),
        'sq_err': mean_squared_error(truth, preds),
        'perc_err': mean_absolute_percentage_error(truth, preds),
    }

### Get All Results

In [54]:
# Combined r0, unsmoothed target.
results_all_1T = train_and_test(
    train_df=df_subset.loc[train & valid],
    test_df=df_subset.loc[test & valid],
    feats=feats,
    label=label,
)

In [None]:
# scatter_with_errors(test_truth, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

In [59]:
# Daytime r0, unsmoothed target.
results_day_1T = train_and_test(
    train_df=df_subset.loc[train & valid_day],
    test_df=df_subset.loc[test & valid_day],
    feats=feats,
    label=label_day,
)

In [None]:
# scatter_with_errors(test_truth_day, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

In [60]:
# Night-time r0, unsmoothed target.
results_night_1T = train_and_test(
    train_df=df_subset.loc[train & valid_night],
    test_df=df_subset.loc[test & valid_night],
    feats=feats,
    label=label_night,
)

In [47]:
# Combined r0, 2-minute rolling-mean target.
results_all_2T = train_and_test(
    train_df=df_subset.loc[train & valid],
    test_df=df_subset.loc[test & valid],
    feats=feats,
    label='r0_2T',
)

In [48]:
# Daytime r0, 2-minute rolling-mean target.
results_day_2T = train_and_test(
    train_df=df_subset.loc[train & valid_day],
    test_df=df_subset.loc[test & valid_day],
    feats=feats,
    label='r0_day_2T',
)

In [49]:
# Night-time r0, 2-minute rolling-mean target.
results_night_2T = train_and_test(
    train_df=df_subset.loc[train & valid_night],
    test_df=df_subset.loc[test & valid_night],
    feats=feats,
    label='r0_night_2T',
)

In [50]:
# Combined r0, 5-minute rolling-mean target.
results_all_5T = train_and_test(
    train_df=df_subset.loc[train & valid],
    test_df=df_subset.loc[test & valid],
    feats=feats,
    label='r0_5T',
)

In [51]:
# Daytime r0, 5-minute rolling-mean target.
results_day_5T = train_and_test(
    train_df=df_subset.loc[train & valid_day],
    test_df=df_subset.loc[test & valid_day],
    feats=feats,
    label='r0_day_5T',
)

In [52]:
# Night-time r0, 5-minute rolling-mean target.
results_night_5T = train_and_test(
    train_df=df_subset.loc[train & valid_night],
    test_df=df_subset.loc[test & valid_night],
    feats=feats,
    label='r0_night_5T',
)

In [98]:
# Combined r0, 10-minute rolling-mean target.
results_all_10T = train_and_test(
    train_df=df_subset.loc[train & valid],
    test_df=df_subset.loc[test & valid],
    feats=feats,
    label='r0_10T',
)

In [99]:
# Daytime r0, 10-minute rolling-mean target.
results_day_10T = train_and_test(
    train_df=df_subset.loc[train & valid_day],
    test_df=df_subset.loc[test & valid_day],
    feats=feats,
    label='r0_day_10T',
)

In [100]:
# Night-time r0, 10-minute rolling-mean target.
results_night_10T = train_and_test(
    train_df=df_subset.loc[train & valid_night],
    test_df=df_subset.loc[test & valid_night],
    feats=feats,
    label='r0_night_10T',
)

### Compare all Results

In [101]:
# One comma-separated row (r2, squared error, percentage error) per smoothing
# window for the combined-r0 models, each value at 5 significant digits.
for res in (results_all_1T, results_all_2T, results_all_5T, results_all_10T):
    print(",".join(f"{res[key]:.5}" for key in ('r2', 'sq_err', 'perc_err')))

0.34246,6.1893,0.30759
0.34541,5.9972,0.29592
0.35381,5.6623,0.28418
0.36844,5.3812,0.27501


In [102]:
# One comma-separated row (r2, squared error, percentage error) per smoothing
# window for the daytime models, each value at 5 significant digits.
for res in (results_day_1T, results_day_2T, results_day_5T, results_day_10T):
    print(",".join(f"{res[key]:.5}" for key in ('r2', 'sq_err', 'perc_err')))

0.17369,6.9019,0.32359
0.16998,6.6112,0.30191
0.13605,6.5074,0.28984
0.17363,6.0437,0.27704


In [103]:
# One comma-separated row (r2, squared error, percentage error) per smoothing
# window for the night-time models, each value at 5 significant digits.
for res in (results_night_1T, results_night_2T, results_night_5T, results_night_10T):
    print(",".join(f"{res[key]:.5}" for key in ('r2', 'sq_err', 'perc_err')))

0.030069,5.2994,0.2799
0.025733,5.2473,0.27779
0.0047086,5.0569,0.2711
0.019509,4.7986,0.26401


In [None]:
# scatter_with_errors(test_truth_day, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

In [None]:
# scatter_with_errors(test_truth_day, test_preds_day, test_perc_err_day, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_day,feats], feats)

### Plotting

#### scatter plots of actual vs. predicted values using error_diff

In [32]:
# %matplotlib inline
# scatter_with_errors(test_truth_all, test_preds_all, error_perc, xmin, xmax, ymin, ymax)

# scatter_with_errors(test_truth_night, test_preds_night, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_night,feats], feats)

# scatter_with_errors(test_truth_night, test_preds_night, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid_night,feats], feats)

#### interactive time domain plot of errors

Switching matplotlib to notebook mode makes it possible to zoom in on different portions of the time axis.

In [33]:
# %matplotlib notebook 
# plot_errors_in_time(test_truth_all, test_preds_all)

#### feature importance

feature importance from the model

In [None]:
# %matplotlib inline

In [34]:
# plot_importance(regr, df_subset.loc[train&valid,feats], feats)

## debug why CN2 is so low

#### What happens if I drop month and SZA

Answer: turns out we had to take the log of CN2

In [35]:
# feats_no_sza = ['pressure', 'relative_humidity', 'temperature', 'wind_speed', 'logCn2']
# preds_all_no_sza, r2_all_no_sza = train_and_test(df_subset.loc[train & valid], df_subset.loc[test & valid], feats_no_sza, label)
# scatter_with_errors(test_truth_all, preds_all_no_sza, error_perc, xmin, xmax, ymin, ymax)
# plot_importance(regr, df_subset.loc[train&valid,feats_no_sza], feats_no_sza)
# r2_all_no_sza

#### correlation between the signals using [stats.pearsonr](https://towardsdatascience.com/four-ways-to-quantify-synchrony-between-time-series-data-b99136c4a9c9)

We calculate:
- overall synchrony between r0 and Cn2
- local synchrony between r0 and Cn2

#### Overall Synchrony

In [36]:
    # r, p = print_pearsonr(df_subset.loc[train&valid,label], df_subset.loc[train&valid,'logCn2'])
# plot_overall_synchrony(df_subset.loc[train&valid,label], df_subset.loc[train&valid,['logCn2']], label, 'logCn2', r)

#### Local Synchrony

In [37]:
# plot_local_synchrony(df_subset.loc[train&valid,label], df_subset.loc[train&valid,['logCn2']], label, 'Cn2')

## Synchrony using only R0 daytime data

#### Overall Synchrony

In [38]:
# r, p = print_pearsonr(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,'logCn2'])
# r

In [39]:
# plot_overall_synchrony(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,['logCn2']], label_day, 'logCn2', r)

#### Local Synchrony

In [40]:
# plot_local_synchrony(df_subset.loc[train&valid_day,label_day], df_subset.loc[train&valid_day,['logCn2']], label_day, 'logCn2')

## Performance Histograms by Magnitude

In [None]:
# Error histograms by r0 magnitude for the unsmoothed (1T) models.
# The predictions are read from the train_and_test result dicts; the former
# test_preds_* names are not defined anywhere in this notebook.
# NOTE(review): error_by_r0_histograms and error_perc are presumably supplied by
# the `from plot_utils import *` star import — confirm.
error_by_r0_histograms(test_truth_all, error_perc(test_truth_all, results_all_1T['preds']), 0, 80)
error_by_r0_histograms(test_truth_day, error_perc(test_truth_day, results_day_1T['preds']), 0, 80)
error_by_r0_histograms(test_truth_night, error_perc(test_truth_night, results_night_1T['preds']), 0, 80)

## Smoothed r0 experiments