In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import statsmodels.api as sm
import statsmodels.formula.api as smf 

import pickle
import random

# Fixed seed so any random draws taken from `rng` are repeatable across runs.
SEED = 926334
rng = np.random.default_rng(SEED)

## Perform Linear Regression to determine best predictor of Actual RMSE

In [71]:
# Load the validation table: one row per submission, holding the locally
# computed RMSE metrics next to the RMSE actually achieved.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
path = "C:/Users/Matt/Dropbox/SnowComp/FinalData/validation_table.csv"

# Cells holding a bare '-' mark a missing metric. Parsing them as NaN at read
# time lets pandas infer float dtypes directly, and avoids `np.NaN`, which was
# removed in NumPy 2.0.
df = pd.read_csv(path, na_values="-")

# Coerce the same positional slice as before (columns 1-5) to numeric.
# Assigning by column label replaces each column, so the dtype really becomes
# numeric; `df.iloc[:, 1:6] = ...` can write the values into the existing
# object-dtype columns on recent pandas and leave the dtype unchanged.
metric_cols = df.columns[1:6]
df[metric_cols] = df[metric_cols].apply(pd.to_numeric)
df

Unnamed: 0,Date/Sub#,RMSE Train,RMSE Test,RMSE Holdout,RMSE CV,Stratified RMSE CV,Actual RMSEs,Model Description
0,1/10/2022 - 1,11.2054,7.8364,,11.2108,11.4191,10.7437,"RF, all data, fuzz=0.3, lat lon + day of season"
1,1/17/2022,11.406,7.919,8.243,10.3241,,10.3993,"By region, fuzz, no holdout, quadratic?"
2,1/18/2022 - 1,12.645,9.144,,11.4126,,11.3226,1/17 but no ground truth - CV calibration
3,1/18/2022 - 2,11.4794,8.7318,,11.51,11.56,10.7002,"RF, all data, state dummies + day of season"
4,1/19/2022 - 1,11.487,8.4292,,11.51,11.56,9.7753,"RF, all data including test, state dummies + d..."
5,1/29/2022 - 1,15.3637,,,15.3644,15.3854,11.8648,"RF, only data with MODIS imagery, state dummie..."
6,1/30/2022 - 2,13.1548,,,13.5582,13.5787,9.1539,"RF, only data with MODIS imagery, state dummie..."
7,2/04/2022 - 1,9.5228,8.2486,,9.7126,9.7573,8.3817,"RF, all data, state dummies + MODIS prediction..."
8,2/07/2022 - 1,9.0158,8.1114,,9.4568,9.4604,8.0771,"RF, all data, day of season + MODIS prediction..."
9,2/09/2022 - 1,7.4634,5.0854,,7.5993,7.6044,10.9255,"RF, all data, day of season + MODIS prediction..."


In [72]:
# Replace NaNs with the column mean so that rows with a missing metric are
# still usable in the regressions below.
# The filled Series is assigned back to the frame rather than calling
# `fillna(..., inplace=True)` on `df[col]`: that form is chained assignment,
# which emits a FutureWarning on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is enabled.
df['RMSE Test'] = df['RMSE Test'].fillna(df['RMSE Test'].mean())
df['Stratified RMSE CV'] = df['Stratified RMSE CV'].fillna(df['Stratified RMSE CV'].mean())

In [73]:
# Response is the achieved RMSE; predictors are the locally computed RMSE
# metrics. `add_constant` prepends the intercept column ('const').
predictor_cols = ['RMSE Train', 'RMSE Test', 'RMSE CV', 'Stratified RMSE CV']
y = df['Actual RMSEs']
X = sm.add_constant(df[predictor_cols])

In [74]:
# Ordinary least squares of the achieved RMSE on the four local metrics
# plus an intercept; the summary is the cell's displayed output.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
results.summary()



0,1,2,3
Dep. Variable:,Actual RMSEs,R-squared:,0.687
Model:,OLS,Adj. R-squared:,0.53
Method:,Least Squares,F-statistic:,4.385
Date:,"Mon, 30 May 2022",Prob (F-statistic):,0.0361
Time:,11:57:28,Log-Likelihood:,-14.207
No. Observations:,13,AIC:,38.41
Df Residuals:,8,BIC:,41.24
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,12.3352,1.924,6.412,0.000,7.899,16.772
RMSE Train,1.8132,0.544,3.333,0.010,0.559,3.068
RMSE Test,-0.8594,0.279,-3.083,0.015,-1.502,-0.217
RMSE CV,-0.4967,1.258,-0.395,0.703,-3.398,2.405
Stratified RMSE CV,-0.9348,1.167,-0.801,0.446,-3.626,1.757

0,1,2,3
Omnibus:,1.488,Durbin-Watson:,1.626
Prob(Omnibus):,0.475,Jarque-Bera (JB):,0.674
Skew:,0.555,Prob(JB):,0.714
Kurtosis:,2.881,Cond. No.,160.0


In [75]:
# Penalised fit on the same design matrix. With L1_wt=0.8 the penalty is
# mostly L1, so some coefficients can be shrunk to exactly zero.
# NOTE(review): a scalar `alpha` appears to apply to every column of X,
# including 'const' -- confirm that penalising the intercept is intended.
model2 = sm.OLS(y, X)
res_lasso = model2.fit_regularized(alpha=0.05, L1_wt=0.8)
res_lasso.params

const                 6.561946
RMSE Train            0.403437
RMSE Test            -0.243810
RMSE CV               0.061579
Stratified RMSE CV    0.000000
dtype: float64

In [78]:
# Print each table of the OLS summary as plain text.
# The loop index was never used, so iterate over the tables directly.
for table in results.summary().tables:
    print(table)


                            OLS Regression Results                            
Dep. Variable:           Actual RMSEs   R-squared:                       0.687
Model:                            OLS   Adj. R-squared:                  0.530
Method:                 Least Squares   F-statistic:                     4.385
Date:                Mon, 30 May 2022   Prob (F-statistic):             0.0361
Time:                        11:58:55   Log-Likelihood:                -14.207
No. Observations:                  13   AIC:                             38.41
Df Residuals:                       8   BIC:                             41.24
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 12.3352      1



In [79]:
# Render the whole OLS summary as one LaTeX snippet and print it.
summary_latex = results.summary().as_latex()
print(summary_latex)

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}     &   Actual RMSEs   & \textbf{  R-squared:         } &     0.687   \\
\textbf{Model:}             &       OLS        & \textbf{  Adj. R-squared:    } &     0.530   \\
\textbf{Method:}            &  Least Squares   & \textbf{  F-statistic:       } &     4.385   \\
\textbf{Date:}              & Mon, 30 May 2022 & \textbf{  Prob (F-statistic):} &   0.0361    \\
\textbf{Time:}              &     11:59:32     & \textbf{  Log-Likelihood:    } &   -14.207   \\
\textbf{No. Observations:}  &          13      & \textbf{  AIC:               } &     38.41   \\
\textbf{Df Residuals:}      &           8      & \textbf{  BIC:               } &     41.24   \\
\textbf{Df Model:}          &           4      & \textbf{                     } &             \\
\textbf{Covariance Type:}   &    nonrobust     & \textbf{                     } &             \\
\bottomrule
\end{tabular}
\begin{tabular}{lcccccc}
                            & 

In [80]:
# Print every summary table as its own LaTeX tabular block.
summary_tables = results.summary().tables
for table in summary_tables:
    latex_block = table.as_latex_tabular()
    print(latex_block)

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}    &   Actual RMSEs   & \textbf{  R-squared:         } &    0.687  \\
\textbf{Model:}            &       OLS        & \textbf{  Adj. R-squared:    } &    0.530  \\
\textbf{Method:}           &  Least Squares   & \textbf{  F-statistic:       } &    4.385  \\
\textbf{Date:}             & Mon, 30 May 2022 & \textbf{  Prob (F-statistic):} &  0.0361   \\
\textbf{Time:}             &     11:59:53     & \textbf{  Log-Likelihood:    } &  -14.207  \\
\textbf{No. Observations:} &          13      & \textbf{  AIC:               } &    38.41  \\
\textbf{Df Residuals:}     &           8      & \textbf{  BIC:               } &    41.24  \\
\textbf{Df Model:}         &           4      & \textbf{                     } &           \\
\textbf{Covariance Type:}  &    nonrobust     & \textbf{                     } &           \\
\bottomrule
\end{tabular}
%\caption{OLS Regression Results}
\end{center}
\begin{center}
\begin{tabular}{lccc

## Interaction Terms

In [91]:
# The formula cells below refer to these columns by underscore names, so
# swap the spaces for underscores.
# Columns not listed here keep their original names.
formula_safe_names = {
    "Actual RMSEs": "Actual_RMSEs",
    "RMSE Train": "RMSE_Train",
    "RMSE Test": "RMSE_Test",
    "RMSE CV": "RMSE_CV",
    "Stratified RMSE CV": "Stratified_RMSE_CV",
}
df.rename(columns=formula_safe_names, inplace=True)

In [99]:
# OLS with the four main effects plus four pairwise interaction terms.
# That is nine parameters including the intercept, so interpret with care
# on a table this small.
interaction_formula = 'Actual_RMSEs ~ RMSE_Train + RMSE_Test + RMSE_CV+ Stratified_RMSE_CV + \
              RMSE_Train: RMSE_Test + RMSE_Test:RMSE_CV + RMSE_Test: Stratified_RMSE_CV +  RMSE_CV:Stratified_RMSE_CV'
mod = smf.ols(data=df, formula=interaction_formula)
res = mod.fit()
res.summary()



0,1,2,3
Dep. Variable:,Actual_RMSEs,R-squared:,0.77
Model:,OLS,Adj. R-squared:,0.309
Method:,Least Squares,F-statistic:,1.671
Date:,"Mon, 30 May 2022",Prob (F-statistic):,0.326
Time:,12:17:13,Log-Likelihood:,-12.207
No. Observations:,13,AIC:,42.41
Df Residuals:,4,BIC:,47.5
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.3274,29.391,0.079,0.941,-79.275,83.930
RMSE_Train,-38.8971,63.212,-0.615,0.572,-214.402,136.608
RMSE_Test,0.8333,3.496,0.238,0.823,-8.872,10.539
RMSE_CV,181.9878,178.262,1.021,0.365,-312.947,676.923
Stratified_RMSE_CV,-142.2409,126.861,-1.121,0.325,-494.463,209.982
RMSE_Train:RMSE_Test,5.2233,7.720,0.677,0.536,-16.211,26.658
RMSE_Test:RMSE_CV,-22.9809,22.702,-1.012,0.369,-86.013,40.051
RMSE_Test:Stratified_RMSE_CV,17.6365,16.006,1.102,0.332,-26.803,62.076
RMSE_CV:Stratified_RMSE_CV,0.0178,0.123,0.145,0.892,-0.323,0.358

0,1,2,3
Omnibus:,5.953,Durbin-Watson:,1.732
Prob(Omnibus):,0.051,Jarque-Bera (JB):,2.726
Skew:,1.026,Prob(JB):,0.256
Kurtosis:,3.909,Cond. No.,148000.0


In [115]:
# Same main-effect + interaction specification, fitted with a penalty
# instead of plain least squares.
# Note: this rebinds `mod` and `res_lasso`, replacing any earlier objects
# with those names.
lasso_formula = 'Actual_RMSEs ~ RMSE_Train + RMSE_Test + RMSE_CV+ Stratified_RMSE_CV + \
              RMSE_Train: RMSE_Test + RMSE_Test:RMSE_CV + RMSE_Test: Stratified_RMSE_CV +  RMSE_CV:Stratified_RMSE_CV'
mod = smf.ols(data=df, formula=lasso_formula)
res_lasso = mod.fit_regularized(alpha=0.08, L1_wt=0.8)
res_lasso.params

Intercept                       6.031380
RMSE_Train                      0.623653
RMSE_Test                      -0.133226
RMSE_CV                         0.000000
Stratified_RMSE_CV              0.000000
RMSE_Train:RMSE_Test           -0.009262
RMSE_Test:RMSE_CV              -0.014897
RMSE_Test:Stratified_RMSE_CV   -0.001388
RMSE_CV:Stratified_RMSE_CV      0.000729
dtype: float64

In [118]:
# Reduced specification: Train and Test main effects, plus interactions of
# RMSE_Train with each of the other three metrics.
# Note: this rebinds `mod` and `res`.
reduced_formula = 'Actual_RMSEs ~ RMSE_Train + RMSE_Test + \
              RMSE_Train: RMSE_Test + RMSE_Train:RMSE_CV + RMSE_Train: Stratified_RMSE_CV'
mod = smf.ols(data=df, formula=reduced_formula)
res = mod.fit()
res.summary()



0,1,2,3
Dep. Variable:,Actual_RMSEs,R-squared:,0.645
Model:,OLS,Adj. R-squared:,0.392
Method:,Least Squares,F-statistic:,2.547
Date:,"Mon, 30 May 2022",Prob (F-statistic):,0.127
Time:,12:21:40,Log-Likelihood:,-15.016
No. Observations:,13,AIC:,42.03
Df Residuals:,7,BIC:,45.42
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,15.7545,21.748,0.724,0.492,-35.672,67.181
RMSE_Train,0.7317,3.017,0.242,0.815,-6.403,7.867
RMSE_Test,-2.5479,2.343,-1.088,0.313,-8.088,2.992
RMSE_Train:RMSE_Test,0.1623,0.278,0.585,0.577,-0.494,0.819
RMSE_Train:RMSE_CV,-0.0297,0.145,-0.204,0.844,-0.374,0.314
RMSE_Train:Stratified_RMSE_CV,-0.0361,0.166,-0.218,0.833,-0.428,0.355

0,1,2,3
Omnibus:,2.175,Durbin-Watson:,1.643
Prob(Omnibus):,0.337,Jarque-Bera (JB):,0.869
Skew:,-0.631,Prob(JB):,0.647
Kurtosis:,3.112,Cond. No.,15700.0
