In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import statsmodels.api as sm

import pickle
import random

# Notebook-wide NumPy random generator, seeded with a fixed value so that any
# draws taken from it are repeatable across kernel restarts.
rng = np.random.default_rng(seed=926334)

## Perform Linear Regression to determine best predictor of Actual RMSE

In [71]:
# Load the validation table of RMSE metrics used by the regression further down.
# NOTE(review): this is an absolute, machine-specific path; the cell fails on any
# other machine until it is pointed at a local copy of validation_table.csv.
path = "C:/Users/Matt/Dropbox/SnowComp/FinalData/validation_table.csv"

# '-' marks a missing metric in the file; turn it into NaN.
# (`np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.)
df = pd.read_csv(path).replace('-', np.nan)

# Convert the columns at positions 1-5 to numeric dtype (presumably 'RMSE Train'
# through 'Stratified RMSE CV' -- confirm against the file).
# Assign by column label rather than `df.iloc[:, 1:6] = ...`: on pandas >= 2.0 an
# iloc assignment writes the values into the existing object-dtype columns, so the
# dtype would stay `object` and the later OLS fit would reject the data.
metric_cols = df.columns[1:6]
df[metric_cols] = df[metric_cols].apply(pd.to_numeric)
df

Unnamed: 0,Date/Sub#,RMSE Train,RMSE Test,RMSE Holdout,RMSE CV,Stratified RMSE CV,Actual RMSEs,Model Description
0,1/10/2022 - 1,11.2054,7.8364,,11.2108,11.4191,10.7437,"RF, all data, fuzz=0.3, lat lon + day of season"
1,1/17/2022,11.406,7.919,8.243,10.3241,,10.3993,"By region, fuzz, no holdout, quadratic?"
2,1/18/2022 - 1,12.645,9.144,,11.4126,,11.3226,1/17 but no ground truth - CV calibration
3,1/18/2022 - 2,11.4794,8.7318,,11.51,11.56,10.7002,"RF, all data, state dummies + day of season"
4,1/19/2022 - 1,11.487,8.4292,,11.51,11.56,9.7753,"RF, all data including test, state dummies + d..."
5,1/29/2022 - 1,15.3637,,,15.3644,15.3854,11.8648,"RF, only data with MODIS imagery, state dummie..."
6,1/30/2022 - 2,13.1548,,,13.5582,13.5787,9.1539,"RF, only data with MODIS imagery, state dummie..."
7,2/04/2022 - 1,9.5228,8.2486,,9.7126,9.7573,8.3817,"RF, all data, state dummies + MODIS prediction..."
8,2/07/2022 - 1,9.0158,8.1114,,9.4568,9.4604,8.0771,"RF, all data, day of season + MODIS prediction..."
9,2/09/2022 - 1,7.4634,5.0854,,7.5993,7.6044,10.9255,"RF, all data, day of season + MODIS prediction..."


In [72]:
# Replace NaNs in these two predictor columns with the column mean.
# The filled Series is assigned back to `df` instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates an intermediate Series
# (chained assignment), which warns on recent pandas and leaves `df` untouched
# once Copy-on-Write is active.
df['RMSE Test'] = df['RMSE Test'].fillna(df['RMSE Test'].mean())
df['Stratified RMSE CV'] = df['Stratified RMSE CV'].fillna(df['Stratified RMSE CV'].mean())

In [73]:
# Design matrix: the four locally computed RMSE estimates, with an intercept
# column prepended by statsmodels.
X = sm.add_constant(
    df.loc[:, ['RMSE Train', 'RMSE Test', 'RMSE CV', 'Stratified RMSE CV']]
)

# Response: the 'Actual RMSEs' column.
y = df.loc[:, 'Actual RMSEs']

In [74]:
# Ordinary least squares fit of y on X.
model = sm.OLS(endog=y, exog=X)
results = model.fit()

# Leave the summary as the cell's last expression so it renders as rich output.
results.summary()



0,1,2,3
Dep. Variable:,Actual RMSEs,R-squared:,0.687
Model:,OLS,Adj. R-squared:,0.53
Method:,Least Squares,F-statistic:,4.385
Date:,"Mon, 30 May 2022",Prob (F-statistic):,0.0361
Time:,11:57:28,Log-Likelihood:,-14.207
No. Observations:,13,AIC:,38.41
Df Residuals:,8,BIC:,41.24
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,12.3352,1.924,6.412,0.000,7.899,16.772
RMSE Train,1.8132,0.544,3.333,0.010,0.559,3.068
RMSE Test,-0.8594,0.279,-3.083,0.015,-1.502,-0.217
RMSE CV,-0.4967,1.258,-0.395,0.703,-3.398,2.405
Stratified RMSE CV,-0.9348,1.167,-0.801,0.446,-3.626,1.757

0,1,2,3
Omnibus:,1.488,Durbin-Watson:,1.626
Prob(Omnibus):,0.475,Jarque-Bera (JB):,0.674
Skew:,0.555,Prob(JB):,0.714
Kurtosis:,2.881,Cond. No.,160.0


In [75]:
# Penalized refit of the same regression, used to see which predictors keep a
# non-zero coefficient.
# NOTE(review): L1_wt=0.8 is an elastic-net mix (statsmodels documents L1_wt=1
# as the lasso and 0 as ridge), so despite its name `res_lasso` is not a pure
# lasso fit.
# NOTE(review): a scalar `alpha` applies the same penalty to every column of X,
# including the constant -- confirm that shrinking the intercept is intended.
model2 = sm.OLS(endog=y, exog=X)
res_lasso = model2.fit_regularized(method='elastic_net', alpha=0.05, L1_wt=0.8)

# Display the fitted coefficients.
res_lasso.params

const                 6.561946
RMSE Train            0.403437
RMSE Test            -0.243810
RMSE CV               0.061579
Stratified RMSE CV    0.000000
dtype: float64

In [78]:
# Print each table of the OLS summary as plain text, one after another.
# (The enumerate index was never used, so the tables are iterated directly.)
for table in results.summary().tables:
    print(table)


                            OLS Regression Results                            
Dep. Variable:           Actual RMSEs   R-squared:                       0.687
Model:                            OLS   Adj. R-squared:                  0.530
Method:                 Least Squares   F-statistic:                     4.385
Date:                Mon, 30 May 2022   Prob (F-statistic):             0.0361
Time:                        11:58:55   Log-Likelihood:                -14.207
No. Observations:                  13   AIC:                             38.41
Df Residuals:                       8   BIC:                             41.24
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 12.3352      1



In [79]:
# Emit the whole OLS summary as a single LaTeX snippet.
summary_latex = results.summary().as_latex()
print(summary_latex)

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}     &   Actual RMSEs   & \textbf{  R-squared:         } &     0.687   \\
\textbf{Model:}             &       OLS        & \textbf{  Adj. R-squared:    } &     0.530   \\
\textbf{Method:}            &  Least Squares   & \textbf{  F-statistic:       } &     4.385   \\
\textbf{Date:}              & Mon, 30 May 2022 & \textbf{  Prob (F-statistic):} &   0.0361    \\
\textbf{Time:}              &     11:59:32     & \textbf{  Log-Likelihood:    } &   -14.207   \\
\textbf{No. Observations:}  &          13      & \textbf{  AIC:               } &     38.41   \\
\textbf{Df Residuals:}      &           8      & \textbf{  BIC:               } &     41.24   \\
\textbf{Df Model:}          &           4      & \textbf{                     } &             \\
\textbf{Covariance Type:}   &    nonrobust     & \textbf{                     } &             \\
\bottomrule
\end{tabular}
\begin{tabular}{lcccccc}
                            & 

In [80]:
# Emit every table of the OLS summary as its own LaTeX tabular, one per print.
summary_tables = results.summary().tables
for table in summary_tables:
    print(table.as_latex_tabular())

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}    &   Actual RMSEs   & \textbf{  R-squared:         } &    0.687  \\
\textbf{Model:}            &       OLS        & \textbf{  Adj. R-squared:    } &    0.530  \\
\textbf{Method:}           &  Least Squares   & \textbf{  F-statistic:       } &    4.385  \\
\textbf{Date:}             & Mon, 30 May 2022 & \textbf{  Prob (F-statistic):} &  0.0361   \\
\textbf{Time:}             &     11:59:53     & \textbf{  Log-Likelihood:    } &  -14.207  \\
\textbf{No. Observations:} &          13      & \textbf{  AIC:               } &    38.41  \\
\textbf{Df Residuals:}     &           8      & \textbf{  BIC:               } &    41.24  \\
\textbf{Df Model:}         &           4      & \textbf{                     } &           \\
\textbf{Covariance Type:}  &    nonrobust     & \textbf{                     } &           \\
\bottomrule
\end{tabular}
%\caption{OLS Regression Results}
\end{center}
\begin{center}
\begin{tabular}{lccc