##  Optimize a Dense Neural Network for gap filling and feature identification

**Note:** With a few tweaks to RepRunner, an LSTM can be run instead.

In [267]:
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
# from functools import partial
from multiprocessing import Pool
from sklearn import metrics
# from sklearn.model_selection import train_test_split

## Personal Modules
import ReadStandardTimeFill as RSTF
import importlib
import DenseNet as Dense
import MiscFuncs as MF
importlib.reload(Dense)
importlib.reload(RSTF)
importlib.reload(MF)

# %matplotlib inline

%matplotlib notebook
%config IPCompleter.greedy=True

from scipy.optimize import minimize, curve_fit
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from matplotlib import cm

from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

from ipywidgets import FloatProgress
from IPython.display import display
import os  
# Close a worker pool left over from an earlier run of this notebook, if any.
# Only the two expected "nothing to close" cases are ignored: `pool` was never
# created (NameError) or it was set to None by a single-process run
# (AttributeError). Any other failure is a real error and is allowed to surface.
try:
    pool.close()
except (NameError, AttributeError):
    pass


def Test(params,X,y,YScaled,XScaled,pool):
    """Return params['K'] uniform random numbers in place of real model scores.

    Takes the same arguments as the Dense.RunNN call used in the model cell so
    it can be swapped in for a quick dry run; every argument except
    params['K'] is ignored.
    """
    n_scores = params['K']
    fake_scores = np.random.rand(n_scores)
    return fake_scores


def ModSelect(Scope,Site):
    """Return the candidate input variables for a site and run scope.

    Parameters
    ----------
    Scope : str
        'Full' for the complete variable list, 'Test' for a short list.
    Site : str
        'Illisarvik' or 'FishIsland'.

    Returns
    -------
    list of str
        Variable names; a new list is returned on every call so callers may
        modify it freely.

    Raises
    ------
    ValueError
        If the Site/Scope pair is not one of the known combinations
        (previously this surfaced as an UnboundLocalError on the return).
    """
    Catalogue = {
        ('Illisarvik','Full'): ['H','wind_speed','air_pressure','PPFD_Avg','AirTC_Avg','VPD',
                                'Temp','VWC','Sedge','Shrub','Grass','Sparse','Out_of_Basin'],
        ('Illisarvik','Test'): ['PPFD_Avg','wind_speed','Temp'],#,'VWC','Sedge']
        ('FishIsland','Full'): ['H','Wind Spd','air pressure','Ta','Rn','PPFD','Rain','Water Table',
                                'Ts 2.5 cm','Ts 15 cm','VWC','Active Layer','24H Rain','Wtr Tbl Trnd'],
        ('FishIsland','Test'): ['H','Water Table','Wind Spd','Active Layer'],
    }
    try:
        Model = Catalogue[(Site,Scope)]
    except KeyError:
        raise ValueError('No variable list for Site=%r, Scope=%r; known (Site, Scope) pairs: %s'
                         % (Site,Scope,sorted(Catalogue))) from None
    return(list(Model))

def Combos(Model,L,factor=None):
    """List the L-sized combinations of Model, optionally filtered by factors.

    Parameters
    ----------
    Model : sequence of str
        Variable names to combine.
    L : int
        Number of variables per combination.
    factor : iterable of str, str or None
        Each entry is a '+'-joined group of variable names. When given, a
        combination is kept only if it contains every name of at least one
        group. A single string is treated as one group (not as a sequence of
        characters). None keeps every combination.

    Returns
    -------
    list of list of str
        Kept combinations in itertools.combinations order, without repeats
        when a factor filter is applied. The list is also printed.
    """
    if factor is None:
        Models = [list(c) for c in combinations(Model,L)]
    else:
        if isinstance(factor,str):
            factor = [factor]
        # Split each '+'-joined group once instead of once per combination.
        Required = [set(f.split('+')) for f in factor]
        Models = []
        Seen = set()  # constant-time duplicate check instead of scanning Models
        for c in combinations(Model,L):
            if c in Seen:
                continue
            if any(names.issubset(c) for names in Required):
                Seen.add(c)
                Models.append(list(c))

    print('Models: ',Models)
    return(Models)

def Stats(mse,j,i,params):
    """Summarise one model run as a single-row DataFrame.

    The row is labelled '<j>_<i>' and holds the model name from
    params['Model'], the mean of `mse`, its standard error
    (mse.std() divided by the square root of params['K']) and a
    'Performance' flag initialised to 0.
    """
    label = '{}_{}'.format(j,i)
    row = {
        'Model': params['Model'],
        'MSE': mse.mean(),
        'SE': mse.std()/params['K']**.5,
        'Performance': 0,
    }
    summary = pd.DataFrame([row],index=[label])
    return summary

def t(p,n):
    """Return the (1 - p) quantile of Student's t with n - 1 degrees of freedom."""
    return stats.t.ppf(1-p,n-1)

# Model

In [None]:

import shutil  # used by the clean-up step at the bottom of this cell

MP=False
Scope = 'Test'
cwd = os.getcwd()
# for Site in ['Illisarvik','FishIsland']:
Site='Illisarvik'
target='fco2'

# params['Loss']='mean_absolute_error'


def _BestRow(Frame):
    """Return the one row of Frame with the lowest non-NaN MSE.

    Taking the whole row keeps MSE and SE from the same model; a column-wise
    .min() would pair the best MSE with some other model's SE.
    """
    return(Frame.loc[Frame['MSE'].idxmin()])


# Forward selection over input sets: level j fits every j-variable model that
# contains at least one of the models retained at level j-1.
if __name__ == '__main__':
    # Build params before reading params['proc'] so the cell does not depend on
    # a `params` left behind by an earlier run of the notebook.
    params = Dense.Params(Scope,target,MP)
    if params['proc']>1:
        pool = Pool(processes=3,maxtasksperchild=75)
    else:
        pool = None

    Levels = []
    try:
        XVarriables=ModSelect(Scope,Site)
        for j in range(1,len(XVarriables)+1):
            if j == 1:
                Inputs = (Combos(XVarriables,j))
            else:
                Inputs = (Combos(XVarriables,j,Factors))
            if len(Inputs) == 0:
                # Nothing to fit at this size; stop rather than re-use the
                # previous level's results.
                break

            Rows = []
            for i,Input in enumerate(Inputs):
                params = Dense.Params(Scope,target,MP)
                params['Dpath'] = cwd+'/'+Site+'/'
                params['Spath'] = params['Dpath']+'/'+target+'/'+str(j)+'_'+str(i)+'/'
                # Creates missing parent folders and tolerates an existing
                # folder, while still reporting real failures (e.g. permissions).
                os.makedirs(params['Spath'],exist_ok=True)
                params['Sname'] = 'Y_'
                params['Inputs'] = Input
                params['Model'] = '+'.join(params['Inputs'])

                RST = RSTF.ReadStandardTimeFill(params['Dpath']+'ECData.csv',resample='2H')
                RST.Scale(params['target'],params['Inputs'])
                y = RST.y*1.0
                X = RST.X*1.0

                params['N']=int(y.shape[0]/30)
#                 mse = Test(params,X,y,RST.YScaled,RST.XScaled,pool)
                mse = Dense.RunNN(params,X,y,RST.YScaled,RST.XScaled,pool)
                Rows.append(Stats(mse,j,i,params))
            # Concatenate once per level; DataFrame.append no longer exists in pandas 2.
            Level = pd.concat(Rows)

            Min = _BestRow(Level)
            T= t(0.05,params['K'])
            # Keep every model whose MSE is within T standard errors of the best one.
            Keep = Level['MSE']<=Min['MSE']+Min['SE']*T
            Factors = Level.loc[Keep,'Model'].values
            Level.loc[Keep,'Performance']=1
            Levels.append(Level)
    finally:
        # Release the workers even when a run fails part-way through.
        if pool is not None:
            pool.close()

    Records = pd.concat(Levels)


plt.figure()
Min = _BestRow(Records)
T= t(0.05,params['K'])
# Performance: 0 = dropped at its level, 1 = retained at its level,
# 2 = within T standard errors of the best model overall.
Records.loc[Records['MSE']<=Min['MSE']+Min['SE']*T,'Performance']=2
Top = Records.loc[Records['Performance']==1]

# Delete the saved output folders of the models that were dropped.
Worst = Records.loc[Records['Performance']==0].index.values
for rm in Worst:
    shutil.rmtree(params['Dpath']+'/'+target+'/'+rm)
Best = Records.loc[Records['Performance']==2]
plt.bar(Top.index,Top['MSE'],yerr=Top['SE'])
plt.bar(Best.index,Best['MSE'],yerr=Best['SE'],color='red')

Models:  [['PPFD_Avg'], ['wind_speed'], ['Temp'], ['VWC'], ['Sedge']]
Epoch 00168: early stopping
Saved model to disk
Epoch 00163: early stopping
Epoch 00091: early stopping


  Y_hat_train_bar=np.nanmean(Y_hat_train,axis=0)
  r2_train = np.maximum((y_true[0,:]-Y_hat_train_bar)**2-Y_hat_train_var,0)


Saved model to disk
Done! 0.041502248547635796
Epoch 00055: early stopping
Saved model to disk
Epoch 00068: early stopping
Epoch 00007: early stopping
Saved model to disk
Done! 0.3107869941875794
Epoch 00006: early stopping
Saved model to disk
Epoch 00035: early stopping
Epoch 00085: early stopping
Saved model to disk
Done! 0.25763752268084134
Epoch 00057: early stopping
Saved model to disk
Epoch 00021: early stopping
Epoch 00006: early stopping
Saved model to disk
Done! 0.3068933085410452
Epoch 00055: early stopping
Saved model to disk
Epoch 00028: early stopping
Epoch 00006: early stopping
Saved model to disk
Done! 0.29388202297914967
Models:  [['PPFD_Avg', 'wind_speed'], ['PPFD_Avg', 'Temp'], ['PPFD_Avg', 'VWC'], ['PPFD_Avg', 'Sedge']]


Model    AirTC_Avg+Grass
MSE              0.34082
SE             0.0384341
dtype: object
1.6991270265334972
            Model       MSE        SE
44  AirTC_Avg+VWC  0.392928  0.042790
51        VPD+VWC  0.402128  0.045205
63      VWC+Sedge  0.395629  0.045843
65      VWC+Grass  0.340820  0.044978


# Sorting

# CI and PI

# Map Results

In [117]:
# Load each of the params['K'] saved models, predict over X, and summarise the
# spread of the predictions as confidence / prediction intervals.
# Uses `RST`, `params` and `target` left in the kernel by the model cell.
Fill = False
if Fill:
    X = RST.X_fill
else:
    X = RST.X
print(X.shape)
params['Sname']='Test'
params['Loss']='mean_absolute_error'
# keepdims=True leaves the sample axis in place so predict() receives one row
# of features rather than a flat vector (which only works with one input).
X_median = np.median(X,axis=0,keepdims=True)
Y_fill = []
Y_bar = []
MSE = []
for i in range(params['K']):
    params['iteration']=i
    Empty_Mod = Dense.Load_Model(params)
    Model = Dense.Load_Weights(Empty_Mod,params)
    Y = RST.YScaled.inverse_transform(Model.predict(X).reshape(-1,1))
    Y_bar.append(RST.YScaled.inverse_transform(Model.predict(X_median).reshape(-1,1)))
    if not Fill:
        # NOTE(review): RST.y appears to be the scaled target (it is passed to
        # inverse_transform further down) while Y is back-transformed - confirm
        # both are in the same units.
        mse = (metrics.mean_squared_error(RST.y,Y))
        MSE.append(mse)
    Y_fill.append(Y)
Y_fill = np.asanyarray(Y_fill).mean(axis=-1)
Y_bar = np.asanyarray(Y_bar).mean(axis=-1)
Y_fill_bar = Y_fill.mean(axis=0)
Y_bar = Y_bar.mean(axis=0)
if not Fill:
    MSE = np.asanyarray(MSE)
    # 95% half-width for the mean MSE over the runs. Written with the run count
    # instead of the leftover loop index; the value is the same as
    # std(ddof=0)/sqrt(K-1).
    n_runs = MSE.size
    CI = stats.t.ppf(1-0.025,n_runs-1)*MSE.std(ddof=1)/n_runs**.5
    print(CI)

print(Y_bar,Y_fill_bar.mean())

params['Sname']='Var'
params['iteration']=1
params['Loss']='Boot_Loss'
Empty_Mod = Dense.Load_Model(params)
Model = Dense.Load_Weights(Empty_Mod,params)
# The target scaler lives on RST; a bare `YScaled` is not defined in this notebook.
YVar=RST.YScaled.inverse_transform(Model.predict(X).reshape(-1,1))
YVar_bar=RST.YScaled.inverse_transform(Model.predict(X.mean(axis=0,keepdims=True)).reshape(-1,1))
X_back = np.squeeze(RST.XScaled.inverse_transform(X))

print(RST.YScaled.inverse_transform(RST.y).shape,np.squeeze(Y_fill_bar).shape)

Data = pd.DataFrame(data=X_back,columns=params['Inputs'])
Data[target] = np.squeeze(Y_fill_bar)
# NOTE(review): Data has a default integer index; if RST.Master is indexed
# differently this assignment aligns on labels and yields NaN (the saved
# output shows a NaN mean for this column) - confirm the intended alignment.
Data['True'] = RST.Master[target]#.YScaled.inverse_transform(RST.y)
# Sample variance of the K predictions at each row (stored under the name 'SE').
Data['SE'] = 1/(params['K']-1)*((Y_fill-Y_fill_bar)**2).sum(axis=0)
Data['Var'] = np.squeeze(YVar)
Data['CI']=stats.t.ppf(1-0.025,params['K'])*(Data['SE'])**.5
Data['PI']=stats.t.ppf(1-0.025,params['K'])*((Data['Var']+Data['SE'])**.5) #the accuracy of our estimate with respect to the observed output

print(Data['CI'].mean())
print(Data[target].mean())
print(Data['True'].mean())
Data['Fill'] = Data['True'].fillna(Data[target])
print(Data['Fill'].mean())

# plt.figure(figsize=(8,7))
# Data = Data.sort_values(by='PPFD_Avg')
# Data.index = Data.PPFD_Avg


# plt.scatter(Data.index,Data['True'],edgecolor='black',facecolor='white')
# plt.plot(Data.index,Data[target],label=
#         params['target']+' Model\nRMSE: '+str(np.round(metrics.r2_score(Data['True'],
#                                                                    Data[params['target']])**2,3)))
# # plt.plot(Data.index,Data['Var'],label= params['target']+
# # ' Model\nRMSE: '+str(np.round(metrics.mean_squared_error(Data['True'],
# #                                                                    Data[params['target']])**2,3)))


# plt.fill_between(Data.index, Data[target]-Data['PI'], 
#                  Data[target]+Data['PI'],  color = 'green', alpha = 0.4, 
#                  label = '95% PI')
# plt.fill_between(Data.index, Data[target]-Data['CI'], 
#                  Data[target]+Data['CI'],  color = 'red', alpha = 0.4, 
#                  label = '95% CI')
# plt.legend()

(910, 1)
0.0023568670271235147
[-0.35396278] -0.28427064
(910,) (910,)
0.02232573740184307
-0.2842707633972168
nan
-0.28427064746725733


# The "Optimum" Sized Model

In [None]:
# pool.close()
# Site = 'Illisarvik'#'FishIsland'#
# Run scope; ModSelect accepts 'Full' or 'Test'.
Scope = 'Test'
# Current working directory; the commented-out driver in this cell builds
# params['Dpath'] from it.
cwd = os.getcwd()

# def Params(Func,Y,MP = True):
#     params = {}
#     params['proc']=3
#     if MP == False:
#         params['proc']=1
#     if Func == 'Full':
#         epochs = 200
#         K = 30
#         splits_per_mod = 1
#         N = np.linspace(200,20,10,dtype='int32')
#     elif Func == 'Test':
#         epochs = 200
#         K = 30
#         splits_per_mod = 1
#         N = np.linspace(70,10,5,dtype='int32')
#     N = np.repeat(N,K)
#     d = {'N':N.astype(int)}
#     Runs = pd.DataFrame(data=d)
#     Runs['MAE'] = 0.0
#     Runs['R2'] = 0.0
#     Runs['Model']=0
#     params['K'] = K
#     params['epochs'] = epochs
#     params['Y'] = Y
#     params['splits_per_mod'] = splits_per_mod
#     params['Save'] = {}
#     params['Save']['Weights']=False
#     params['Save']['Model']=False
    
#     return(Runs,params)


# MP=False

# if Scope == 'Full':
#     MP = True
# if __name__=='__main__'and MP==True:
#     pool = Pool(processes=3,maxtasksperchild=75)
# else:pool=None
    
# # for Site in ['Illisarvik','FishIsland']:
# Site='Illisarvik'
# FillVar = 'fco2'
# #     for FillVar in ['fco2','fch4']:
# Runs,params = MF.Params(Scope,FillVar,MP)
# FullModel = ModSelect(Scope,Site)
# print(FullModel)
# params['Dpath'] = cwd+'/'+Site+'/'
# params['Prelim_N']=True
# Best,Scores,ModelRuns = MF.FactorTest(params,FullModel,Runs)
# print(Best,Scores)
# Scores,ModelRuns = Best_Fill(Best,Runs,Scores,params)
# Scores.to_csv(params['Dpath']+FillVar+'/GapFillingSummary.csv')
# ModelRuns.to_csv(params['Dpath']+FillVar+'/GapFilled.csv')

# if __name__=='__main__'and MP==True:
#     pool.close()

In [None]:
# Grp = Scores.groupby('Model').mean()
# Grp['SE'] = Scores[['Model','MAE']].groupby('Model').sem()
# # Grp['SE'] = Scores[['Key','MAE']].groupby('Key').sem()
# print(Grp)
# # plt.bar(Grp.index,Grp['MAE'],yerr=Grp['SE'])

In [None]:
# Scratch cell: only prints a fixed string.
print('kitty')