## Optimize a Dense Neural Network for gap filling and feature identification

**Note:** With a few tweaks to `RunReps`, an LSTM can be run instead.

In [18]:
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
from functools import partial
from multiprocessing import Pool
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split

## Personal Modules
import ReadStandardTimeFill as RSTF
import importlib
import DenseNet as Dense
importlib.reload(Dense)
importlib.reload(RSTF)

%matplotlib inline
%config IPCompleter.greedy=True

from scipy.optimize import minimize, curve_fit
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from matplotlib import cm

from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from ipywidgets import FloatProgress
from IPython.display import display
import os  

In [19]:
def Curve(x,a,b,c):
    """Evaluate the quadratic ``a*x**2 + b*x + c``.

    Used as the model function handed to ``curve_fit``; ``x`` may be a scalar
    or a NumPy array, in which case the result is computed element-wise.
    """
    quadratic_term = a * x**2
    linear_term = b * x
    return quadratic_term + linear_term + c

def TTV_Split(iteration,Memory,X,y,params,X_fill):
    """Train one replicate on a train / validation / test partition of (X, y).

    ``params`` is updated in place with the ``'seed'`` and ``'iteration'``
    derived from the replicate number and ``params['splits_per_mod']``.
    10% of the samples are held out for testing, then 11% of the remainder
    for validation; both splits use ``params['seed']`` as random state.

    Returns a tuple ``(trained, y_val)`` where ``trained`` is whatever
    ``Dense.Train_Steps`` returns and ``y_val`` is the validation target.
    """
    per_model = params['splits_per_mod']
    # Position of this replicate within its model, rescaled to 0-99.
    params['seed'] = int(iteration % per_model / per_model * 100)
    params['iteration'] = int(iteration / per_model)
    seed = params['seed']

    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=0.1, random_state=seed)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.11, random_state=seed)

    trained = Dense.Train_Steps(params, X_train, X_test, X_val, y_train, y_test,
                                y_val, X_fill=X_fill, Memory=Memory)
    return (trained, y_val)

def RunReps(Model,params,pool = None,FullReturn=False):
    """Train ``params['K']`` replicate networks for one factor set and gap-fill.

    Parameters
    ----------
    Model : list of str
        Input factor (column) names, passed to ``RST.Scale``.
    params : dict
        Run settings. Reads ``'Dpath'``, ``'proc'``, ``'Y'``, ``'K'`` and
        ``'Prelim_N'``; writes ``'N'`` when ``'Prelim_N'`` is set. In serial
        mode ``TTV_Split`` also updates ``'seed'`` / ``'iteration'`` in place.
    pool : multiprocessing.Pool or None
        Worker pool. Only used when running as ``__main__`` with
        ``params['proc'] != 1``; otherwise replicates run serially.
    FullReturn : bool
        Currently unused; kept for interface compatibility.

    Returns
    -------
    (RMSE, R2, filled)
        Per-replicate error and R2 lists (in original, un-scaled units) and
        ``RST.Master['TempFill']`` after filling with the replicate mean.

    NOTE(review): the list named RMSE is computed with
    ``metrics.mean_absolute_error`` (i.e. it is an MAE). The metric is left
    unchanged so previously recorded scores stay comparable - confirm intent.
    """
    RST = RSTF.ReadStandardTimeFill(params['Dpath']+'ECData.csv')
    # Fraction handed to Train_Steps as ``Memory``; presumably a per-process
    # memory share - TODO confirm against DenseNet.
    offset = 5/params['proc']
    Memory = (math.floor(100/params['proc'])- offset) * .01
    RST.Scale(params['Y'],Model)
    # ``*1.0`` yields float copies so the arrays held by RST are not aliased.
    y = RST.y*1.0
    X = RST.X*1.0
    X_fill = RST.X_fill*1.0
    if params['Prelim_N']:
        # Preliminary network size derived from the number of samples.
        params['N']=int(y.shape[0]*.8/30)

    run_one = partial(TTV_Split,Memory=Memory,X=X,y=y,params=params,X_fill=X_fill)
    replicate_ids = range(params['K'])
    if __name__=='__main__' and params['proc'] != 1 and pool is not None:
        replicates = pool.imap(run_one,replicate_ids)
    else:
        # Serial fallback (also taken when no pool was supplied).
        replicates = map(run_one,replicate_ids)

    RMSE = []
    R2 = []
    Y_fill = []
    for results in replicates:
        Y_fill.append(results[0][0])
        predicted = RST.YScaled.inverse_transform(results[0][1].reshape(-1,1))
        observed = RST.YScaled.inverse_transform(results[1].reshape(-1,1))
        RMSE.append(metrics.mean_absolute_error(observed,predicted))
        # r2_score is not symmetric: observations first, predictions second,
        # identically for the serial and the multiprocessing path.
        R2.append(metrics.r2_score(observed,predicted))

    # Ensemble estimate: average the gap-filled series over all replicates.
    Y_fill = np.asanyarray(Y_fill).mean(axis=0)
    # Use the target named in params rather than a module-level global.
    FillVarName = params['Y'].replace('f','F')
    RST.Fill(Y_fill,FillVarName)
    return(RMSE,R2,RST.Master['TempFill'])

def RunLoop(Runs,model):
    """Score each candidate network size in ``Runs['N']`` until error rises.

    For every unique N (in order of appearance) the replicates are run via
    ``RunReps`` and their scores written into the matching rows of ``Runs``.
    From the fourth size onwards the loop stops as soon as the mean error of
    the current size exceeds the best mean error so far plus its standard
    error; in that case only the rows that were actually scored are returned.

    Relies on the module-level ``params`` and ``pool``.
    """
    bar_style = {'description_width': 'initial'}
    progress = FloatProgress(min=0, max=Runs['N'].count(),description='Finding Best N:',style=bar_style)
    display(progress)
    for step, N in enumerate(Runs['N'].unique()):
        params['T'] = 0
        params['N']=N
        Results = RunReps(model,params,pool)
        RMSE, R2 = Results[0], Results[1]

        rows = Runs['N']==N
        Runs.loc[rows,'RMSE']=RMSE
        Runs.loc[rows,'R2']=R2
        progress.value+=params['K']

        # Rows still holding the initial 0.0 have not been scored yet.
        scored = Runs.loc[Runs['RMSE']!=0]
        rmse = scored.groupby('N').mean()
        rmse['se'] = scored[['N','RMSE']].groupby('N').sem()
        lowest = rmse['RMSE'].min()
        Val = lowest+rmse.loc[rmse['RMSE']==lowest,'se'].values[0]

        if step > 2 and np.asanyarray(RMSE).mean()>Val:
            Runs = Runs.loc[Runs['RMSE']!=0]
            print('Leaving at ',N)
            break
    return(Runs)

def N_Nodes(Runs,params,model,plot=False):
    """Choose the network size ``params['N']`` for ``model``.

    Runs ``RunLoop`` over the sizes in ``Runs``, fits a quadratic (``Curve``)
    to error versus N, and selects the smallest integer N whose fitted error
    lies below the best group's mean error plus its standard error. If the
    fitted curve never drops below that threshold, the first tested size whose
    mean error is below the threshold is used instead.

    Parameters
    ----------
    Runs : pandas.DataFrame
        Table with an ``'N'`` column plus score columns filled by ``RunLoop``.
    params : dict
        Run settings; ``'Prelim_N'`` is set to False and ``'N'`` is written.
    model : list of str
        Factor names forwarded to ``RunLoop``.
    plot : bool
        When true, show the scores, the fitted curve and the threshold.

    Returns
    -------
    (params, Runs, Group)
        Updated settings, the scored runs, and per-N summary statistics.
    """
    params['Prelim_N']=False
    Runs = RunLoop(Runs,model)
    Group = Runs.groupby('N').mean()
    per_size = Runs[['N','RMSE']].groupby('N')
    Group['Count'] = per_size.count()
    Group['SD'] = per_size.std()
    Group['SE'] = per_size.sem()
    Group = Group.reset_index()
    Min = Group.loc[Group['RMSE']==Group['RMSE'].min()]
    popt, pcov = curve_fit(Curve, Runs['N'].values, Runs['RMSE'].values)
    # Integer grid over the tested range (np.arange excludes the largest N).
    N = np.arange(Runs['N'].min(),Runs['N'].max())
    Fit = Curve(N,*popt)
    # Threshold: best mean error plus one standard error.
    Max = (Min['RMSE']+Min['SE']).values[0]

    below = N[Fit<Max]
    if below.size:
        params['N']=below.min()
    else:
        # Explicit check instead of a bare ``except`` so that unrelated
        # errors are no longer swallowed.
        print('Fitting failed to produce meaningful results')
        params['N'] = Group.loc[Group['RMSE']<Min['RMSE'].values[0]+Min['SE'].values[0],'N'].values[0]

    if plot:
        fig,ax = plt.subplots(figsize=(8,8))
        ax.scatter(Runs['N'],Runs['RMSE'])
        ax.plot(N,Fit,color='r')
        ax.plot(N,N*0+Max,color = 'black')
        plt.grid()
        plt.show()
    return(params,Runs,Group)

def Combos(Model,L,factor=None,BaseFactors=None):
    """List the candidate factor sets of size ``L`` drawn from ``Model``.

    Parameters
    ----------
    Model : sequence of str
        Factor names to combine.
    L : int
        Number of factors drawn from ``Model`` per candidate.
    factor : iterable of str, optional
        '+'-joined factor sets (e.g. ``'a+b'``). When given, only candidates
        containing every factor of at least one of these sets are kept, and
        duplicates are dropped.
    BaseFactors : sequence of str, optional
        Factors appended to every candidate. Defaults to none.

    Returns
    -------
    list of list of str
        Candidates in ``itertools.combinations`` order. The list is also
        printed.
    """
    base = [] if BaseFactors is None else list(BaseFactors)
    # Split the required sets once instead of once per combination.
    required = None if factor is None else [set(f.split('+')) for f in factor]
    Models = []
    for picked in combinations(Model,L):
        candidate = list(picked)+base
        if required is None:
            Models.append(candidate)
        elif candidate not in Models and any(req.issubset(candidate) for req in required):
            Models.append(candidate)

    print('Models: ',Models)
    return(Models)

def FactorTest(params,FullModel,Runs,BaseFactors = None):
    """Forward selection of input factors by replicate error.

    Level 1 scores every single factor of ``FullModel``. Each later level
    scores the combinations (via ``Combos``) that contain the best model of
    the previous level. Selection stops early when the best model of a level,
    taking its mean error plus one standard error, is no better than the best
    mean error of the previous level; the scores of that rejected level are
    then dropped from ``Scores``.

    Parameters
    ----------
    params : dict
        Run settings forwarded to ``RunReps``; ``params['K']`` is the number
        of replicates per model.
    FullModel : list of str
        All candidate factor names.
    Runs : pandas.DataFrame
        Currently unused; kept for interface compatibility.
    BaseFactors : list of str, optional
        Factors forced into every model from level 2 onwards.

    Returns
    -------
    (Best, Scores, ModelRuns)
        ``Best``: array with the '+'-joined name(s) of the last accepted best
        model. ``Scores``: one row per replicate and model. ``ModelRuns``:
        one gap-filled column per model that was run (including models of a
        rejected final level).

    Relies on the module-level ``pool``.
    """
    BaseFactors = [] if BaseFactors is None else list(BaseFactors)
    n_factors = len(FullModel)+len(BaseFactors)
    prog1 = FloatProgress(min=len(BaseFactors), max=n_factors-1,description='FactorTesting:') # instantiate the bar
    display(prog1) # display the bar
    score_frames = []
    Scores = pd.DataFrame()
    ModelRuns = pd.DataFrame()
    Start = 1
    for level in range(Start,n_factors+1):
        if level == 1:
            Models = Combos(FullModel,level)
        else:
            Models = Combos(FullModel,level,factor = Best,BaseFactors=BaseFactors)
        print('Number of Models: ',len(Models))
        prog2 = FloatProgress(min=0, max=len(Models),description='Level: '+str(level)) # instantiate the bar
        display(prog2) # display the bar
        for j,model in enumerate(Models):
            key = str(level)+'_'+str(j)
            Name = '+'.join(model)
            Results = RunReps(model,params,pool)
            d = {'Model':Name,'RMSE':Results[0],'Key':key,'R2':Results[1],'Level':level}
            score_frames.append(pd.DataFrame(index=list(range(params['K'])),data=d))
            ModelRuns['Model: '+Name] = Results[2]
            prog2.value+=1
        # DataFrame.append was removed in pandas 2.0; pd.concat works on all versions.
        Scores = pd.concat(score_frames,ignore_index=True)
        Group = Scores.groupby('Key').agg({'Model': 'first','Level': 'first','RMSE': 'mean','R2': 'mean'})
        Group['SE'] = Scores[['Key','RMSE']].groupby('Key').sem()
        Min = Group.loc[Group['Level']==level]
        Mod2Beat = Min.loc[Min['RMSE'] == Min['RMSE'].min()]
        print('Best Model @ Level ',level)
        print(Mod2Beat)
        if level>Start:
            # Pessimistic score of this level's winner: mean error + 1 SE.
            upper = (Mod2Beat['RMSE']+Mod2Beat['SE']).values[0]
            print('Should we exit??', upper,LastMin)
            if upper>LastMin:
                print('Exit Early')
                Scores = Scores.loc[Scores['Level']<level]
                break
        LastMin = Min['RMSE'].min()
        Best = Mod2Beat['Model'].values
        prog1.value+=1
    return(Best,Scores,ModelRuns)

def Best_Fill(Best,Runs,Scores,params):
    """Re-tune the network size for the best model and produce its final fill.

    The score columns of ``Runs`` are reset to zero (keeping ``'N'``),
    ``N_Nodes`` selects ``params['N']`` for the factors in ``Best[0]``, and the
    model is run once more with that size. Its replicate scores are appended
    to ``Scores`` under level 0 and key ``'0_<N>'``.

    Relies on the module-level ``pool`` and on a module-level ``ModelRuns``
    DataFrame, which receives a ``'BestModel: ...'`` column.

    Returns
    -------
    (Scores, ModelRuns)
    """
    # Fresh score table: everything zeroed except the candidate sizes.
    N = Runs['N']
    NRuns = Runs*0
    NRuns['N']=N
    model = Best[0].split('+')

    params,Runs,Group = N_Nodes(NRuns,params,model,plot=True)
    level = 0
    key = '0_'+str(params['N'])
    Results = RunReps(model,params,pool)
    MSE = Results[0]
    R2 = Results[1]
    d = {'Model':'+'.join(model),'RMSE':MSE,'Key':key,'R2':R2,'Level':level}
    score = pd.DataFrame(index=list(range(params['K'])),data=d)
    # DataFrame.append was removed in pandas 2.0; pd.concat works on all versions.
    Scores = pd.concat([Scores,score],ignore_index=True)
    ModelRuns['BestModel: '+'+'.join(model)] = Results[2]
    print(params['N'])
    return(Scores,ModelRuns)

# The "Optimum" Sized Model

In [23]:
# pool.close()
# Site = 'Illisarvik'#'FishIsland'#
# Scope selects both the candidate factor list (ModSelect) and the run
# settings (Params); the code below handles 'Test' and 'Full'.
Scope = 'Test'
# Site data directories are resolved relative to the current working directory.
cwd = os.getcwd()
def ModSelect(Scope,Site):
    """Return the candidate factors and forced base factors for a site.

    Parameters
    ----------
    Scope : str
        ``'Full'`` for the complete factor list, ``'Test'`` for a short one.
    Site : str
        ``'Illisarvik'`` or ``'FishIsland'``.

    Returns
    -------
    (Model, BaseFactors)
        Two freshly built lists of column names. ``BaseFactors`` is currently
        empty for both sites.

    Raises
    ------
    ValueError
        If ``Site`` or ``Scope`` is not one of the supported values.
    """
    if Site == 'Illisarvik':
        # The land-cover classes (Sedge, Shrub, Grass, Sparse, Out_of_Basin)
        # could be forced into every model here; that option is disabled.
        BaseFactors = []
        by_scope = {
            'Full': ['H','wind_speed','air_pressure','PPFD_Avg','AirTC_Avg','VPD',
                     'Temp','VWC','Sedge','Shrub','Grass','Sparse','Out_of_Basin'],
            'Test': ['PPFD_Avg','wind_speed','air_pressure'],
        }
    elif Site == 'FishIsland':
        BaseFactors = []
        by_scope = {
            'Full': ['H','Wind Spd','air pressure','Ta','Rn','PPFD','Rain','Water Table',
                     'Ts 2.5 cm','Ts 15 cm','VWC','Active Layer','24H Rain','Wtr Tbl Trnd'],
            'Test': ['H','Water Table','Wind Spd','Active Layer'],
        }
    else:
        raise ValueError('Unknown Site: '+repr(Site))

    if Scope not in by_scope:
        raise ValueError('Unknown Scope: '+repr(Scope))
    return(by_scope[Scope],BaseFactors)

def Params(Func,Y,MP = True):
    """Build the empty score table and the settings dictionary for a run.

    ``Func`` ('Full' or 'Test') selects the grid of candidate network sizes;
    each size is repeated K times so the table has one row per replicate.
    ``Y`` is stored as the target variable name. ``MP`` switches between
    three worker processes and a single one.

    Returns ``(Runs, params)``: a DataFrame with columns N, RMSE, R2, Model
    (scores initialised to zero) and the settings dictionary.
    """
    if Func == 'Full':
        epochs, K, splits_per_mod = 200, 30, 1
        node_grid = np.linspace(100,10,8,dtype='int32')
    elif Func == 'Test':
        epochs, K, splits_per_mod = 200, 30, 1
        node_grid = np.linspace(70,10,5,dtype='int32')

    # One row per (network size, replicate).
    Runs = pd.DataFrame(data={'N': np.repeat(node_grid,K).astype(int)})
    for column, initial in (('RMSE', 0.0), ('R2', 0.0), ('Model', 0)):
        Runs[column] = initial

    params = {
        'proc': 1 if MP == False else 3,
        'K': K,
        'epochs': epochs,
        'Y': Y,
        'splits_per_mod': splits_per_mod,
        'Save': {'Weights': False, 'Model': False},
    }
    return(Runs,params)


# Multiprocessing is only enabled for the full-scope run.
MP=False
if Scope == 'Full':
    MP = True

# for Site in ['Illisarvik','FishIsland']:
Site='Illisarvik'
FillVar = 'fco2'
#     for FillVar in ['fco2','fch4']:
print(Site,FillVar)
FullModel,BaseFactors = ModSelect(Scope,Site)
Runs,params = Params(Scope,FillVar,MP)
params['Dpath'] = cwd+'/'+Site+'/'
params['Prelim_N']=True

# Size the pool from the settings so both always agree.
if __name__=='__main__' and MP==True:
    pool = Pool(processes=params['proc'],maxtasksperchild=75)
else:
    pool=None

try:
    Best,Scores,ModelRuns = FactorTest(params,FullModel,Runs,BaseFactors)
    # print(Best,Scores)
    # Scores,ModelRuns = Best_Fill(Best,Runs,Scores,params)
    #         Scores.to_csv(params['Dpath']+FillVar+'/GapFillingSummary.csv')
    #         ModelRuns.to_csv(params['Dpath']+FillVar+'/GapFilled.csv')
finally:
    # Always release the worker processes, even if the selection fails.
    if pool is not None:
        pool.close()
        pool.join()

Illisarvik fco2


A Jupyter Widget

Models:  [['PPFD_Avg'], ['wind_speed'], ['air_pressure']]
Number of Models:  3


A Jupyter Widget

Best Model @ Level  1
        Model  Level      RMSE        R2        SE
Key                                               
1_0  PPFD_Avg      1  0.140719  0.849989  0.000567
Models:  [['PPFD_Avg', 'wind_speed'], ['PPFD_Avg', 'air_pressure']]
Number of Models:  2


A Jupyter Widget

Best Model @ Level  2
                   Model  Level      RMSE        R2        SE
Key                                                          
2_0  PPFD_Avg+wind_speed      2  0.131356  0.871349  0.000634
Should we exit?? 0.13199035139976617 0.14071856261452484
Models:  [['PPFD_Avg', 'wind_speed', 'air_pressure']]
Number of Models:  1


A Jupyter Widget

Best Model @ Level  3
                                Model  Level      RMSE        R2        SE
Key                                                                       
3_0  PPFD_Avg+wind_speed+air_pressure      3  0.130259  0.873592  0.000811
Should we exit?? 0.13107070699061119 0.13135601700261232


In [22]:
# Per-model summary of the replicate scores. The numeric columns are selected
# explicitly: Scores also holds the string column 'Key', which older pandas
# dropped silently from mean() but pandas 2.x rejects with a TypeError.
Grp = Scores.groupby('Model')[['RMSE','R2','Level']].mean()
# Standard error of the error metric across replicates of each model.
Grp['SE'] = Scores.groupby('Model')['RMSE'].sem()
# Grp['SE'] = Scores[['Key','RMSE']].groupby('Key').sem()
print(Grp)
# plt.bar(Grp.index,Grp['RMSE'],yerr=Grp['SE'])

                                      RMSE        R2  Level        SE
Model                                                                
PPFD_Avg                          0.139139  0.852660      1  0.000998
PPFD_Avg+air_pressure             0.135732  0.857485      2  0.003737
PPFD_Avg+wind_speed               0.131556  0.871120      2  0.001601
PPFD_Avg+wind_speed+air_pressure  0.128238  0.876745      3  0.000784
air_pressure                      0.422236 -0.080158      1  0.002737
wind_speed                        0.422214 -0.090601      1  0.000977
