In [16]:
import pickle
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import datetime
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import seaborn as sns
import scipy.stats as stats
from gretel_synthetics.timeseries_dgan.dgan import DGAN
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from timeVAE.vae_dense_model import VariationalAutoencoderDense as VAE_Dense
from timeVAE.vae_conv_model import VariationalAutoencoderConv as VAE_Conv
from timeVAE.vae_conv_I_model import VariationalAutoencoderConvInterpretable as TimeVAE
from timeVAE import utils
from sklearn.preprocessing import MinMaxScaler

In [99]:
MODEL = 1

In [100]:
index = [0,3] if MODEL==0 else [1,2]
model_name = 'th_v_air' if MODEL==0 else 'el_v_sky'

base_data_train, base_data_test = np.load('../../data/training_data/training_data_1month.npy', allow_pickle=True)

base_data_train, base_data_test = base_data_train[:,:,index], base_data_test[:,:,index]
print(base_data_train.shape)

(108, 730, 2)


In [101]:
from sklearn.preprocessing import MinMaxScaler

scalers = {var_name: MinMaxScaler(feature_range=(-1,1)) for var_name in ['G.air.T', 'G.E_th_I']}

temp_var, energy_var = base_data_train[:,:,0], base_data_train[:,:,1]
temp_var_test, energy_var_test = base_data_test[:,:,0], base_data_test[:,:,1]

temp_var, temp_var_test = scalers['G.air.T'].fit_transform(temp_var), scalers['G.air.T'].fit_transform(temp_var_test)
energy_var, energy_var_test = scalers['G.E_th_I'].fit_transform(energy_var), scalers['G.E_th_I'].fit_transform(energy_var_test)

base_data_train_scaled, base_data_test_scaled = np.stack((temp_var, energy_var), axis=-1), np.stack((temp_var_test, energy_var_test), axis=-1)
print(base_data_train_scaled.shape, base_data_test_scaled.shape)

(108, 730, 2) (12, 730, 2)


Load in tVAE models

In [102]:
batches=[4,8,16,20,24,32]
latent_dims=[3,5,10,15,20,25,30,50]

vae_list = []
for b in batches:
    latent_dim_list = []
    for l in latent_dims:
        samples = TimeVAE.load('../../data/models/model_data/',f'tVAE_{model_name}_b{b}l{l}')
        sample = samples.get_prior_samples(num_samples=1000)
        latent_dim_list.append(sample)
    vae_list.append(latent_dim_list)



In [13]:
print(len(vae_list), len(vae_list[0]), vae_list[0][0].shape)

6 8 (1000, 730, 2)


Load in DGAN models

In [5]:
batches=[2,4,6,8,10,12,16,20,24,32]
epochs=[100,500,1000]


gan_list = []
for b in batches:
    epoch_list = []
    for e in epochs:
        model = DGAN.load(f'../../data/models/model_data/DGAN_{model_name}_b{b}_e{e}.zip')
        attributes, samples = model.generate_numpy(1000)
        epoch_list.append(samples)
    gan_list.append(epoch_list)

In [9]:
print(len(gan_list), len(gan_list[0]), gan_list[0][0].shape)

10 3 (108, 730, 2)


In [32]:
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster 

model_params = {'lag':24, 'max_depth':5, 'n_est':50} if MODEL==0 else {'lag':2, 'max_depth':3, 'n_est':10}

def find_best_model_regression(synthetic_data):
    
    forecaster = ForecasterAutoreg(
                     regressor = RandomForestRegressor(max_depth=model_params['max_depth'], n_estimators =model_params['n_est']),
                     lags      = model_params['lag']
                 )

    forecaster.fit(y=synthetic_data['G.E_th_I'], exog=synthetic_data['G.air.T'])

    y_test = pd.Series(base_data_test_scaled[:,:,1].reshape(-1))
    exog_var = pd.Series(base_data_test_scaled[:,:,0].reshape(-1))
    exog_var.index = exog_var.index + synthetic_data['G.air.T'].index.max() + 1
    
    predictions = forecaster.predict(exog=exog_var, steps=len(y_test))
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2  = r2_score(y_test, predictions)

    return {'mse':mse, 'mae':mae, 'r2':r2}

In [8]:
vae_results = {}
for i,b in enumerate(vae_list):
    for j,e in enumerate(b):
        result = find_best_model_regression(pd.DataFrame(e[0:108].reshape(-1,2), columns=['G.air.T', 'G.E_th_I']))
        vae_results[f'b{i} l{j}'] = result
        print(result, f'{i} {j}')

{'mse': 0.4402006607958828, 'mae': 0.4617763260035185, 'r2': -0.06843358201723926} 0 0
{'mse': 0.43603810649143954, 'mae': 0.46379374496036957, 'r2': -0.05833043315372621} 0 1
{'mse': 0.5109269668526675, 'mae': 0.44338293096989195, 'r2': -0.24009702383596343} 0 2
{'mse': 0.4579303688417816, 'mae': 0.45365813160778806, 'r2': -0.11146626497902878} 0 3
{'mse': 0.4167695883005231, 'mae': 0.4842208036306238, 'r2': -0.0115628252322304} 0 4
{'mse': 0.9470962246707268, 'mae': 0.9144815460631127, 'r2': -1.298745781095413} 0 5
{'mse': 0.4514828963894058, 'mae': 0.46288605923007486, 'r2': -0.09581727418743258} 0 6
{'mse': 0.41526566072461124, 'mae': 0.49035215916288616, 'r2': -0.007912565543567451} 0 7
{'mse': 0.4561382523689026, 'mae': 0.4591662106914572, 'r2': -0.10711652725022036} 1 0
{'mse': 0.560651150425519, 'mae': 0.44452923171665365, 'r2': -0.3607851379145999} 1 1
{'mse': 0.501160093702824, 'mae': 0.4448260798012389, 'r2': -0.2163913455079749} 1 2
{'mse': 0.3945622120221025, 'mae': 0.5113

In [25]:
sorted_models_mae = sorted(vae_results.items(), key=lambda x: x[1]['mae'])
sorted_models_mse = sorted(vae_results.items(), key=lambda x: x[1]['mse'])
sorted_models_r2 = sorted(vae_results.items(), key=lambda x: x[1]['r2'], reverse=True)

print(sorted_models_mae[0:3], '\n', sorted_models_mse[0:3], '\n', sorted_models_r2[0:3])

KeyError: 'mae'

In [15]:
results_df = pd.DataFrame.from_dict(vae_results, orient='index')
results_df.to_csv(f'../../data/models/model_history/VAE_{model_name}_model_results.csv')
print(results_df)

NameError: name 'vae_results' is not defined

In [None]:
gan_results = {}
for i,b in enumerate(gan_list):
    for j,e in enumerate(b):
        result = find_best_model_regression(pd.DataFrame(e[0:108].reshape(-1,2), columns=['G.air.T', 'G.E_th_I']))
        gan_results[f'b{i} l{j}'] = result
        print(result, f'{i} {j}')

In [None]:
sorted_models_mae = sorted(gan_results.items(), key=lambda x: x[1]['mae'])
sorted_models_mse = sorted(gan_results.items(), key=lambda x: x[1]['mse'])
sorted_models_r2 = sorted(gan_results.items(), key=lambda x: x[1]['r2'], reverse=True)

print(sorted_models_mae[0:3], '\n', sorted_models_mse[0:3], '\n', sorted_models_r2[0:3])

In [None]:
results_df = pd.DataFrame.from_dict(gan_results, orient='index')
results_df.to_csv(f'../../data/models/model_history/DGAN_{model_name}_model_results.csv')
print(results_df)

In [96]:
vae_results = pd.read_csv(f'../../data/models/model_history/VAE_{model_name}_model_results.csv', index_col=0)
gan_results = pd.read_csv(f'../../data/models/model_history/DGAN_{model_name}_model_results.csv', index_col=0)

#print(vae_results, '\n', gan_results)

In [97]:
best_vaes = vae_results['mse'].nsmallest(5)
best_gans = gan_results['mse'].nsmallest(5)

print(best_vaes)
print(best_gans)

b5 l1    0.530135
b4 l5    0.532272
b5 l4    0.536739
b0 l5    0.538737
b5 l0    0.541052
Name: mse, dtype: float64
b4 e0    0.537730
b9 e2    0.579713
b5 e0    0.582607
b0 e0    0.586430
b8 e2    0.587330
Name: mse, dtype: float64


In [105]:
import re
batches=[4,8,16,20,24,32]
latent_dims=[3,5,10,15,20,25,30,50]

for i, scores in enumerate(best_vaes):
    print(scores, best_vaes.index[i])
    
    b, l = map(int, re.findall(r'\d+', best_vaes.index[i]))
    print(batches[b], latent_dims[l])
    
    vae = TimeVAE.load('../../data/models/model_data/',f'tVAE_{model_name}_b{batches[b]}l{latent_dims[l]}')
    samples = vae.get_prior_samples(num_samples=1000)
    
    np.save(f'../../data/vae_synthetic_data/{model_name}_b{batches[b]}l{latent_dims[l]}_generated_samples.npy', samples)

    temp, energy = scalers['G.air.T'].inverse_transform(samples[:,:,0]), scalers['G.E_th_I'].inverse_transform(samples[:,:,1])
    np.save(f'../../data/vae_synthetic_data/{model_name}_b{batches[b]}l{latent_dims[l]}_rescaled_samples.npy', np.stack((temp, energy),axis=-1))

0.5301347479816934 b5 l1
32 5
0.5322723807857386 b4 l5
24 25
0.5367392105418513 b5 l4
32 20
0.5387367736137983 b0 l5
4 25
0.5410523136136406 b5 l0
32 3


In [74]:
batches=[2,4,6,8,10,12,16,20,24,32]
epochs=[100,500,1000]

for i, scores in enumerate(best_gans):
    print(scores, best_gans.index[i])
    
    b, e = map(int, re.findall(r'\d+', best_gans.index[i]))
    print(batches[b], epochs[e])
    
    dgan = DGAN.load(f'../../data/models/model_data/DGAN_{model_name}_b{batches[b]}_e{epochs[e]}.zip')
    attributes, samples = dgan.generate_numpy(1000)
    
    np.save(f'../../data/gan_synthetic_data/{model_name}_b{batches[b]}e{epochs[e]}_generated_samples.npy', samples)

    temp, energy = scalers['G.air.T'].inverse_transform(samples[:,:,0]), scalers['G.E_th_I'].inverse_transform(samples[:,:,1])
    np.save(f'../../data/gan_synthetic_data/{model_name}_b{batches[b]}e{epochs[e]}_rescaled_samples.npy', np.stack((temp, energy),axis=-1))

0.4127741324562375 b9 e2
32 1000
0.4145561139925519 b7 e0
20 100
0.4220499112934464 b9 e1
32 500
0.4250164590716146 b4 e2
10 1000
0.4274020755166289 b4 e0
10 100


In [91]:
unscaled = (np.load('../../data/models/model_data/th_v_air_l15b8_samples.npy')[:1000,:,:])
rescaled = (np.load('../../data/models/model_data/th_v_air_l15b8_scaled_samples.npy')[:1000,:,:])

print(unscaled.shape, rescaled.shape)
np.save('../../data/vae_synthetic_data/th_v_air_b8l15_generated_samples.npy', unscaled)
np.save('../../data/vae_synthetic_data/th_v_air_b8l15_rescaled_samples.npy', rescaled)

(1000, 730, 2) (1000, 730, 2)
