In [41]:
import numpy as np
import configparser
import pandas as pd
import datetime

In [42]:
#Read Config Text File
configParser = configparser.RawConfigParser()
configPath = '../config/config.txt'
configParser.read(configPath)

paths = dict(configParser.items('FILEPATHS'))
fieldMappings = dict(configParser.items('MAPPINGS'))
defaults = dict(configParser.items('default'))
flows = dict(configParser.items('FLOW'))

inputPath = paths['inputpath']
outputPath = paths['outputpath']
deviceName = defaults['storagetype']
frequency = defaults['frequency']
forecasthorizon = defaults['forecasthorizon']
forecasthorizon = int(forecasthorizon)

accountname = defaults['accountname']

In [43]:
components = ['volume']
component = components[0]

In [44]:
savepath = f"../data/processedOutputs/{accountname}_{component}_processed.csv"
data = pd.read_csv(savepath,low_memory=False)

In [45]:
uniqcomponents = data[['system','pool','volume']].drop_duplicates()
uniqcomponents['compid'] = np.arange(len(uniqcomponents))

In [46]:
#join compid back to original dataframe and drop [system-pool-volume]
data = data.merge(uniqcomponents,on=['system','pool','volume'])
data = data.drop(['system','pool','volume'],axis=1)

In [47]:
def findzerocapacity(df):
    tmpdf = df.groupby(['compid'])['volumeutilization'].mean().reset_index()
    tmpdf = tmpdf[tmpdf['volumeutilization'] > 0]
    df = df[df['compid'].isin(tmpdf['compid'])]
    return df

In [48]:
data = findzerocapacity(data)

In [49]:
data['date'] = data['date'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d'))
data.set_index('date', inplace = True)

In [50]:
data = data[['compid','volumeutilization']]

In [51]:
sequence_in = 14
sequence_out = 7

def gen_sequence(id_df,seq_in,seq_out,seq_cols):
    data_matrix = id_df[seq_cols].values
    num_elements = data_matrix.shape[0]
    for start, stop in zip(range(0, num_elements-seq_in-seq_out), range(seq_in, num_elements-seq_out)):
        yield data_matrix[start:stop, :]
        
def gen_labels(id_df,seq_in,seq_out,label):
    data_matrix = id_df[label].values
    num_elements = data_matrix.shape[0]
    for start,stop in zip(range(seq_in,num_elements-seq_out),range(seq_in+seq_out,num_elements)):
        yield data_matrix[start:stop,:]

In [52]:
#find max week
maxdate = data.index.max()
cutoff = maxdate - datetime.timedelta(days=30)

In [53]:
data = data.fillna(0)

In [54]:
X_train,X_test = [],[]

for comps in data['compid'].unique():
    for sequence in gen_sequence(data[(data['compid'] == comps) & (data.index <= cutoff)],sequence_in,sequence_out,['volumeutilization']):
        X_train.append(sequence)
    for sequence in gen_sequence(data[(data['compid'] == comps)& (data.index > cutoff)],sequence_in,sequence_out,['volumeutilization']):
        X_test.append(sequence)


In [55]:
y_train,y_test = [],[]

for comps in data['compid'].unique():
    for sequence in gen_labels(data[(data['compid'] == comps) & (data.index <= cutoff)],sequence_in,sequence_out,['volumeutilization']):
        y_train.append(sequence)
    for sequence in gen_labels(data[(data['compid'] == comps)& (data.index > cutoff)],sequence_in,sequence_out,['volumeutilization']):
        y_test.append(sequence)

In [56]:
X_train,X_test = np.asarray(X_train),np.asarray(X_test)
y_train,y_test = np.asarray(y_train),np.asarray(y_test)

In [57]:
X_test.shape

(2241, 14, 1)

In [58]:
y_test.shape

(2241, 7, 1)

In [59]:
# use entity embeddings
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [68]:
def build_model(train_x, train_y,h1=100,h2=50):
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    # define model
    model = Sequential()
    model.add(layers.LSTM(h1, activation='relu', input_shape=(n_timesteps, n_features)))
    model.add(layers.Dense(h2, activation='relu'))
    model.add(layers.Dense(n_outputs))
    model.compile(loss='mse', optimizer='adam')
    return model

In [69]:
simpleseq = build_model(X_train,y_train)

In [70]:
history = simpleseq.fit(X_train,y_train,
                   epochs=20,
                   batch_size=128,
                   validation_split = 0.2,
                   verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [74]:
testpreds = simpleseq.predict(X_test)

In [79]:
from sklearn.metrics import mean_squared_error
ytesteval = y_test.reshape(y_test.shape[0],y_test.shape[1]*y_test.shape[2])
testmse = mean_squared_error(testpreds,ytesteval)
testmse

0.00421984653249681

In [85]:
#get last value from input data to predict future sequences
last_ts = maxdate - datetime.timedelta(days=sequence_in)
last_ts_df = data[data.index > last_ts]
last_ts_df.shape

(3486, 2)

In [90]:
fut_preds = []

for comps in last_ts_df['compid'].unique():
    tmpdf = last_ts_df[(last_ts_df['compid'] == comps)]
    tmpdf = tmpdf['volumeutilization'].values
    tmpdf = tmpdf.reshape(sequence_in,1)
    fut_preds.append(tmpdf)

In [91]:
fut_preds = np.asarray(fut_preds)
fut_preds.shape

(249, 14, 1)

In [94]:
futpreds = simpleseq.predict(fut_preds)

In [111]:
#create future dates
futdates = []
from pandas.tseries.offsets import DateOffset
for i in range(1,sequence_out+1):
    ts = maxdate + DateOffset(days=i)
    futdates.append(ts)

    
#

[Timestamp('2020-10-06 00:00:00'),
 Timestamp('2020-10-07 00:00:00'),
 Timestamp('2020-10-08 00:00:00'),
 Timestamp('2020-10-09 00:00:00'),
 Timestamp('2020-10-10 00:00:00'),
 Timestamp('2020-10-11 00:00:00'),
 Timestamp('2020-10-12 00:00:00')]

In [132]:
forecast_df = pd.DataFrame()
for each in range(len(futpreds)):
    xdf = futpreds[each]
    xdf[xdf < 0] = 0
    xdf[xdf > 1] = 1
    xdf = pd.DataFrame(futpreds[each],index=futdates)
    xdf['compid'] = each
    forecast_df = forecast_df.append(xdf)
    

In [133]:
forecast_df.index.name = 'date'
forecast_df.columns = ['volumeutilization','compid']
forecast_df = forecast_df[['compid','volumeutilization']]
forecast_df

Unnamed: 0_level_0,compid,volumeutilization
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-10-06,0,0.137103
2020-10-07,0,0.097359
2020-10-08,0,0.103423
2020-10-09,0,0.093287
2020-10-10,0,0.070450
...,...,...
2020-10-08,248,0.312970
2020-10-09,248,0.343661
2020-10-10,248,0.362627
2020-10-11,248,0.339272


In [134]:
forecast_df = forecast_df.reset_index().merge(uniqcomponents,on=['compid']).drop(['compid'],axis=1)
forecast_df

Unnamed: 0,date,volumeutilization,system,pool,volume
0,2020-10-06,0.137103,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15
1,2020-10-07,0.097359,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15
2,2020-10-08,0.103423,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15
3,2020-10-09,0.093287,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15
4,2020-10-10,0.070450,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15
...,...,...,...,...,...
1738,2020-10-08,0.312970,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d4
1739,2020-10-09,0.343661,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d4
1740,2020-10-10,0.362627,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d4
1741,2020-10-11,0.339272,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d4


In [135]:
data.head()

Unnamed: 0_level_0,compid,volumeutilization
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-20,0,0.018312
2020-04-21,0,0.031167
2020-04-22,0,0.036475
2020-04-23,0,0.034542
2020-04-24,0,0.037558
