In [68]:
import pandas as pd
import numpy as np
import datetime
import configparser
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.models import Model, save_model,load_model
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

## Functions
def findzerovols(df):
    df['overallio'] = (df['overallreadio'] + df['overallwriteio'])/2
    tmpdf = df.groupby(['compid'])['overallio'].mean().reset_index()
    tmpdf = tmpdf[tmpdf['overallio'] > 0]
    df = df[df['compid'].isin(tmpdf['compid'])]
    df = df.drop(['overallio'],axis=1)
    return df

#add date features as independent variables
def adddatefeatures(df):
    df.loc[:,'dayofweek'] = df.index.dayofweek
    df.loc[:,'month'] = df.index.month
    df.loc[:,'isweekend'] = df['dayofweek'].apply(lambda x: 1 if x > 4 else 0)
    return df


def addtimedummies(data):
    cols = ['month','dayofweek']
    for col in cols:
        if col == 'month':
            max_val = 12
            N = max_val -1 #Since month index starts from 1 and dow and hour from 0
        elif col == 'dayofweek':
            max_val = 6
            N = max_val
        else:
            max_val = 23
            N = max_val
        df = list()
        series = data[col]
        for each in series:
            vals = list(np.zeros(N+1,dtype='int'))
            vals[each-1] = 1
            df.append(vals)
        names = [str(col)+'_' + str(x) for x in range(1,N+2)]
        df = pd.DataFrame(df,columns=names)
        df.set_index(data.index,inplace=True)
        data = pd.concat([data,df],axis=1)
    return data

def q_loss(q,y,f):
    e = (y-f)
    return K.mean(K.maximum(q*e, (q-1)*e), axis=-1)

#build nn using x_train,y_train

def perfmodel(X,y):
    inputs = layers.Input(shape=(X.shape[1],))
    x = layers.Dense(256, activation='relu')(inputs)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(64,activation='relu')(x)
    x = layers.Dropout(0.1)(x)
    y = layers.Dense(y.shape[1],activation='linear')(x)
    out10 = layers.Dense(y.shape[1])(x)
    out50 = layers.Dense(y.shape[1])(x)
    out90 = layers.Dense(y.shape[1])(x)
    model = Model(inputs=inputs,outputs=[out10,out50,out90])
    return model

def getcomponents(df,id):
    tempdf = df.loc[id,:]
    comp_vals = [x for x in tempdf.columns if 'comp_' in x]
    dummies = tempdf[comp_vals]
    s2 = dummies.idxmax(axis=1)
    s2 = pd.DataFrame(s2,columns=['compid'])
    s2['compid'] = s2['compid'].apply(lambda x: x.replace('comp_',''))
    return s2


#Read Config Text File
configParser = configparser.RawConfigParser()
configPath = '../config/config.txt'
configParser.read(configPath)

paths = dict(configParser.items('FILEPATHS'))
fieldMappings = dict(configParser.items('MAPPINGS'))
defaults = dict(configParser.items('default'))
flows = dict(configParser.items('FLOW'))

inputPath = paths['inputpath']
outputPath = paths['outputpath']
deviceName = defaults['storagetype']
frequency = defaults['frequency']
forecasthorizon = defaults['forecasthorizon']
forecasthorizon = int(forecasthorizon)

accountname = defaults['accountname']

model_savepath = "../outputs/perfmodel.h5"
rt_model_savepath = "../outputs/rtmodel.h5"

components = ['volume']
component = components[0]

savepath = f"../data/processedOutputs/{accountname}_{component}_processed.csv"
data = pd.read_csv(savepath,low_memory=False)
uniqcomponents = data[['system','pool','volume']].drop_duplicates()
uniqcomponents['compid'] = np.arange(len(uniqcomponents))
#join compid back to original dataframe and drop [system-pool-volume]
data = data.merge(uniqcomponents,on=['system','pool','volume'])
data = data.drop(['system','pool','volume'],axis=1)
#find volumes with zero activity and remove from analysis
data = findzerovols(data)
#change date formats and set data as index
data['date'] = data['date'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d'))
data.set_index('date', inplace = True)
traindata = data.copy()
traindata = adddatefeatures(traindata)
traindata = traindata.dropna()

features = ['compid','dayofweek','month','isweekend']
targets = ['overallreadio','overallwriteio','readdatarate','writedatarate']
rt_features = features + targets
rt_targets = ['readresponsetime','writeresponsetime']

# xdata = traindata[features]
# ydata = traindata[targets]
# rtxdata = traindata[rt_features]
# rtydata = traindata[rt_targets]

ss_rt_x = MinMaxScaler()

ss_y = StandardScaler()
ss_rt_y = StandardScaler()

traindata.loc[:,"compid"] = traindata.loc[:,"compid"].astype(str)
# xdata.loc[:,"compid"] = xdata.loc[:,"compid"].astype(str)
# rtxdata.loc[:,"compid"] = rtxdata.loc[:,"compid"].astype(str)

#Dummy creation for each unqiue component
# compdf = pd.get_dummies(xdata[['compid']],prefix='comp')
# xdata = pd.concat([xdata,compdf],axis=1)
# rtxdata = pd.concat([rtxdata,compdf],axis=1)
# traincomps = xdata['compid'].unique()
# xdata = xdata.drop(['compid'],axis=1)
# rtxdata = rtxdata.drop(['compid'],axis=1)

compdf = pd.get_dummies(traindata[['compid']],prefix='comp')
traindata = pd.concat([traindata,compdf],axis=1)
traincomps = traindata['compid'].unique()
traindata = traindata.drop(['compid'],axis=1)
traindata = addtimedummies(traindata)
traindata = traindata.drop(['dayofweek','month'],axis=1)

traindata = traindata.drop(['volumeutilization'],axis=1)

# xdata = addtimedummies(xdata)
# rtxdata = addtimedummies(rtxdata)
# xdata = xdata.drop(['dayofweek','month'],axis=1)
# rtxdata = rtxdata.drop(['dayofweek','month'],axis=1)

losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]




In [69]:
#Start training here
kf = KFold(n_splits=5,shuffle=True,random_state=5)
perfmse = []
overallrtmse = []
finalpreds = pd.DataFrame()
rtpreds = pd.DataFrame()
n_fold = 1
traincols = [x for x in traindata.columns if x not in targets + rt_targets]
rtcols = [x for x in traindata.columns if x not in rt_targets]
trainfeatures = targets
rttargets = rt_targets
traindata = traindata.reset_index()
rthighmse = np.Inf
perfhighmse = np.Inf

for train_index,test_index in kf.split(traindata):
    print(f"##### TRAINING ON BATCH {n_fold} #####")
    x_train,x_val = traindata.loc[train_index,traincols].values,traindata.loc[test_index,traincols].values
    rt_x_train,rt_x_val = traindata.loc[train_index,rtcols].values,traindata.loc[test_index,rtcols].values
    val_index = traindata.loc[test_index,'date']
    y_train,y_val = traindata.loc[train_index,trainfeatures].values,traindata.loc[test_index,trainfeatures].values
    rt_y_train,rt_y_val = traindata.loc[train_index,rttargets].values,traindata.loc[test_index,rttargets].values
    rt_x_train = ss_rt_x.fit_transform(rt_x_train)
    rt_x_val = ss_rt_x.transform(rt_x_val)
    y_train = ss_y.fit_transform(y_train)
    rt_y_train = ss_rt_y.fit_transform(rt_y_train)
    y_val = ss_y.transform(y_val)
    rt_y_val = ss_rt_y.transform(rt_y_val)
    
    rtmodel = perfmodel(rt_x_train,rt_y_train)
    rtmodel.compile(loss=losses,optimizer='adam',loss_weights=[0.3,0.3,0.3])
    rtmodel.fit(rt_x_train,[rt_y_train,rt_y_train,rt_y_train],epochs=10,batch_size=128,verbose=0)
    rtpreds = rtmodel.predict(rt_x_val)
#     rtmedianPreds = ss_rt_y.inverse_transform(rtpreds)[1]
#     rtmedianPreds = pd.DataFrame(rtmedianPreds,index=val_index,columns=rt_targets)
#     comps = getcomponents(traindata,test_index)
#     comps.set_index(rtmedianPreds.index,inplace=True)
#     rtmedianPreds = pd.concat([rtmedianPreds,comps],axis=1)
#     finalrtpreds = finalrtpreds.append(rtmedianPreds)
    rtmse = mean_squared_error(rtpreds[1],rt_y_val)
    if rtmse < rthighmse:
        highmse = rtmse
        save_model(rtmodel,rt_model_savepath)
    overallrtmse.append(rtmse)
    
    simplenn = perfmodel(x_train,y_train)
    simplenn.compile(loss=losses, optimizer='adam', loss_weights = [0.3,0.3,0.3])
    simplenn.fit(x_train, [y_train,y_train,y_train], epochs=10, batch_size=128, verbose=0)
    preds = simplenn.predict(x_val)
    medianPreds = ss_y.inverse_transform(preds)[1]
    medianPreds = pd.DataFrame(medianPreds,index=val_index,columns=targets)
    comps = getcomponents(traindata,test_index)
    comps.set_index(medianPreds.index,inplace=True)
    medianPreds = pd.concat([medianPreds,comps],axis=1)
    finalpreds = finalpreds.append(medianPreds)
    mse = mean_squared_error(preds[1],y_val)
    if mse < perfhighmse:
        highmse = mse
        save_model(simplenn,model_savepath)
    perfmse.append(mse)
    n_fold += 1





##### TRAINING ON BATCH 1 #####
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to identify source code of lambda function <function <lambda> at 0x000002690B6783A0>. It was defined in this code:
losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]

This code must contain a single distinguishable lambda. To avoid this problem, define each lambda in a separate expression.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to identify source code of lambda function <function <lambda> at 0x000002690B6783A0>. It was defined in this code:
losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]

This code must contain a single distinguishable lambda. To avoid this 

In [70]:
finalpreds.loc[:,'compid'] = finalpreds.loc[:,'compid'].astype('int')
finalpreds = finalpreds.sort_values(['compid','date'],ascending=True)

In [71]:
finalpreds

Unnamed: 0_level_0,overallreadio,overallwriteio,readdatarate,writedatarate,compid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-20,19.073971,44.109894,0.516337,0.393048,0
2020-04-21,19.888210,42.391369,0.517538,0.373375,0
2020-04-22,17.674850,48.515800,0.397969,0.425908,0
2020-04-23,19.499998,41.656425,0.381544,0.385816,0
2020-04-24,21.189205,43.637123,0.487861,0.384859,0
...,...,...,...,...,...
2020-10-01,23.463127,44.581451,0.619000,0.612868,250
2020-10-02,29.784069,50.361458,0.745858,0.709597,250
2020-10-03,45.130028,39.641624,0.914884,0.624187,250
2020-10-04,201.909821,45.321743,1.471455,0.706312,250


In [72]:
#apply rtmodel on finalpreds
rtinputs = finalpreds.copy()
rtinputs = adddatefeatures(rtinputs)
rtinputs.loc[:,'compid'] = rtinputs.loc[:,'compid'].astype(str)
rtcomps = pd.get_dummies(rtinputs[['compid']],prefix='comp')
rtinputs = pd.concat([rtinputs,rtcomps],axis=1)
rtinputs = rtinputs.drop(['compid'],axis=1)
rtinputs = addtimedummies(rtinputs)
rtinputs = rtinputs.drop(['dayofweek','month'],axis=1)
del rtcomps

In [73]:
rtinputs_x = ss_rt_x.transform(rtinputs.loc[:,rtcols])
rtfinalpreds = rtmodel.predict(rtinputs_x)[1]
rtfinalpreds = ss_rt_y.inverse_transform(rtfinalpreds)
rtfinalpreds = pd.DataFrame(rtfinalpreds,columns=rt_targets,index=rtinputs.index)
rtfinalpreds['compid'] = finalpreds['compid']
rtfinalpreds

Unnamed: 0_level_0,readresponsetime,writeresponsetime,compid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-04-20,1.006856,0.313238,0
2020-04-21,1.108838,0.319549,0
2020-04-22,1.154171,0.449505,0
2020-04-23,0.990301,0.283002,0
2020-04-24,1.061763,0.335481,0
...,...,...,...
2020-10-01,14.679809,3.778234,250
2020-10-02,13.880250,4.030166,250
2020-10-03,14.340101,4.234398,250
2020-10-04,9.568090,2.877988,250


In [74]:
rtfinalpreds.loc[:,'compid'] = rtfinalpreds.loc[:,'compid'].astype('int')
rtfinalpreds = rtfinalpreds.sort_values(['compid','date'],ascending=True)

In [75]:
finalpreds = finalpreds.merge(rtfinalpreds,on=['date','compid'])

In [76]:
finalpreds

Unnamed: 0_level_0,overallreadio,overallwriteio,readdatarate,writedatarate,compid,readresponsetime,writeresponsetime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-20,19.073971,44.109894,0.516337,0.393048,0,1.006856,0.313238
2020-04-21,19.888210,42.391369,0.517538,0.373375,0,1.108838,0.319549
2020-04-22,17.674850,48.515800,0.397969,0.425908,0,1.154171,0.449505
2020-04-23,19.499998,41.656425,0.381544,0.385816,0,0.990301,0.283002
2020-04-24,21.189205,43.637123,0.487861,0.384859,0,1.061763,0.335481
...,...,...,...,...,...,...,...
2020-10-01,23.463127,44.581451,0.619000,0.612868,250,14.679809,3.778234
2020-10-02,29.784069,50.361458,0.745858,0.709597,250,13.880250,4.030166
2020-10-03,45.130028,39.641624,0.914884,0.624187,250,14.340101,4.234398
2020-10-04,201.909821,45.321743,1.471455,0.706312,250,9.568090,2.877988


In [77]:
predcols = ['compid','overallreadio','overallwriteio','readdatarate','writedatarate','readresponsetime','writeresponsetime']
finalpreds = finalpreds[predcols]
finalpreds

Unnamed: 0_level_0,compid,overallreadio,overallwriteio,readdatarate,writedatarate,readresponsetime,writeresponsetime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-20,0,19.073971,44.109894,0.516337,0.393048,1.006856,0.313238
2020-04-21,0,19.888210,42.391369,0.517538,0.373375,1.108838,0.319549
2020-04-22,0,17.674850,48.515800,0.397969,0.425908,1.154171,0.449505
2020-04-23,0,19.499998,41.656425,0.381544,0.385816,0.990301,0.283002
2020-04-24,0,21.189205,43.637123,0.487861,0.384859,1.061763,0.335481
...,...,...,...,...,...,...,...
2020-10-01,250,23.463127,44.581451,0.619000,0.612868,14.679809,3.778234
2020-10-02,250,29.784069,50.361458,0.745858,0.709597,13.880250,4.030166
2020-10-03,250,45.130028,39.641624,0.914884,0.624187,14.340101,4.234398
2020-10-04,250,201.909821,45.321743,1.471455,0.706312,9.568090,2.877988


In [78]:
predcols = [str(x)+'_preds' for x in predcols if x!='compid']
predcols = ['compid'] + predcols
finalpreds.columns = predcols
finalpreds

Unnamed: 0_level_0,compid,overallreadio_preds,overallwriteio_preds,readdatarate_preds,writedatarate_preds,readresponsetime_preds,writeresponsetime_preds
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-20,0,19.073971,44.109894,0.516337,0.393048,1.006856,0.313238
2020-04-21,0,19.888210,42.391369,0.517538,0.373375,1.108838,0.319549
2020-04-22,0,17.674850,48.515800,0.397969,0.425908,1.154171,0.449505
2020-04-23,0,19.499998,41.656425,0.381544,0.385816,0.990301,0.283002
2020-04-24,0,21.189205,43.637123,0.487861,0.384859,1.061763,0.335481
...,...,...,...,...,...,...,...
2020-10-01,250,23.463127,44.581451,0.619000,0.612868,14.679809,3.778234
2020-10-02,250,29.784069,50.361458,0.745858,0.709597,13.880250,4.030166
2020-10-03,250,45.130028,39.641624,0.914884,0.624187,14.340101,4.234398
2020-10-04,250,201.909821,45.321743,1.471455,0.706312,9.568090,2.877988


In [67]:
data

Unnamed: 0_level_0,overallreadio,overallwriteio,readdatarate,writedatarate,readresponsetime,writeresponsetime,volumeutilization,compid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-04-20,9.007279,46.067494,0.318647,0.325217,0.785313,0.261293,0.018312,0
2020-04-21,16.445833,48.191968,0.614640,0.350569,0.996410,0.237400,0.031167,0
2020-04-22,17.789317,48.226667,0.621631,0.351863,1.006543,0.252280,0.036475,0
2020-04-23,19.118241,47.792164,0.777063,0.344211,0.972339,0.236009,0.034542,0
2020-04-24,16.331968,50.440972,0.456025,0.434237,1.142555,0.239600,0.037558,0
...,...,...,...,...,...,...,...,...
2020-10-01,27.478947,41.442118,0.740750,0.653396,15.467581,4.436718,0.332396,250
2020-10-02,37.252824,37.959387,0.598886,0.547148,15.693799,5.086629,0.323046,250
2020-10-03,48.152536,36.143953,0.848292,0.958960,15.420020,4.109935,0.344404,250
2020-10-04,204.791620,37.227685,1.246249,0.535488,8.010298,2.653905,0.386371,250


In [7]:
# #Start training here
# kf = KFold(n_splits=5,shuffle=True,random_state=5)
# perfmse = []
# rtmse = []
# finalpreds = pd.DataFrame()
# rtpreds = pd.DataFrame()
# n_fold = 1
# traincols = xdata.columns
# xdata = xdata.reset_index()
# trainfeatures = ydata.columns
# ydata = ydata.reset_index()
# highmse = np.Inf
# for train_index,test_index in kf.split(xdata):
#     print(f"##### TRAINING ON BATCH {n_fold} #####")
#     x_train,x_val = xdata.loc[train_index,traincols].values,xdata.loc[test_index,traincols].values
#     val_index = xdata.loc[test_index,'date']
#     y_train,y_val = ydata.loc[train_index,trainfeatures].values,ydata.loc[test_index,trainfeatures].values
#     y_train = ss_y.fit_transform(y_train)
#     y_val = ss_y.transform(y_val)
#     simplenn = perfmodel(x_train,y_train)
#     simplenn.compile(loss=losses, optimizer='adam', loss_weights = [0.3,0.3,0.3])
#     simplenn.fit(x_train, [y_train,y_train,y_train], epochs=10, batch_size=128, verbose=0)
#     preds = simplenn.predict(x_val)
#     medianPreds = ss_y.inverse_transform(preds)[1]
#     medianPreds = pd.DataFrame(medianPreds,index=val_index,columns=targets)
#     comps = getcomponents(xdata,test_index)
#     comps.set_index(medianPreds.index,inplace=True)
#     medianPreds = pd.concat([medianPreds,comps],axis=1)
#     finalpreds = finalpreds.append(medianPreds)
#     mse = mean_squared_error(preds[1],y_val)
#     if mse < highmse:
#         highmse = mse
#         save_model(simplenn,model_savepath)
#     perfmse.append(mse)
#     n_fold += 1





##### TRAINING ON BATCH 1 #####
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to identify source code of lambda function <function <lambda> at 0x00000218C2B3C790>. It was defined in this code:
losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]

This code must contain a single distinguishable lambda. To avoid this problem, define each lambda in a separate expression.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to identify source code of lambda function <function <lambda> at 0x00000218C2B3C790>. It was defined in this code:
losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]

This code must contain a single distinguishable lambda. To avoid this 

In [21]:
finalpreds

Unnamed: 0_level_0,overallreadio,overallwriteio,readdatarate,writedatarate,compid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-23,20.219511,48.854626,0.424570,0.431760,0
2020-04-25,20.570456,41.970860,0.415212,0.366000,0
2020-04-29,19.420042,50.504704,0.415059,0.431214,0
2020-05-09,19.750097,49.097820,0.433974,0.415799,0
2020-05-11,15.063711,46.057690,0.328517,0.345030,0
...,...,...,...,...,...
2020-09-01,20.693382,36.193619,0.394681,0.542735,250
2020-09-11,28.310204,38.908031,0.580747,0.571129,250
2020-09-14,20.981407,37.357986,0.388061,0.549399,250
2020-09-26,40.644165,35.643070,0.859030,0.640033,250


In [8]:
finalpreds

Unnamed: 0_level_0,overallreadio,overallwriteio,readdatarate,writedatarate,compid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-23,19.120749,44.748367,0.444997,0.414828,0
2020-04-25,18.410688,39.834759,0.361340,0.299259,0
2020-04-29,16.465528,44.805897,0.418490,0.422173,0
2020-05-09,17.546209,48.045841,0.340637,0.302760,0
2020-05-11,17.919821,42.170296,0.510962,0.384643,0
...,...,...,...,...,...
2020-09-01,21.203377,39.771763,0.367978,0.581391,250
2020-09-11,25.403496,39.442066,0.473133,0.656793,250
2020-09-14,21.362007,37.610466,0.368037,0.550943,250
2020-09-26,32.995461,41.493526,0.722738,0.768235,250


In [39]:
#Read Config Text File
configParser = configparser.RawConfigParser()
configPath = '../config/config.txt'
configParser.read(configPath)

paths = dict(configParser.items('FILEPATHS'))
fieldMappings = dict(configParser.items('MAPPINGS'))
defaults = dict(configParser.items('default'))
flows = dict(configParser.items('FLOW'))

inputPath = paths['inputpath']
outputPath = paths['outputpath']
deviceName = defaults['storagetype']
frequency = defaults['frequency']
forecasthorizon = defaults['forecasthorizon']
forecasthorizon = int(forecasthorizon)

accountname = defaults['accountname']

In [40]:
components = ['volume']
component = components[0]

In [41]:
savepath = f"../data/processedOutputs/{accountname}_{component}_processed.csv"
data = pd.read_csv(savepath,low_memory=False)

In [42]:
uniqcomponents = data[['system','pool','volume']].drop_duplicates()
uniqcomponents['compid'] = np.arange(len(uniqcomponents))


In [43]:
#join compid back to original dataframe and drop [system-pool-volume]
data = data.merge(uniqcomponents,on=['system','pool','volume'])
data = data.drop(['system','pool','volume'],axis=1)

In [44]:
def findzerovols(df):
    df['overallio'] = (df['overallreadio'] + df['overallwriteio'])/2
    tmpdf = df.groupby(['compid'])['overallio'].mean().reset_index()
    tmpdf = tmpdf[tmpdf['overallio'] > 0]
    df = df[df['compid'].isin(tmpdf['compid'])]
    df = df.drop(['overallio'],axis=1)
    return df

In [45]:
#find volumes with zero activity and remove from analysis
data = findzerovols(data)

In [46]:
#change date formats and set data as index
data['date'] = data['date'].apply(lambda x:datetime.datetime.strptime(x,'%Y-%m-%d'))
data.set_index('date', inplace = True)
data

Unnamed: 0_level_0,overallreadio,overallwriteio,readdatarate,writedatarate,readresponsetime,writeresponsetime,volumeutilization,compid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-04-20,9.007279,46.067494,0.318647,0.325217,0.785313,0.261293,0.018312,0
2020-04-21,16.445833,48.191968,0.614640,0.350569,0.996410,0.237400,0.031167,0
2020-04-22,17.789317,48.226667,0.621631,0.351863,1.006543,0.252280,0.036475,0
2020-04-23,19.118241,47.792164,0.777063,0.344211,0.972339,0.236009,0.034542,0
2020-04-24,16.331968,50.440972,0.456025,0.434237,1.142555,0.239600,0.037558,0
...,...,...,...,...,...,...,...,...
2020-10-01,27.478947,41.442118,0.740750,0.653396,15.467581,4.436718,0.332396,250
2020-10-02,37.252824,37.959387,0.598886,0.547148,15.693799,5.086629,0.323046,250
2020-10-03,48.152536,36.143953,0.848292,0.958960,15.420020,4.109935,0.344404,250
2020-10-04,204.791620,37.227685,1.246249,0.535488,8.010298,2.653905,0.386371,250


In [47]:
#add date features as independent variables
def adddatefeatures(df):
    df.loc[:,'dayofweek'] = df.index.dayofweek
    df.loc[:,'month'] = df.index.month
    df.loc[:,'isweekend'] = df['dayofweek'].apply(lambda x: 1 if x > 4 else 0)
    return df


def addtimedummies(data):
    cols = ['month','dayofweek']
    for col in cols:
        if col == 'month':
            max_val = 12
            N = max_val -1 #Since month index starts from 1 and dow and hour from 0
        elif col == 'dayofweek':
            max_val = 6
            N = max_val
        else:
            max_val = 23
            N = max_val
        df = list()
        series = data[col]
        for each in series:
            vals = list(np.zeros(N+1,dtype='int'))
            vals[each-1] = 1
            df.append(vals)
        names = [str(col)+'_' + str(x) for x in range(1,N+2)]
        df = pd.DataFrame(df,columns=names)
        df.set_index(data.index,inplace=True)
        data = pd.concat([data,df],axis=1)
    return data

In [48]:
traindata = data.copy()
traindata = adddatefeatures(traindata)

In [49]:
traindata.columns

Index(['overallreadio', 'overallwriteio', 'readdatarate', 'writedatarate',
       'readresponsetime', 'writeresponsetime', 'volumeutilization', 'compid',
       'dayofweek', 'month', 'isweekend'],
      dtype='object')

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [51]:
features = ['compid','dayofweek','month','isweekend']
targets = ['overallreadio','overallwriteio','readdatarate','writedatarate']


In [52]:
traindata = traindata.dropna()

In [53]:
xdata = traindata[features]
ydata = traindata[targets]

In [54]:
rt_features = features + targets
rt_targets = ['readresponsetime','writeresponsetime']
rtxdata = traindata[rt_features]
rtydata = traindata[rt_targets]

In [55]:
ss_x = MinMaxScaler()
ss_rt_x = MinMaxScaler()

In [56]:
ss = StandardScaler()
ss_rt = StandardScaler()

In [57]:
xdata.loc[:,"compid"] = xdata.loc[:,"compid"].astype(str)
rtxdata.loc[:,"compid"] = rtxdata.loc[:,"compid"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [58]:
compdf = pd.get_dummies(xdata[['compid']],prefix='comp')
xdata = pd.concat([xdata,compdf],axis=1)
rtxdata = pd.concat([rtxdata,compdf],axis=1)
traincomps = xdata['compid'].unique()
xdata = xdata.drop(['compid'],axis=1)
rtxdata = rtxdata.drop(['compid'],axis=1)

In [59]:
xdata = addtimedummies(xdata)
rtxdata = addtimedummies(rtxdata)

In [61]:
rtxdata.loc[:,targets] = ss_rt_x.fit_transform(rtxdata[targets])

In [None]:
rtydata = ss_rt.fit_

In [62]:
rtxdata

Unnamed: 0_level_0,dayofweek,month,isweekend,overallreadio,overallwriteio,readdatarate,writedatarate,comp_0,comp_1,comp_10,...,month_10,month_11,month_12,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,dayofweek_7
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-20,0,4,0,0.001589,0.019016,0.000939,0.004278,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2020-04-21,1,4,0,0.002901,0.019893,0.001810,0.004611,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2020-04-22,2,4,0,0.003138,0.019907,0.001831,0.004628,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2020-04-23,3,4,0,0.003373,0.019728,0.002289,0.004527,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2020-04-24,4,4,0,0.002881,0.020821,0.001343,0.005711,1,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-01,3,10,0,0.004848,0.017107,0.002182,0.008594,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2020-10-02,4,10,0,0.006572,0.015669,0.001764,0.007197,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2020-10-03,5,10,1,0.008495,0.014920,0.002499,0.012613,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2020-10-04,6,10,1,0.036129,0.015367,0.003671,0.007043,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [20]:
xdata_ = xdata.values
ydata_ = ss.fit_transform(ydata)

In [21]:
from sklearn.model_selection import KFold

In [22]:
# use entity embeddings
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

In [23]:
def q_loss(q,y,f):
    e = (y-f)
    return K.mean(K.maximum(q*e, (q-1)*e), axis=-1)

In [24]:
losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]

In [25]:
#build nn using x_train,y_train

def perfmodel(X,y):
    inputs = layers.Input(shape=(X.shape[1],))
    x = layers.Dense(256, activation='relu')(inputs)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(64,activation='relu')(x)
    x = layers.Dropout(0.1)(x)
    y = layers.Dense(y.shape[1],activation='linear')(x)
    out10 = layers.Dense(y.shape[1])(x)
    out50 = layers.Dense(y.shape[1])(x)
    out90 = layers.Dense(y.shape[1])(x)
    model = Model(inputs=inputs,outputs=[out10,out50,out90])
    return model

    

In [26]:
kf = KFold(n_splits=5,shuffle=True,random_state=5)
overallmse = []
i = 1
for train_index,test_index in kf.split(xdata_):
    print(f"##### TRAINING ON BATCH {i} #####")
    x_train,x_val = xdata_[train_index],xdata_[test_index]
    y_train,y_val = ydata_[train_index],ydata_[test_index]
    simplenn = perfmodel(x_train,y_train)
    simplenn.compile(loss=losses, optimizer='adam', loss_weights = [0.3,0.3,0.3])
    simplenn.fit(x_train, [y_train,y_train,y_train], epochs=50, batch_size=128, verbose=0)
    preds = simplenn.predict(x_val)
    mse = mean_squared_error(preds[1],y_val)
    overallmse.append(mse)
    i += 1
    
    

##### TRAINING ON BATCH 1 #####
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to identify source code of lambda function <function <lambda> at 0x0000020BEAE3C0D0>. It was defined in this code:
losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]

This code must contain a single distinguishable lambda. To avoid this problem, define each lambda in a separate expression.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to identify source code of lambda function <function <lambda> at 0x0000020BEAE3C0D0>. It was defined in this code:
losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]

This code must contain a single distinguishable lambda. To avoid this 

In [29]:
np.mean(overallmse)

0.12613621057441604

In [33]:
finalpreds = simplenn.predict(xdata_)

[array([[-0.40809056, -0.16273104, -0.16880189, -0.3150987 ],
        [-0.39500207, -0.15836746, -0.16357684, -0.3057781 ],
        [-0.39801854, -0.13004641, -0.16276821, -0.2876076 ],
        ...,
        [-0.32276827, -0.21363871, -0.15588981, -0.23624237],
        [ 0.28760922, -0.20199803, -0.12271634, -0.23520541],
        [-0.38474402, -0.21534482, -0.16375345, -0.27052936]],
       dtype=float32),
 array([[-0.3802129 , -0.10564427, -0.15800023, -0.26443815],
        [-0.36741406, -0.10655436, -0.14863388, -0.26514137],
        [-0.36447987, -0.06100872, -0.14462462, -0.22661653],
        ...,
        [-0.25328812, -0.16045086, -0.14022969, -0.16093686],
        [ 0.5706417 , -0.14310724, -0.09142733, -0.16163613],
        [-0.35475397, -0.1678263 , -0.15187776, -0.22165129]],
       dtype=float32),
 array([[-0.3303803 , -0.01527205, -0.13508964, -0.16053253],
        [-0.32022247, -0.02817681, -0.12402595, -0.18620615],
        [-0.30717373,  0.04647027, -0.11434661, -0.1025615

In [37]:
xdata_.shape

(40480, 273)

In [37]:
actpreds = ss.inverse_transform(preds)
actpreds[actpreds < 0]= 0

array([[[1.11565390e+01, 4.02482529e+01, 2.05246821e-01, 2.56720096e-01,
         3.02515533e-02],
        [1.14898367e+01, 4.14370575e+01, 2.21208706e-01, 2.78252631e-01,
         3.01402863e-02],
        [1.47702703e+01, 4.98388443e+01, 3.09550643e-01, 3.83736283e-01,
         3.48964520e-02],
        ...,
        [1.11499701e+01, 2.65570602e+01, 1.53981581e-01, 4.13292915e-01,
         1.96002200e-01],
        [3.49702988e+01, 3.11861477e+01, 3.35030437e-01, 5.60993135e-01,
         2.87299931e-01],
        [1.33780584e+01, 3.17297668e+01, 1.96703330e-01, 4.89643902e-01,
         2.38656044e-01]],

       [[1.55232000e+01, 4.86191483e+01, 3.84534955e-01, 3.65215689e-01,
         5.08963056e-02],
        [1.62584133e+01, 5.04417419e+01, 4.17073131e-01, 3.93345445e-01,
         5.06818481e-02],
        [2.06798611e+01, 6.92480774e+01, 5.35351872e-01, 5.88512301e-01,
         5.79473861e-02],
        ...,
        [1.70057049e+01, 3.43634415e+01, 3.66626382e-01, 5.94877958e-01,
        

In [46]:
lbcols = ['lb_' + str(x) for x in targets]
ubcols = ['ub_' + str(x) for x in targets]

In [56]:
lbdf = pd.DataFrame(actpreds[0],columns=lbcols)
meddf = pd.DataFrame(actpreds[1],columns=targets)
ubdf = pd.DataFrame(actpreds[2],columns=ubcols)

finaldf = pd.concat([lbdf,meddf,ubdf],axis=1)
finaldf

Unnamed: 0,lb_overallreadio,lb_overallwriteio,lb_readdatarate,lb_writedatarate,lb_volumeutilization,overallreadio,overallwriteio,readdatarate,writedatarate,volumeutilization,ub_overallreadio,ub_overallwriteio,ub_readdatarate,ub_writedatarate,ub_volumeutilization
0,11.156539,40.248253,0.205247,0.256720,0.030252,15.523200,48.619148,0.384535,0.365216,0.050896,23.612585,64.772972,0.870549,0.558419,0.089701
1,11.489837,41.437057,0.221209,0.278253,0.030140,16.258413,50.441742,0.417073,0.393345,0.050682,24.614225,67.555786,0.918981,0.595479,0.085147
2,14.770270,49.838844,0.309551,0.383736,0.034896,20.679861,69.248077,0.535352,0.588512,0.057947,31.097868,102.422775,1.137897,0.904525,0.093441
3,267.541260,44.886269,1.505667,0.361453,0.093933,377.256073,55.364307,2.253207,0.502871,0.126536,482.576416,72.056732,3.331450,0.741885,0.164767
4,11.231475,44.025066,0.196866,0.312494,0.025722,15.829047,53.258228,0.350771,0.431009,0.042838,23.641546,71.175514,0.756417,0.637072,0.071944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8091,12.071982,27.337172,0.162532,0.416943,0.210183,18.486273,35.254696,0.414523,0.605142,0.259884,31.293203,48.856091,1.100148,0.891553,0.320675
8092,15.881537,30.795509,0.280783,0.499320,0.243917,23.578863,39.024944,0.679813,0.720183,0.304638,38.833843,53.806515,1.694050,1.061751,0.369077
8093,11.149970,26.557060,0.153982,0.413293,0.196002,17.005705,34.363441,0.366626,0.594878,0.239675,28.806738,47.315517,0.954887,0.858889,0.298287
8094,34.970299,31.186148,0.335030,0.560993,0.287300,55.401752,38.990601,0.696739,0.778006,0.341281,85.045807,52.812675,1.504051,1.096797,0.397841


In [31]:
data

Unnamed: 0_level_0,overallreadio,overallwriteio,readdatarate,writedatarate,readresponsetime,writeresponsetime,volumeutilization,compid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-04-20,9.007279,46.067494,0.318647,0.325217,0.785313,0.261293,0.018312,0
2020-04-21,16.445833,48.191968,0.614640,0.350569,0.996410,0.237400,0.031167,0
2020-04-22,17.789317,48.226667,0.621631,0.351863,1.006543,0.252280,0.036475,0
2020-04-23,19.118241,47.792164,0.777063,0.344211,0.972339,0.236009,0.034542,0
2020-04-24,16.331968,50.440972,0.456025,0.434237,1.142555,0.239600,0.037558,0
...,...,...,...,...,...,...,...,...
2020-10-01,27.478947,41.442118,0.740750,0.653396,15.467581,4.436718,0.332396,250
2020-10-02,37.252824,37.959387,0.598886,0.547148,15.693799,5.086629,0.323046,250
2020-10-03,48.152536,36.143953,0.848292,0.958960,15.420020,4.109935,0.344404,250
2020-10-04,204.791620,37.227685,1.246249,0.535488,8.010298,2.653905,0.386371,250


In [545]:

overallmse = mean_squared_error(y_val, preds[1])
overallmse

0.10972960016767994

In [498]:
#prepare data for next 1 week
#get last date from data
lastdate = traindata.index.max()
#prepare data for next 1 week
from pandas.tseries.offsets import DateOffset

In [499]:
ts = lastdate
futdates = []
for i in range(1,8):
    ts = ts + DateOffset(days=1)
    futdates.append(ts)

In [500]:
comps = pd.DataFrame(traincomps,columns=['compid'])
futdf = pd.DataFrame(futdates,columns=['date'])

In [501]:
comps['key'] = 1
futdf['key'] = 1
result = pd.merge(comps, futdf, on ='key').drop("key", 1)

In [502]:
result.set_index(['date'],inplace=True)

In [503]:
result.head()

Unnamed: 0_level_0,compid
date,Unnamed: 1_level_1
2020-10-06,0
2020-10-07,0
2020-10-08,0
2020-10-09,0
2020-10-10,0


In [504]:
resultdf = adddatefeatures(result)


In [505]:
compdf = pd.get_dummies(result[['compid']],prefix='comp')
resultdf = pd.concat([resultdf,compdf],axis=1)
resultdf = resultdf.drop(['compid'],axis=1)

In [506]:
resultdf = addtimedummies(resultdf)

In [507]:
resultdf.head()

Unnamed: 0_level_0,dayofweek,month,isweekend,comp_0,comp_1,comp_10,comp_100,comp_101,comp_102,comp_103,...,month_10,month_11,month_12,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,dayofweek_7
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-06,1,10,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2020-10-07,2,10,0,1,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2020-10-08,3,10,0,1,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2020-10-09,4,10,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2020-10-10,5,10,1,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [508]:
futresults = simplenn.predict(resultdf)

In [509]:
futresults[futresults < 0] = 0

In [510]:
futresults = ss.inverse_transform(futresults)
futresults = pd.DataFrame(futresults,columns=ydata.columns,index=result.index)

In [511]:
forecastdf = pd.concat([result,futresults],axis=1)
forecastdf = forecastdf.drop(['dayofweek','month','isweekend'],axis=1)
forecastdf['flag'] = 'predicted'


In [512]:
actualdata = data[forecastdf.columns[:-1]]
actualdata.loc[:,'flag'] = 'actual'
actualdata

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0_level_0,compid,overallreadio,overallwriteio,readdatarate,writedatarate,volumeutilization,flag
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-20,0,9.007279,46.067494,0.318647,0.325217,0.018312,actual
2020-04-21,0,16.445833,48.191968,0.614640,0.350569,0.031167,actual
2020-04-22,0,17.789317,48.226667,0.621631,0.351863,0.036475,actual
2020-04-23,0,19.118241,47.792164,0.777063,0.344211,0.034542,actual
2020-04-24,0,16.331968,50.440972,0.456025,0.434237,0.037558,actual
...,...,...,...,...,...,...,...
2020-10-01,250,27.478947,41.442118,0.740750,0.653396,0.332396,actual
2020-10-02,250,37.252824,37.959387,0.598886,0.547148,0.323046,actual
2020-10-03,250,48.152536,36.143953,0.848292,0.958960,0.344404,actual
2020-10-04,250,204.791620,37.227685,1.246249,0.535488,0.386371,actual


In [513]:
finaldf = pd.concat([actualdata,forecastdf],axis=0)
finaldf.loc[:,'compid'] = finaldf.loc[:,'compid'].astype('int')
finaldf = finaldf.sort_values(by=['compid','date'],ascending=[True,True])
finaldf

Unnamed: 0_level_0,compid,overallreadio,overallwriteio,readdatarate,writedatarate,volumeutilization,flag
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-20,0,9.007279,46.067494,0.318647,0.325217,0.018312,actual
2020-04-21,0,16.445833,48.191968,0.614640,0.350569,0.031167,actual
2020-04-22,0,17.789317,48.226667,0.621631,0.351863,0.036475,actual
2020-04-23,0,19.118241,47.792164,0.777063,0.344211,0.034542,actual
2020-04-24,0,16.331968,50.440972,0.456025,0.434237,0.037558,actual
...,...,...,...,...,...,...,...
2020-10-08,250,99.778053,58.757431,3.327581,1.331361,0.321525,predicted
2020-10-09,250,99.778053,58.757431,3.327581,1.331361,0.320054,predicted
2020-10-10,250,99.778053,58.757431,3.327581,1.331361,0.328572,predicted
2020-10-11,250,218.557663,58.757431,3.327581,1.331361,0.427626,predicted


In [514]:
#get back components name
finaldf = finaldf.reset_index()
finaldf = pd.merge(finaldf,uniqcomponents,on='compid')
finaldf = finaldf.drop(['compid'],axis=1)
finaldf

Unnamed: 0,date,overallreadio,overallwriteio,readdatarate,writedatarate,volumeutilization,flag,system,pool,volume
0,2020-04-20,9.007279,46.067494,0.318647,0.325217,0.018312,actual,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15
1,2020-04-21,16.445833,48.191968,0.614640,0.350569,0.031167,actual,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15
2,2020-04-22,17.789317,48.226667,0.621631,0.351863,0.036475,actual,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15
3,2020-04-23,19.118241,47.792164,0.777063,0.344211,0.034542,actual,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15
4,2020-04-24,16.331968,50.440972,0.456025,0.434237,0.037558,actual,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15
...,...,...,...,...,...,...,...,...,...,...
42979,2020-10-08,99.778053,58.757431,3.327581,1.331361,0.321525,predicted,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d8
42980,2020-10-09,99.778053,58.757431,3.327581,1.331361,0.320054,predicted,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d8
42981,2020-10-10,99.778053,58.757431,3.327581,1.331361,0.328572,predicted,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d8
42982,2020-10-11,218.557663,58.757431,3.327581,1.331361,0.427626,predicted,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d8


In [515]:
finaldf = finaldf[['date','system','pool','volume','overallreadio','overallwriteio','readdatarate','writedatarate','volumeutilization','flag']]
finaldf

Unnamed: 0,date,system,pool,volume,overallreadio,overallwriteio,readdatarate,writedatarate,volumeutilization,flag
0,2020-04-20,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15,9.007279,46.067494,0.318647,0.325217,0.018312,actual
1,2020-04-21,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15,16.445833,48.191968,0.614640,0.350569,0.031167,actual
2,2020-04-22,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15,17.789317,48.226667,0.621631,0.351863,0.036475,actual
3,2020-04-23,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15,19.118241,47.792164,0.777063,0.344211,0.034542,actual
4,2020-04-24,SVC-2145-SXED0PSVC0001-IBM,SXED0V370001-ACTVQRM,ED0_MGMT01_TPC-d15,16.331968,50.440972,0.456025,0.434237,0.037558,actual
...,...,...,...,...,...,...,...,...,...,...
42979,2020-10-08,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d8,99.778053,58.757431,3.327581,1.331361,0.321525,predicted
42980,2020-10-09,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d8,99.778053,58.757431,3.327581,1.331361,0.320054,predicted
42981,2020-10-10,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d8,99.778053,58.757431,3.327581,1.331361,0.328572,predicted
42982,2020-10-11,SVC-2145-SXED0PSVC0001-IBM,SXED2PXIV0002-T3,ED0_MGMT01_T3-d8,218.557663,58.757431,3.327581,1.331361,0.427626,predicted


In [516]:
traindata.head()

Unnamed: 0_level_0,overallreadio,overallwriteio,readdatarate,writedatarate,readresponsetime,writeresponsetime,volumeutilization,compid,dayofweek,month,isweekend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-04-20,9.007279,46.067494,0.318647,0.325217,0.785313,0.261293,0.018312,0,0,4,0
2020-04-21,16.445833,48.191968,0.61464,0.350569,0.99641,0.2374,0.031167,0,1,4,0
2020-04-22,17.789317,48.226667,0.621631,0.351863,1.006543,0.25228,0.036475,0,2,4,0
2020-04-23,19.118241,47.792164,0.777063,0.344211,0.972339,0.236009,0.034542,0,3,4,0
2020-04-24,16.331968,50.440972,0.456025,0.434237,1.142555,0.2396,0.037558,0,4,4,0


In [517]:
#section to model for response time
features = ['compid','dayofweek','month','isweekend','overallreadio','overallwriteio','readdatarate','writedatarate']
targets = ['readresponsetime','writeresponsetime']
rtxdata = traindata[features]
rtydata = traindata[targets]

In [518]:
rtxdata.shape

(40480, 8)

In [519]:
rtydata.shape

(40480, 2)

In [520]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()

rtxdata_ = mms.fit_transform(rtxdata)
ss1 = StandardScaler()
rtydata_ = ss1.fit_transform(rtydata)

In [521]:
rtxtrain,rtxtest,rtytrain,rtytest = train_test_split(rtxdata_,rtydata,test_size=0.2,shuffle=True)

In [522]:
rtmodel = perfmodel(rtxtrain,rtytrain)

In [523]:
rtmodel.compile(loss='mse',optimizer='adam')

In [524]:
h2 = rtmodel.fit(rtxtrain,rtytrain,
                epochs=10,
                batch_size = 128,
                shuffle = True,
                validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
rtmodel