In [1]:
import pandas as pd
import datetime
import numpy as np

In [2]:
path = "../../../APMM storage/"
name = "Storage_Volume_Metrics_Hourly_2.csv"
filename = str(path) + str(name)

In [3]:
name_1 = "Storage_Volume_Metrics_Hourly_1.csv"
filename_1 = str(path) + str(name_1)

In [4]:
data = pd.read_csv(filename,encoding='utf-16',sep="\t")
data1 = pd.read_csv(filename_1,encoding='utf-16',sep="\t")
data = pd.concat([data,data1],axis=0)

del data1

In [5]:
data = data.drop_duplicates()

In [6]:
data['Hour'] = data['Hour'].apply(lambda x: datetime.datetime.strptime(x,"%Y-%m-%d %H:%M:%S"))

In [7]:
compcounts = data.groupby(['Storage Volume Name'])['Hour'].count().reset_index()
compcounts = compcounts[compcounts['Hour'] >= 720]

In [8]:
#retain only volumes from compcounts
data = data[data['Storage Volume Name'].isin(compcounts['Storage Volume Name'])]

In [9]:
def pre_process(data, threshold = 50):
    #data preps
    #check for duplicate entries
    duplicate_rows_df = data[data.duplicated()]
    #print("number of duplicate rows: ", duplicate_rows_df.shape)
    #find missing values
    missing_stats = pd.DataFrame(data.isnull().sum()/data.shape[0] * 100, index = None)
    missing_stats.reset_index(inplace = True)

    #Remove columns with more than 50% nulls
    missing_stats.columns = ['Field','Value']
    missing_stats['flag'] = missing_stats['Value'].apply(lambda x: 1 if x > threshold else 0)

    cols_to_rem = missing_stats['Field'][missing_stats['flag'] == 1]
    #print(len(cols_to_rem),"columns will be removed from analysis with missing values more than 50%")
    #print(cols_to_rem)
    data = data.drop(cols_to_rem, axis = 1)
    
    #remove fields with no variability
    #find columns with no variability
    var_stats = pd.DataFrame(data.var())
    var_stats.reset_index(inplace = True)

    var_stats.columns = ['Field','Value']
    var_stats['flag'] = var_stats['Value'].apply(lambda x: 1 if x == 0 else 0)

    cols_to_rem = var_stats[var_stats['flag'] == 1]['Field']
    data = data.drop(cols_to_rem, axis = 1)
    
    return data


In [10]:
data = pre_process(data)

In [11]:
filter_cols = [x for x in data.columns if 'Total' not in x]
filter_cols = [x for x in filter_cols if 'Maximum' not in x]
filter_cols = [x for x in filter_cols if 'Peak' not in x]
filter_cols.remove('Overall Transfer Size (KiB/op)')

In [12]:
data = data[filter_cols]

In [13]:
#create compid for unique components
uniqcomponents = data[['Storage System Name','Storage Volume Name']].drop_duplicates()
uniqcomponents['compid'] = np.arange(len(uniqcomponents))

In [14]:
data = data.merge(uniqcomponents, on = ['Storage System Name','Storage Volume Name'])
data = data.drop(['Storage System Name','Storage Volume Name'],axis=1)

In [15]:
perf_cols = [x for x in data.columns if x != 'Volume Utilization']

In [16]:
perf_df = data[perf_cols]

In [17]:
perf_df = perf_df.drop('Overall Response Time (ms/op)',axis=1)

In [18]:
targets = ['Read Response Time (ms/op)','Write Response Time (ms/op)']

In [19]:
testindex = perf_df.index.max() - 100

In [26]:
perf_df = perf_df.fillna(0)

In [27]:
testdata = perf_df[perf_df.index >= testindex]
data = perf_df[perf_df.index < testindex]

In [28]:
testdata.to_csv(str(path)+'perf_testdata.csv',index=False)

In [29]:
data.set_index('Hour',inplace=True)

In [30]:
#add date features as independent variables
def adddatefeatures(df):
    df.loc[:,'dayofweek'] = df.index.dayofweek
    df.loc[:,'month'] = df.index.month
    df.loc[:,'hour'] = df.index.hour
    return df


def addtimedummies(data):
    cols = ['month','dayofweek','hour']
    for col in cols:
        if col == 'month':
            max_val = 12
            N = max_val -1 #Since month index starts from 1 and dow and hour from 0
        elif col == 'dayofweek':
            max_val = 6
            N = max_val
        else:
            max_val = 23
            N = max_val
        df = list()
        series = data[col]
        for each in series:
            vals = list(np.zeros(N+1,dtype='int'))
            vals[each-1] = 1
            df.append(vals)
        names = [str(col)+'_' + str(x) for x in range(1,N+2)]
        df = pd.DataFrame(df,columns=names)
        df.set_index(data.index,inplace=True)
        data = pd.concat([data,df],axis=1)
    return data


In [31]:
data = adddatefeatures(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [32]:
data.head()

Unnamed: 0_level_0,Overall Read I/O Rate (ops/s),Overall Write I/O Rate (ops/s),Read Data Rate (MiB/s),Write Data Rate (MiB/s),Read Response Time (ms/op),Write Response Time (ms/op),Read Transfer Size (KiB/op),Write Transfer Size (KiB/op),Write Cache Delay I/O Rate (ops/s),Overall Read Cache Hit Percentage,...,Disk to Cache Transfer Rate (ops/s),Cache to Disk Transfer Rate (ops/s),Write Cache Delay Percentage,Read Ahead Percentage of Cache Hits,Overall Host Attributed Response Time Percentage,Nonpreferred Node Usage Percentage,compid,dayofweek,month,hour
Hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-25 14:00:00,0.0,1.043056,0.0,0.004453,0.0,0.327563,0.0,4.372037,0.0,0.0,...,0.333333,12.869167,0.0,0.0,0.000437,0.0,0,3,6,14
2020-06-25 15:00:00,0.0,1.028889,0.0,0.004428,0.0,0.338553,0.0,4.407127,0.0,0.0,...,0.275556,12.790278,0.0,0.0,0.000425,0.0,0,3,6,15
2020-06-25 16:00:00,0.0,1.026271,0.0,0.004422,0.0,0.341866,0.0,4.412056,0.0,0.0,...,0.348023,12.383333,0.0,0.0,0.000419,0.0,0,3,6,16
2020-06-25 17:00:00,0.0,1.031667,0.0,0.004435,0.0,0.308831,0.0,4.402531,0.0,0.0,...,0.231111,12.689444,0.0,0.0,0.000466,0.0,0,3,6,17
2020-06-25 18:00:00,0.0,1.038611,0.0,0.004473,0.0,0.323616,0.0,4.409735,0.0,0.125,...,0.186667,13.172778,0.0,0.0,0.000447,0.0,0,3,6,18


In [36]:
data = addtimedummies(data)


Unnamed: 0_level_0,Overall Read I/O Rate (ops/s),Overall Write I/O Rate (ops/s),Read Data Rate (MiB/s),Write Data Rate (MiB/s),Read Response Time (ms/op),Write Response Time (ms/op),Read Transfer Size (KiB/op),Write Transfer Size (KiB/op),Write Cache Delay I/O Rate (ops/s),Overall Read Cache Hit Percentage,...,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,hour_24
Hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-25 14:00:00,0.0,1.043056,0.0,0.004453,0.0,0.327563,0.0,4.372037,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-06-25 15:00:00,0.0,1.028889,0.0,0.004428,0.0,0.338553,0.0,4.407127,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2020-06-25 16:00:00,0.0,1.026271,0.0,0.004422,0.0,0.341866,0.0,4.412056,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
2020-06-25 17:00:00,0.0,1.031667,0.0,0.004435,0.0,0.308831,0.0,4.402531,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
2020-06-25 18:00:00,0.0,1.038611,0.0,0.004473,0.0,0.323616,0.0,4.409735,0.0,0.125,...,0,0,0,1,0,0,0,0,0,0


In [38]:
data = data.drop(['hour','dayofweek','month'],axis=1)

In [47]:
data = data.reset_index(drop=True)

In [42]:
#create a function between features and targets
train_features = [x for x in data.columns if x not in targets]
train_features = [x for x in data.columns if x not in 'compid']

In [49]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [40]:
ssx = MinMaxScaler()

In [45]:
from sklearn.model_selection import KFold

In [46]:
kf = KFold(n_splits=5,shuffle=True,random_state=5)

In [50]:
ssx = MinMaxScaler()
ssy = StandardScaler()

In [51]:
def perfmodel(X,y):
    inputs = layers.Input(shape=(X.shape[1],))
    x = layers.Dense(256, activation='relu')(inputs)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(64,activation='relu')(x)
    x = layers.Dropout(0.1)(x)
    y = layers.Dense(y.shape[1],activation='linear')(x)
    out10 = layers.Dense(y.shape[1])(x)
    out50 = layers.Dense(y.shape[1])(x)
    out90 = layers.Dense(y.shape[1])(x)
    model = Model(inputs=inputs,outputs=[out10,out50,out90])
    return model

In [59]:
def q_loss(q,y,f):
    e = (y-f)
    return K.mean(K.maximum(q*e, (q-1)*e), axis=-1)

In [52]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [54]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.models import Model, save_model,load_model
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

In [56]:
losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]

In [60]:
overall_mse = []
overall_mae = []
minmse = np.Inf
folds = 1

for train_index , test_index in kf.split(data):
    print(f"##### TRAINING ON BATCH {folds} #####")
    x_train,x_val = data.loc[train_index,train_features],data.loc[test_index,train_features]
    y_train,y_val = data.loc[train_index,targets],data.loc[test_index,targets]
    
    #scale x and y data
    x_train = ssx.fit_transform(x_train)
    y_train = ssy.fit_transform(y_train)
    
    #fit model
    perf_model = perfmodel(x_train,y_train)
    perf_model.compile(loss=losses,optimizer='adam',loss_weights=[0.3,0.3,0.3])
    perf_model.fit(x_train,[y_train,y_train,y_train],epochs=2,batch_size=256,verbose=1)
    
    #transform val set and make preds
    x_val = ssx.transform(x_val)
    yhat = perf_model.predict(x_val)[1]
    
    #compare preds to y_val
    perf_mse = mean_squared_error(y_val,yhat)
    perf_mae = mean_absolute_error(y_val,yhat)
    overall_mse.append(perf_mse)
    overall_mae.append(perf_mae)
    
    #get least mse and save that model
    if perf_mse <= minmse:
        minmse = perf_mse
        perf_model.save('perf_model.h5')
    
    folds += 1
    
    
    
    

##### TRAINING ON BATCH 1 #####
Epoch 1/2
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to identify source code of lambda function <function <lambda> at 0x12c5368b0>. It was defined in this code:
losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]

This code must contain a single distinguishable lambda. To avoid this problem, define each lambda in a separate expression.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to identify source code of lambda function <function <lambda> at 0x12c5368b0>. It was defined in this code:
losses = [lambda y,f: q_loss(0.1,y,f), lambda y,f: q_loss(0.5,y,f), lambda y,f: q_loss(0.9,y,f)]

This code must contain a single distinguishable lambda. To avoid this prob

NameError: name 'yval' is not defined