In [1]:
import pandas as pd
import datetime
import numpy as np

In [2]:
# Folder and file name of the second hourly volume-metrics extract.
path = "../../../APMM storage/"
name = "Storage_Volume_Metrics_Hourly_2.csv"
filename = f"{path}{name}"

In [3]:
# First extract; read from the same folder as the second one.
name_1 = "Storage_Volume_Metrics_Hourly_1.csv"
filename_1 = f"{path}{name_1}"

In [4]:
# The file is read as tab-separated, UTF-16 encoded text.
data = pd.read_csv(
    filename,
    sep="\t",
    encoding='utf-16',
)

In [5]:
# Same format as the other extract: tab-separated, UTF-16 encoded.
data1 = pd.read_csv(
    filename_1,
    sep="\t",
    encoding='utf-16',
)

In [6]:
# Stack the two extracts row-wise (row labels from each file are kept as-is).
data = pd.concat([data, data1])

In [7]:
# Remove rows that are exact duplicates across all columns.
data = data.drop_duplicates()

In [8]:
# Parse the 'Hour' strings in one vectorized call rather than a per-row
# datetime.strptime; the explicit format keeps parsing strict.
data['Hour'] = pd.to_datetime(data['Hour'], format="%Y-%m-%d %H:%M:%S")

In [9]:
# Rows available per volume name; keep names with at least 720 rows
# (presumably 30 days of hourly samples - confirm).
# NOTE(review): counts are keyed on volume name only, while compid is later
# built from (system, volume) pairs - confirm names are unique across systems.
compcounts = (
    data.groupby('Storage Volume Name')['Hour']
        .count()
        .reset_index()
)
compcounts = compcounts.loc[compcounts['Hour'] >= 720]

In [10]:
# Keep only rows whose volume name is listed in compcounts.
data = data.loc[
    data['Storage Volume Name'].isin(compcounts['Storage Volume Name'])
]

In [11]:
def pre_process(data, threshold = 50):
    """Drop columns that are mostly missing or that never vary.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 50
        A column is removed when its percentage of null values is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        A new frame without the sparse columns and without numeric columns
        whose variance is exactly 0.
    """
    # Percentage of nulls per column (NaN for a frame with no rows, in which
    # case nothing is dropped).
    missing_pct = data.isnull().mean() * 100
    sparse_cols = missing_pct.index[missing_pct > threshold]
    data = data.drop(columns=sparse_cols)

    # numeric_only=True: pandas >= 2.0 raises TypeError when var() meets a
    # string column instead of silently skipping it. Non-numeric columns are
    # therefore never candidates for the zero-variance drop.
    variances = data.var(numeric_only=True)
    constant_cols = variances.index[variances == 0]
    return data.drop(columns=constant_cols)


In [12]:
# Apply the column clean-up with the default null threshold (50%).
data = pre_process(data)

In [13]:
# Exclude every column whose name contains 'Total', 'Maximum' or 'Peak',
# then also exclude the overall transfer-size metric.
# list.remove raises ValueError if that column is not present.
filter_cols = [
    col for col in data.columns
    if not any(word in col for word in ('Total', 'Maximum', 'Peak'))
]
filter_cols.remove('Overall Transfer Size (KiB/op)')

In [14]:
# Restrict the frame to the retained columns.
data = data[filter_cols]

In [15]:
# One row per distinct (system, volume) pair, numbered 0..n-1 as 'compid'.
uniqcomponents = (
    data[['Storage System Name','Storage Volume Name']]
        .drop_duplicates()
        .assign(compid=lambda pairs: np.arange(len(pairs)))
)

In [16]:
# Attach compid to every row, then drop the two name columns it replaces.
data = (
    data.merge(uniqcomponents, on=['Storage System Name','Storage Volume Name'])
        .drop(columns=['Storage System Name','Storage Volume Name'])
)

In [17]:
# Capacity view: identifier, timestamp and utilization only.
capacity_cols = ['compid', 'Hour', 'Volume Utilization']
# Performance view: every column except the utilization one.
perf_cols = [col for col in data.columns if col != 'Volume Utilization']

In [18]:
perf_cols

['Hour',
 'Overall Read I/O Rate (ops/s)',
 'Overall Write I/O Rate (ops/s)',
 'Read Data Rate (MiB/s)',
 'Write Data Rate (MiB/s)',
 'Read Response Time (ms/op)',
 'Write Response Time (ms/op)',
 'Overall Response Time (ms/op)',
 'Read Transfer Size (KiB/op)',
 'Write Transfer Size (KiB/op)',
 'Write Cache Delay I/O Rate (ops/s)',
 'Overall Read Cache Hit Percentage',
 'Overall Write Cache Hit Percentage',
 'Disk to Cache Transfer Rate (ops/s)',
 'Cache to Disk Transfer Rate (ops/s)',
 'Write Cache Delay Percentage',
 'Read Ahead Percentage of Cache Hits',
 'Overall Host Attributed Response Time Percentage',
 'Nonpreferred Node Usage Percentage',
 'compid']

In [19]:
# Split the frame into its capacity and performance column sets.
capacity_df = data.loc[:, capacity_cols]
perf_df = data.loc[:, perf_cols]

In [20]:
# Drop the reference to the wide frame; capacity_df and perf_df were taken from it.
del data

In [21]:
# Order rows by volume and time, then renumber the index 0..n-1.
capacity_df = (
    capacity_df
        .sort_values(by=['compid','Hour'])
        .reset_index(drop=True)
)


In [22]:
# Use the short column names the rest of the capacity cells work with.
capacity_df = capacity_df.rename(
    columns={
        'Hour': 'date',
        'Volume Utilization': 'volumeutilization',
    }
)

In [23]:
def findzerocapacity(df):
    """Separate volumes with positive mean utilization from the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'compid' and 'volumeutilization'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df`` restricted to compids whose mean utilization is > 0, and a
        frame with columns ['compid', 'volumeutilization'] listing the
        compids whose mean utilization is <= 0 together with that mean.
        Compids whose mean is NaN appear in neither result.
    """
    mean_util = df.groupby(['compid'])['volumeutilization'].mean().reset_index()
    # Both masks are taken from the unfiltered means; previously the
    # zero-volume frame was derived from the already filtered (> 0) frame and
    # was therefore always empty.
    has_capacity = mean_util['volumeutilization'] > 0
    zerodf = mean_util[mean_util['volumeutilization'] <= 0]
    df = df[df['compid'].isin(mean_util.loc[has_capacity, 'compid'])]
    return df,zerodf

In [24]:
# Keep only volumes whose mean utilization is positive; the function's second
# return value is stored as zerovols.
capacity_df,zerovols = findzerocapacity(capacity_df)

In [25]:
capacity_df.shape

(1594191, 3)

In [26]:
# Split point at 80% of the row count; rows past it are used for testing.
cutoffindex = int(0.8 * len(capacity_df))
cutoffindex

1275352

In [27]:
#capacity model starts here
#hourly data, take in 12 hours past and predict next 6 hours
sequence_in = 12   # rows in each input window
sequence_out = 6   # rows in each label (forecast) window

def gen_sequence(id_df,seq_in,seq_out,seq_cols):
    """Yield sliding input windows taken from ``id_df[seq_cols]``.

    ``seq_cols`` is a list of column names. Windows start at rows
    0, 1, ..., ``len(id_df) - seq_in - seq_out - 1`` and each one is a 2-D
    array with ``seq_in`` rows and one column per entry of ``seq_cols``.
    Nothing is yielded when the frame has ``seq_in + seq_out`` rows or fewer.
    """
    values = id_df[seq_cols].values
    n_windows = values.shape[0] - seq_in - seq_out
    for first_row in range(n_windows):
        yield values[first_row:first_row + seq_in, :]
        
def gen_labels(id_df,seq_in,seq_out,label):
    """Yield sliding label windows taken from ``id_df[label]``.

    ``label`` is a list of column names. Windows start at rows
    ``seq_in``, ..., ``len(id_df) - seq_out - 1`` and each one is a 2-D array
    with ``seq_out`` rows, so the final row of the frame never appears in a
    label. Nothing is yielded when the frame has ``seq_in + seq_out`` rows or
    fewer.
    """
    values = id_df[label].values
    end_of_starts = values.shape[0] - seq_out
    for first_row in range(seq_in, end_of_starts):
        yield values[first_row:first_row + seq_out, :]

In [28]:
# Input windows per volume. Rows whose index label is <= cutoffindex feed the
# training set, the remaining rows feed the test set.
# A single groupby pass replaces the per-volume boolean masks over the whole
# frame; sort=False keeps the volumes in order of first appearance, which is
# the order capacity_df['compid'].unique() would give.
# NOTE(review): capacity_df still carries the index labels assigned before
# findzerocapacity filtered it, while cutoffindex comes from the filtered
# length - confirm the resulting split is the intended one.
X_train,X_test = [],[]
for _, comp_df in capacity_df.groupby('compid', sort=False):
    train_rows = comp_df[comp_df.index <= cutoffindex]
    test_rows = comp_df[comp_df.index > cutoffindex]
    X_train.extend(gen_sequence(train_rows,sequence_in,sequence_out,['volumeutilization']))
    X_test.extend(gen_sequence(test_rows,sequence_in,sequence_out,['volumeutilization']))

In [29]:
# Label windows per volume, split on the index label exactly like the input
# windows (<= cutoffindex -> train, > cutoffindex -> test).
# A single groupby pass replaces the per-volume boolean masks over the whole
# frame; sort=False keeps the volumes in order of first appearance, which is
# the order capacity_df['compid'].unique() would give.
y_train,y_test = [],[]

for _, comp_df in capacity_df.groupby('compid', sort=False):
    train_rows = comp_df[comp_df.index <= cutoffindex]
    test_rows = comp_df[comp_df.index > cutoffindex]
    y_train.extend(gen_labels(train_rows,sequence_in,sequence_out,['volumeutilization']))
    y_test.extend(gen_labels(test_rows,sequence_in,sequence_out,['volumeutilization']))

In [30]:
# Convert the four lists of windows into numpy arrays.
X_train, X_test, y_train, y_test = (
    np.asarray(windows) for windows in (X_train, X_test, y_train, y_test)
)

In [31]:
# use entity embeddings
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [32]:
def build_model(train_x, train_y,h1=100,h2=50):
    """Build and compile an LSTM -> Dense -> Dense regression model.

    Layer sizes are read from the arrays: ``train_x.shape[1]`` time steps,
    ``train_x.shape[2]`` features per step, and ``train_y.shape[1]`` output
    units. ``h1`` is the number of LSTM units, ``h2`` the width of the hidden
    dense layer. The model is compiled with MSE loss and the 'adam' optimizer
    and returned untrained.
    """
    input_shape = (train_x.shape[1], train_x.shape[2])
    n_outputs = train_y.shape[1]
    model = Sequential([
        layers.LSTM(h1, activation='relu', input_shape=input_shape),
        layers.Dense(h2, activation='relu'),
        layers.Dense(n_outputs),
    ])
    model.compile(loss='mse', optimizer='adam')
    return model

In [33]:
# Baseline LSTM (no dropout), sized from the training arrays.
simpleseq = build_model(X_train,y_train)

In [34]:
# Train the baseline model; validation_split asks Keras to hold out 20% of the
# supplied samples for validation.
history = simpleseq.fit(X_train,y_train,
                   epochs=20,
                   batch_size=256,
                   validation_split = 0.2,
                   verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

KeyboardInterrupt: 

In [None]:
def build_model_drp(train_x, train_y,h1=100,h2=50,dropout=0.5,recurrent_dropout=0.0):
    """Build and compile an LSTM -> Dense -> Dense model with LSTM dropout.

    Layer sizes are read from the arrays: ``train_x.shape[1]`` time steps,
    ``train_x.shape[2]`` features per step, and ``train_y.shape[1]`` output
    units.

    Parameters
    ----------
    h1, h2 : int
        LSTM units and hidden dense width.
    dropout : float, default 0.5
        Passed to the LSTM layer's ``dropout`` argument.
    recurrent_dropout : float, default 0.0
        Passed to the LSTM layer's ``recurrent_dropout`` argument.

    Returns the compiled (MSE loss, 'adam' optimizer), untrained model.
    The defaults reproduce the previously hard-coded configuration.
    """
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    # define model
    model = Sequential()
    model.add(layers.LSTM(h1,
                          dropout=dropout,
                          recurrent_dropout=recurrent_dropout,
                          activation='relu',
                          input_shape=(n_timesteps, n_features)))
    model.add(layers.Dense(h2, activation='relu'))
    model.add(layers.Dense(n_outputs))
    model.compile(loss='mse', optimizer='adam')
    return model

In [None]:
# Variant with LSTM input dropout of 0.5.
m2 = build_model_drp(X_train,y_train)

In [None]:
# h2 = m2.fit(X_train,y_train,
#                    epochs=2,
#                    batch_size=128,
#                    validation_split = 0.2,
#                    verbose=1)

In [None]:
def build_model_drp(train_x, train_y,h1=100,h2=50,dropout=0.5,recurrent_dropout=0.3):
    """Build and compile an LSTM -> Dense -> Dense model with input and
    recurrent dropout on the LSTM layer.

    This definition reuses the name ``build_model_drp`` and therefore replaces
    the earlier one once this cell has run.

    Layer sizes are read from the arrays: ``train_x.shape[1]`` time steps,
    ``train_x.shape[2]`` features per step, and ``train_y.shape[1]`` output
    units.

    Parameters
    ----------
    h1, h2 : int
        LSTM units and hidden dense width.
    dropout : float, default 0.5
        Passed to the LSTM layer's ``dropout`` argument.
    recurrent_dropout : float, default 0.3
        Passed to the LSTM layer's ``recurrent_dropout`` argument.

    Returns the compiled (MSE loss, 'adam' optimizer), untrained model.
    The defaults reproduce the previously hard-coded configuration.
    """
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    # define model
    model = Sequential()
    model.add(layers.LSTM(h1,
                          dropout=dropout,
                          recurrent_dropout=recurrent_dropout,
                          activation='relu',
                          input_shape=(n_timesteps, n_features)))
    model.add(layers.Dense(h2, activation='relu'))
    model.add(layers.Dense(n_outputs))
    model.compile(loss='mse', optimizer='adam')
    return model

In [None]:
# Built with the build_model_drp definition from the preceding cell
# (dropout 0.5, recurrent dropout 0.3).
m3 = build_model_drp(X_train,y_train)

In [None]:
# h3 = m3.fit(X_train,y_train,
#                    epochs=2,
#                    batch_size=128,
#                    validation_split = 0.2,
#                    verbose=1)

In [None]:
# Predictions of the baseline model on the held-out input windows.
testpreds = simpleseq.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error
# Flatten each label window so the labels are 2-D like the model output.
n_samples, n_steps, n_feats = y_test.shape[0], y_test.shape[1], y_test.shape[2]
ytesteval = y_test.reshape(n_samples, n_steps * n_feats)
# sklearn's signature is (y_true, y_pred); the value is the same either way
# for MSE, but the call now matches the documented order.
testmse = mean_squared_error(ytesteval, testpreds)
testmse

In [None]:
# Last sequence_in rows of every volume: the most recent window, used as the
# model input for forecasting.
last_ts_df = capacity_df.groupby(['compid']).tail(sequence_in)

In [None]:
# One (sequence_in, 1) input window per volume, in order of first appearance
# of each compid in last_ts_df. reshape raises if a volume does not have
# exactly sequence_in rows.
fut_preds = [
    comp_rows['volumeutilization'].values.reshape(sequence_in, 1)
    for _, comp_rows in last_ts_df.groupby('compid', sort=False)
]

In [None]:
# Stack the per-volume windows into one array of shape (n_volumes, sequence_in, 1).
fut_preds = np.asarray(fut_preds)

In [None]:
# Forecast for every volume from its most recent window (one row per volume).
futpreds = simpleseq.predict(fut_preds)

In [None]:
# Latest timestamp across all volumes.
# NOTE(review): forecasts for every volume are dated from this single value -
# confirm all volumes end at the same timestamp.
maxdate = capacity_df['date'].max()

In [None]:
# Forecast timestamps: maxdate + 1 hour, ..., maxdate + sequence_out hours.
from pandas.tseries.offsets import DateOffset
futdates = [
    maxdate + DateOffset(hours=step)
    for step in range(1, sequence_out + 1)
]

In [None]:
# Build one frame of forecasts per volume and concatenate them once.
# futpreds rows follow the order in which the input windows were collected,
# i.e. the order of last_ts_df['compid'].unique(). The compid has to be looked
# up there: using the loop position is wrong as soon as compids are not a
# contiguous 0..n-1 range (findzerocapacity removes volumes).
forecast_compids = last_ts_df['compid'].unique()
forecast_frames = []
for each in range(len(futpreds)):
    # NOTE(review): clipping to [0, 1] assumes utilization is stored as a
    # fraction rather than a percentage - confirm the units.
    xdf = pd.DataFrame(np.clip(futpreds[each], 0, 1), index=futdates)
    xdf['compid'] = forecast_compids[each]
    forecast_frames.append(xdf)
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every
# iteration; a single concat avoids both problems.
forecast_df = pd.concat(forecast_frames) if forecast_frames else pd.DataFrame()
    

In [None]:
# Name the timestamp index, label the two columns, and put compid first.
forecast_df = forecast_df.rename_axis('date')
forecast_df.columns = ['volumeutilization', 'compid']
forecast_df = forecast_df.loc[:, ['compid', 'volumeutilization']]


In [None]:
# Turn the 'date' index into a regular column, matching capacity_df's layout.
forecast_df = forecast_df.reset_index()

In [None]:
# Tag each row with its origin: observed history vs. model forecast.
capacity_df = capacity_df.assign(flag='actual')
forecast_df = forecast_df.assign(flag='predicted')

In [None]:
# History followed by forecasts, ordered per volume by time.
finaldf = (
    pd.concat([capacity_df, forecast_df])
      .sort_values(by=['compid', 'date'])
)

In [None]:
# Replace compid with the system and volume names it stands for.
finaldf = finaldf.merge(uniqcomponents , on = 'compid').drop(['compid'],axis=1)
# Keep 'flag' in the export: it is the only thing that tells observed rows
# ('actual') from forecast rows ('predicted'), and it was previously dropped
# right before writing. The original four columns keep their order.
finaldf = finaldf[['Storage System Name','Storage Volume Name','date','volumeutilization','flag']]
finaldf.to_csv(str(path)+'apmm_capacity.csv',index=False)