In [45]:
import tensorflow as tf
from keras.preprocessing import sequence
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
from scipy.stats import zscore
from matplotlib.pyplot import figure, show
from sklearn import metrics
import io
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import ModelCheckpoint
from keras import regularizers

In [46]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

In [47]:
import csv
import sys
import pandas as pd
import os
import glob

path  = "/Users/petronillagriffith/desktop/test/Server Traces" 
#path2  = "/Users/petronillagriffith/desktop/test/DATA/data"
filename_wr = os.path.join(path,'*.csv')
#filename_rd = os.path.join(path,'trace.csv')
names = ['Operation', 'TimeStamp', 'ProcessName', 'ThreadID', 'IrpPtr' , 'ByteOffset', 'IOSize','ElapsedTime', 'DiskNum', 'IrpFlags', 'DiskSvcTime', 'IOPri' , ' VolSnap', 'FileObject', ' FileName']

#df = pd.read_csv(filename_rd,engine='python',header=None,skiprows = 73,names=names,na_values=['-1'], index_col=False)


all_files = glob.glob(os.path.join(path, "*.csv"))     # advisable to use os.path.join as this makes concatenation OS independent
print(all_files)
df_from_each_file = (pd.read_csv(f,engine='python',header=None,names=names,na_values=['-1'], index_col=False) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)


concatenated_df.columns = names


headers = list(concatenated_df.columns.values)
print(headers)

#skiprows=lambda x: x % 2 != 0

[]


ValueError: No objects to concatenate

In [48]:
print("Before drop: {}".format(concatenated_df.columns))

concatenated_df.drop(concatenated_df.columns[[4, 8,9,10,11,12,13,14]], axis=1, inplace=True)
print("After drop: {}".format(concatenated_df.columns))

Before drop: Index(['Operation', 'TimeStamp', 'ProcessName', 'ThreadID', 'ByteOffset',
       'IOSize', 'ElapsedTime'],
      dtype='object')


IndexError: index 8 is out of bounds for axis 1 with size 7

In [49]:
df = concatenated_df

In [23]:
print("Starting file:")
print(df[0:10])

print("Ending file:")
print(df[-10:])

Starting file:
                 Operation  TimeStamp  ProcessName  ThreadID   ByteOffset  \
0                Operation  TimeStamp  ProcessName  ThreadID       IOSize   
1                Operation  TimeStamp  ProcessName  ThreadID  ElapsedTime   
2                Operation  TimeStamp  ProcessName  ThreadID          NaN   
3                Operation  TimeStamp  ProcessName  ThreadID          NaN   
4                Operation  TimeStamp  ProcessName  ThreadID          NaN   
5                Operation  TimeStamp  ProcessName  ThreadID          NaN   
6                Operation  TimeStamp  ProcessName  ThreadID          NaN   
7                Operation  TimeStamp  ProcessName  ThreadID          NaN   
8                 DiskRead     153302      p1 ( 4)        36          NaN   
9                 DiskRead     153756   p0 (21316)      7552          NaN   

        IOSize  ElapsedTime  
0  ElapsedTime          NaN  
1          NaN          NaN  
2          NaN          NaN  
3          NaN   

In [24]:
df.shape

(16472, 7)

In [25]:
print("Ending file:")
print(concatenated_df[-10:])

Ending file:
                     Operation  TimeStamp  ProcessName ThreadID   ByteOffset  \
16462                 DiskRead  905032689   p0 (20388)    16912   0x00001000   
16463                 DiskRead  905034389   p0 (20388)    16912   0x00001000   
16464                 DiskRead  905040743   p0 (20388)    16912   0x00001000   
16465                 DiskRead  905047900   p0 (20388)    16912   0x00001000   
16466                 DiskRead  905048447   p0 (20388)    16912   0x00001000   
16467                DiskWrite  905050383      p1 ( 4)       32   0x00000800   
16468                 DiskRead  905055080   p0 (20388)    16912   0x00001000   
16469                 DiskRead  905059849   p0 (20388)    16912   0x00001000   
16470                 DiskRead  905064475   p0 (20388)    16912   0x00001000   
16471                 DiskRead  905069702   p0 (20388)    16912   0x00001000   

      IOSize  ElapsedTime  
16462   5044          NaN  
16463   1525          NaN  
16464   5635          

In [26]:
df.to_csv(filename_wr,index=False)
df = pd.read_csv(filename_wr,engine='python',index_col=False)
df.dtypes

Operation       object
TimeStamp       object
ProcessName     object
ThreadID        object
ByteOffset      object
IOSize          object
ElapsedTime    float64
dtype: object

In [27]:
for index, row in df.iterrows():
    a2 = row['Operation'].strip()
    if (a2 == 'DiskRead'):
        df.at[index, 'Operation'] = 1
    if (a2 == 'DiskWrite'):
        df.at[index, 'Operation'] = 2
    if (a2 == 'DiskFlush'):
        df.at[index, 'Operation'] = 3
        
 #   x = row['IOSize'].strip()
   # dec = int(x, 16)
    #print(dec)
   # df.at[index,'IOSize'] = dec
   # s.add(dec)

In [28]:
df['IOSize'] = df['IOSize'].apply(int, base=0)

ValueError: invalid literal for int() with base 0: 'ElapsedTime'

In [30]:
for index, row in df.iterrows():
    a7 = row['ProcessName'].strip()
    x = a7.find("(")
    y =  a7.find(")")
    df.at[index, 'ProcessName'] = int(a7[x+1:y])

ValueError: invalid literal for int() with base 10: 'ProcessNam'

In [31]:
df.Operation = df.Operation.astype(int)

ValueError: invalid literal for int() with base 10: 'Operation'

In [32]:
dfByteOffset = df.ByteOffset.astype(int)

ValueError: invalid literal for int() with base 10: 'IOSize'

In [33]:
df.dtypes

Operation       object
TimeStamp       object
ProcessName     object
ThreadID        object
ByteOffset      object
IOSize          object
ElapsedTime    float64
dtype: object

In [34]:
x = (len(df)*.7)
print(int(x))
pt_to_split = int(x)

11530


In [35]:
from sklearn.model_selection import train_test_split

x, y = to_xy(df,'Operation')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.001, random_state=99)



ValueError: could not convert string to float: ' 0x060004'

In [36]:
path2  = "/Users/petronillagriffith/desktop/test/DATA/data/output"
filename_wr_1 = os.path.join(path2,'output.csv')
df.to_csv(filename_wr_1,index=False)
df = pd.read_csv(filename_wr_1,engine='python',index_col=False)

In [37]:
df['ElapsedTime'] = df['ElapsedTime'].apply(int, base=0)

TypeError: int() can't convert non-string with explicit base

In [38]:
df_train = df[:pt_to_split]
df_test = df[pt_to_split+1:]

spots_train = df_train['ByteOffset'].tolist()
spots_test = df_test['ByteOffset'].tolist()

print("Training set has {} observations.".format(len(spots_train)))
print("Test set has {} observations.".format(len(spots_test)))

Training set has 11530 observations.
Test set has 4941 observations.


In [39]:
import numpy as np

def to_sequences(seq_size, obs):
    x = []
    y = []
    
    for i in range(len(obs)-SEQUENCE_SIZE-1):
        #print(i)
        window = obs[i: (i+SEQUENCE_SIZE)]
        after_window = obs[i+SEQUENCE_SIZE]
        window = [[x] for x in window]
        x.append(window)
        y.append(after_window)
    return np.array(x),np.array(y)

SEQUENCE_SIZE = 10
x_train,y_train = to_sequences(SEQUENCE_SIZE,spots_train)
x_test,y_test = to_sequences(SEQUENCE_SIZE,spots_test)

print("Shape of training set: {}".format(x_train.shape))
print("Shape of test set: {}".format(x_test.shape))

Shape of training set: (11519, 10, 1)
Shape of test set: (4930, 10, 1)


In [40]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
from keras.callbacks import EarlyStopping
import numpy as np
from keras.callbacks import ModelCheckpoint

print('Build model...')
model = Sequential()
model.add(LSTM(64, dropout=0.0, recurrent_dropout=0.0, input_shape=(None, 1)))
model.add(Dense(32))
model.add(Dense(1))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
print('Train...')
checkpointer = ModelCheckpoint(filepath="best_weights.hdf5", verbose=1, save_best_only=True)

model.fit(x_train,y_train,validation_data=(x_test,y_test), callbacks=[monitor], verbose=2, epochs=1000)
model.load_weights('best_weights.hdf5')

Build model...
Train...
Train on 11519 samples, validate on 4930 samples
Epoch 1/1000


InvalidArgumentError: Received a label value of -9223372036854775808 which is outside the valid range of [0, 1).  Label values: -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808
	 [[Node: loss_1/dense_4_loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits = SparseSoftmaxCrossEntropyWithLogits[T=DT_FLOAT, Tlabels=DT_INT64, _device="/job:localhost/replica:0/task:0/cpu:0"](loss_1/dense_4_loss/Reshape_1, loss_1/dense_4_loss/Cast)]]

Caused by op 'loss_1/dense_4_loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits', defined at:
  File "/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/anaconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "/anaconda3/lib/python3.6/asyncio/base_events.py", line 422, in run_forever
    self._run_once()
  File "/anaconda3/lib/python3.6/asyncio/base_events.py", line 1432, in _run_once
    handle._run()
  File "/anaconda3/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/anaconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 117, in _handle_events
    handler_func(fileobj, events)
  File "/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
    if self.run_code(code, result):
  File "/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-7f9d37794292>", line 15, in <module>
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
  File "/anaconda3/lib/python3.6/site-packages/keras/models.py", line 863, in compile
    **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 830, in compile
    sample_weight, mask)
  File "/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 429, in weighted
    score_array = fn(y_true, y_pred)
  File "/anaconda3/lib/python3.6/site-packages/keras/losses.py", line 73, in sparse_categorical_crossentropy
    return K.sparse_categorical_crossentropy(y_true, y_pred)
  File "/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 3047, in sparse_categorical_crossentropy
    logits=logits)
  File "/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 1742, in sparse_softmax_cross_entropy_with_logits
    precise_logits, labels, name=name)
  File "/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 2418, in _sparse_softmax_cross_entropy_with_logits
    features=features, labels=labels, name=name)
  File "/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Received a label value of -9223372036854775808 which is outside the valid range of [0, 1).  Label values: -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808 -9223372036854775808
	 [[Node: loss_1/dense_4_loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits = SparseSoftmaxCrossEntropyWithLogits[T=DT_FLOAT, Tlabels=DT_INT64, _device="/job:localhost/replica:0/task:0/cpu:0"](loss_1/dense_4_loss/Reshape_1, loss_1/dense_4_loss/Cast)]]


In [41]:
pred = model.predict(x_test)
pred = np.argmax(pred, axis=1)

ValueError: could not convert string to float: ' 0x060004'

In [42]:
from sklearn import metrics

y_compare = np.argmax(y_test, axis=1)
score = metrics.accuracy_score(y_compare, pred)
print("Accurary Score: {}".format(score))

ValueError: axis(=1) out of bounds

In [43]:
print("Done!")

Done!


In [44]:
df.head(10)

Unnamed: 0,Operation,TimeStamp,ProcessName,ThreadID,ByteOffset,IOSize,ElapsedTime
0,Operation,TimeStamp,ProcessName,ThreadID,IOSize,ElapsedTime,
1,Operation,TimeStamp,ProcessName,ThreadID,ElapsedTime,,
2,Operation,TimeStamp,ProcessName,ThreadID,,,
3,Operation,TimeStamp,ProcessName,ThreadID,,,
4,Operation,TimeStamp,ProcessName,ThreadID,,,
5,Operation,TimeStamp,ProcessName,ThreadID,,,
6,Operation,TimeStamp,ProcessName,ThreadID,,,
7,Operation,TimeStamp,ProcessName,ThreadID,,,
8,1,153302,p1 ( 4),36,,,
9,1,153756,p0 (21316),7552,,,
