In [7]:
import tensorflow as tf
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
import numpy as np
from scipy.stats import zscore
from matplotlib.pyplot import figure, show
from sklearn import metrics
import io

In [8]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

In [40]:
import csv
import sys
import pandas as pd
import os
import glob

path  = "/Users/petronillagriffith/desktop/test/traces" 
#path2  = "/Users/petronillagriffith/desktop/test/DATA/data"
filename_wr = os.path.join(path,'*.csv')
#filename_rd = os.path.join(path,'trace.csv')
names = ['Operation', 'TimeStamp', 'ProcessName', 'ThreadID', 'IrpPtr' , 'ByteOffset', 'IOSize','ElapsedTime', 'DiskNum', 'IrpFlags', 'DiskSvcTime', 'IOPri' , ' VolSnap', 'FileObject', ' FileName']

#df = pd.read_csv(filename_rd,engine='python',header=None,skiprows = 73,names=names,na_values=['-1'], index_col=False)


all_files = glob.glob(os.path.join(path, "*.csv"))     # advisable to use os.path.join as this makes concatenation OS independent
print(all_files)
df_from_each_file = (pd.read_csv(f,engine='python',header=None,names=names,na_values=['-1'], index_col=False) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)


concatenated_df.columns = names



headers = list(concatenated_df.columns.values)
print(headers)

#skiprows=lambda x: x % 2 != 0

['/Users/petronillagriffith/desktop/test/traces/*.csv', '/Users/petronillagriffith/desktop/test/traces/traces.csv']
['Operation', 'TimeStamp', 'ProcessName', 'ThreadID', 'IrpPtr', 'ByteOffset', 'IOSize', 'ElapsedTime', 'DiskNum', 'IrpFlags', 'DiskSvcTime', 'IOPri', ' VolSnap', 'FileObject', ' FileName']


In [41]:
print("Before drop: {}".format(concatenated_df.columns))

concatenated_df.drop(concatenated_df.columns[[4, 8,9,10,11,12,13,14]], axis=1, inplace=True)
print("After drop: {}".format(concatenated_df.columns))

Before drop: Index(['Operation', 'TimeStamp', 'ProcessName', 'ThreadID', 'IrpPtr',
       'ByteOffset', 'IOSize', 'ElapsedTime', 'DiskNum', 'IrpFlags',
       'DiskSvcTime', 'IOPri', ' VolSnap', 'FileObject', ' FileName'],
      dtype='object')
After drop: Index(['Operation', 'TimeStamp', 'ProcessName', 'ThreadID', 'ByteOffset',
       'IOSize', 'ElapsedTime'],
      dtype='object')


In [42]:
df = concatenated_df

In [43]:
df.head(5)

Unnamed: 0,Operation,TimeStamp,ProcessName,ThreadID,ByteOffset,IOSize,ElapsedTime
0,Operation,TimeStamp,ProcessName,ThreadID,IOSize,ElapsedTime,
1,Operation,TimeStamp,ProcessName,ThreadID,ElapsedTime,,
2,Operation,TimeStamp,ProcessName,ThreadID,,,
3,Operation,TimeStamp,ProcessName,ThreadID,,,
4,Operation,TimeStamp,ProcessName,ThreadID,,,


In [44]:
df

Unnamed: 0,Operation,TimeStamp,ProcessName,ThreadID,ByteOffset,IOSize,ElapsedTime
0,Operation,TimeStamp,ProcessName,ThreadID,IOSize,ElapsedTime,
1,Operation,TimeStamp,ProcessName,ThreadID,ElapsedTime,,
2,Operation,TimeStamp,ProcessName,ThreadID,,,
3,Operation,TimeStamp,ProcessName,ThreadID,,,
4,Operation,TimeStamp,ProcessName,ThreadID,,,
5,Operation,TimeStamp,ProcessName,ThreadID,,,
6,DiskRead,153302,p1 ( 4),36,,,
7,DiskRead,153756,p0 (21316),7552,,,
8,DiskRead,160029,p0 (21316),7552,,,
9,DiskRead,160057,p1 ( 4),36,,,


In [45]:
for index, row in df.iterrows():
    a8 = row['IOSize'].strip()
    if (a8 == '0x00001000'):
        df.at[index, 'IOSize'] = 4096
    if (a8 == '0x00008000'):
        df.at[index, 'IOSize'] = 32768
    if (a8 == '0x0000c000'):
        df.at[index, 'IOSize'] = 49152
    if (a8 == '0x00005000'):
        df.at[index, 'IOSize'] = 20480
    if (a8 == '0x00002000'):
        df.at[index, 'IOSize'] = 8192
    if (a8 == '0x00010000'):
        df.at[index, 'IOSize'] = 65536
    if (a8 == '0x00000800'):
        df.at[index, 'IOSize'] = 2048
    if (a8 == '0x00000c00'):
        df.at[index, 'IOSize'] = 3072
    if (a8 == '0x0000d000'):
        df.at[index, 'IOSize'] = 53248
    if (a8 == '0x00018000'):
        df.at[index, 'IOSize'] = 98304
    if (a8 == '0x00007000'):
        df.at[index, 'IOSize'] = 28672
    if (a8 == '0x00006c00'):
        df.at[index, 'IOSize'] = 27648
    if (a8 == '0x00007000'):
        df.at[index, 'IOSize'] = 28672
    if (a8 == '0x0000f000'):
        df.at[index, 'IOSize'] = 61440
    if (a8 == '0x00002400'):
        df.at[index, 'IOSize'] = 9216
    if (a8 == '0x0000fa00'):
        df.at[index, 'IOSize'] = 64000

AttributeError: 'float' object has no attribute 'strip'

In [46]:
for index, row in df.iterrows():
    a7 = row['ByteOffset'].strip()
    if (a7 == '0x01b0130000'):
        df.at[index, 'ByteOffset'] = 7249002496
    if (a7 == '0x01e816f000'):
        df.at[index, 'ByteOffset'] = 8188784640
    if (a7 == '0x01e5bc6000'):
        df.at[index, 'ByteOffset'] = 8149295104

AttributeError: 'float' object has no attribute 'strip'

In [47]:
print("Starting file:")
print(df[0:10])

print("Ending file:")
print(df[-10:])

Starting file:
                 Operation  TimeStamp  ProcessName  ThreadID   ByteOffset  \
0                Operation  TimeStamp  ProcessName  ThreadID       IOSize   
1                Operation  TimeStamp  ProcessName  ThreadID  ElapsedTime   
2                Operation  TimeStamp  ProcessName  ThreadID          NaN   
3                Operation  TimeStamp  ProcessName  ThreadID          NaN   
4                Operation  TimeStamp  ProcessName  ThreadID          NaN   
5                Operation  TimeStamp  ProcessName  ThreadID          NaN   
6                 DiskRead     153302      p1 ( 4)        36          NaN   
7                 DiskRead     153756   p0 (21316)      7552          NaN   
8                 DiskRead     160029   p0 (21316)      7552          NaN   
9                 DiskRead     160057      p1 ( 4)        36          NaN   

        IOSize ElapsedTime  
0  ElapsedTime         NaN  
1          NaN         NaN  
2          NaN         NaN  
3          NaN       

In [48]:
df = concatenated_df

In [49]:
df

Unnamed: 0,Operation,TimeStamp,ProcessName,ThreadID,ByteOffset,IOSize,ElapsedTime
0,Operation,TimeStamp,ProcessName,ThreadID,IOSize,ElapsedTime,
1,Operation,TimeStamp,ProcessName,ThreadID,ElapsedTime,,
2,Operation,TimeStamp,ProcessName,ThreadID,,,
3,Operation,TimeStamp,ProcessName,ThreadID,,,
4,Operation,TimeStamp,ProcessName,ThreadID,,,
5,Operation,TimeStamp,ProcessName,ThreadID,,,
6,DiskRead,153302,p1 ( 4),36,,,
7,DiskRead,153756,p0 (21316),7552,,,
8,DiskRead,160029,p0 (21316),7552,,,
9,DiskRead,160057,p1 ( 4),36,,,


In [50]:
df.shape

(4269, 7)

In [51]:
print("Ending file:")
print(concatenated_df[-10:])

Ending file:
                    Operation  TimeStamp  ProcessName ThreadID     ByteOffset  \
4259                 DiskRead  905032689   p0 (20388)    16912   0x0262984000   
4260                 DiskRead  905034389   p0 (20388)    16912   0x01e478c000   
4261                 DiskRead  905040743   p0 (20388)    16912   0x0246274000   
4262                 DiskRead  905047900   p0 (20388)    16912   0x0250ed8000   
4263                 DiskRead  905048447   p0 (20388)    16912   0x0221e1b000   
4264                DiskWrite  905050383      p1 ( 4)       32   0x039f268000   
4265                 DiskRead  905055080   p0 (20388)    16912   0x02551ae000   
4266                 DiskRead  905059849   p0 (20388)    16912   0x02270d6000   
4267                 DiskRead  905064475   p0 (20388)    16912   0x023cf24000   
4268                 DiskRead  905069702   p0 (20388)    16912   0x01ed54f000   

           IOSize ElapsedTime  
4259   0x00001000        5044  
4260   0x00001000        1525  

In [52]:
df.to_csv(filename_wr,index=False)

In [53]:
df.to_csv(filename_wr,index=False)
df = pd.read_csv(filename_wr,engine='python',index_col=False)
df.dtypes

Operation      object
TimeStamp      object
ProcessName    object
ThreadID       object
ByteOffset     object
IOSize         object
ElapsedTime    object
dtype: object

In [54]:
print(df[10:50])

                  Operation TimeStamp  ProcessName ThreadID ByteOffset IOSize  \
10                 DiskRead    165085   p0 (21316)     7552        NaN    NaN   
11                 DiskRead    171596   p0 (21316)     7552        NaN    NaN   
12                 DiskRead    177048   p0 (21316)     7552        NaN    NaN   
13                 DiskRead    182217   p0 (21316)     7552        NaN    NaN   
14                 DiskRead    184062   p0 (21316)     7552        NaN    NaN   
15                 DiskRead    190951   p0 (21316)     7552        NaN    NaN   
16                 DiskRead    195231   p0 (21316)     7552        NaN    NaN   
17                 DiskRead    201525   p0 (21316)     7552        NaN    NaN   
18                 DiskRead    206308   p0 (21316)     7552        NaN    NaN   
19                 DiskRead    212317   p0 (21316)     7552        NaN    NaN   
20                 DiskRead    217443   p0 (21316)     7552        NaN    NaN   
21                 DiskRead 

In [55]:
df.dtypes

Operation      object
TimeStamp      object
ProcessName    object
ThreadID       object
ByteOffset     object
IOSize         object
ElapsedTime    object
dtype: object

In [56]:
print("Done!")

Done!


In [57]:
df

Unnamed: 0,Operation,TimeStamp,ProcessName,ThreadID,ByteOffset,IOSize,ElapsedTime
0,Operation,TimeStamp,ProcessName,ThreadID,IOSize,ElapsedTime,
1,Operation,TimeStamp,ProcessName,ThreadID,ElapsedTime,,
2,Operation,TimeStamp,ProcessName,ThreadID,,,
3,Operation,TimeStamp,ProcessName,ThreadID,,,
4,Operation,TimeStamp,ProcessName,ThreadID,,,
5,Operation,TimeStamp,ProcessName,ThreadID,,,
6,DiskRead,153302,p1 ( 4),36,,,
7,DiskRead,153756,p0 (21316),7552,,,
8,DiskRead,160029,p0 (21316),7552,,,
9,DiskRead,160057,p1 ( 4),36,,,


In [58]:
for index, row in df.iterrows():
    a2 = row['Operation'].strip()
    if (a2 == 'DiskRead'):
        df.at[index, 'Operation'] = 1
    if (a2 == 'DiskWrite'):
        df.at[index, 'Operation'] = 2
    if (a2 == 'DiskFlush'):
        df.at[index, 'Operation'] = 3
        
 #   x = row['IOSize'].strip()
   # dec = int(x, 16)
    #print(dec)
   # df.at[index,'IOSize'] = dec
   # s.add(dec)

In [59]:
df['ByteOffset'] = df['ByteOffset'].apply(int, base=0)

ValueError: invalid literal for int() with base 0: 'IOSize'

In [None]:
df['IOSize'] = df['IOSize'].apply(int, base=0)

In [None]:
for index, row in df.iterrows():
    a7 = row['ProcessName'].strip()
    x = a7.find("(")
    y =  a7.find(")")
    df.at[index, 'ProcessName'] = int(a7[x+1:y])

In [60]:
df.dtypes

Operation      object
TimeStamp      object
ProcessName    object
ThreadID       object
ByteOffset     object
IOSize         object
ElapsedTime    object
dtype: object

In [61]:
df.head(5)

Unnamed: 0,Operation,TimeStamp,ProcessName,ThreadID,ByteOffset,IOSize,ElapsedTime
0,Operation,TimeStamp,ProcessName,ThreadID,IOSize,ElapsedTime,
1,Operation,TimeStamp,ProcessName,ThreadID,ElapsedTime,,
2,Operation,TimeStamp,ProcessName,ThreadID,,,
3,Operation,TimeStamp,ProcessName,ThreadID,,,
4,Operation,TimeStamp,ProcessName,ThreadID,,,


In [62]:
path2  = "/Users/petronillagriffith/desktop/test/DATA/data/output"
filename_wr_1 = os.path.join(path2,'output.csv')
df.to_csv(filename_wr_1,index=False)
df = pd.read_csv(filename_wr_1,engine='python',index_col=False)

In [63]:
df['ElapsedTime'] = df['ElapsedTime'].apply(int, base=0)

TypeError: int() can't convert non-string with explicit base

In [64]:
df.dtypes

Operation      object
TimeStamp      object
ProcessName    object
ThreadID       object
ByteOffset     object
IOSize         object
ElapsedTime    object
dtype: object

In [65]:
from sklearn.model_selection import train_test_split

x, y = to_xy(df,'Operation')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.001, random_state=99)d



ValueError: could not convert string to float: ' 0x060004'

In [66]:
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

model = Sequential()
model.add(Dense(100, input_dim=x.shape[1], activation = 'relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(y.shape[1],activation='relu'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=1, mode='auto')
checkpointer = ModelCheckpoint(filepath="best_weights.hdf5", verbose=1, save_best_only=True)

model.fit(x,y,validation_data=(x_test,y_test),callbacks=[monitor, checkpointer],verbose=0,epochs=1000)
model.load_weights('best_weights.hdf5')

AttributeError: 'int' object has no attribute 'shape'

In [67]:
import tensorflow as tf

pred = model.predict(x_test)
print (pred [0:5])

NameError: name 'x_test' is not defined

In [68]:
pred = np.argmax(pred,axis=1)
print(pred)

NameError: name 'pred' is not defined

In [69]:
y_compare = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_compare, pred)
print("Final Accuracy: {}".format(score))

NameError: name 'y_test' is not defined