In [1]:
import pandas as pd
import numpy as np
import glob
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn import metrics
from tensorflow.python.data import Dataset
pd.options.display.max_columns = 1200
pd.options.display.max_rows = 1200

In [2]:
### Use a particular month:

df = pd.read_csv('data/autoperf-2019-01.csv', header=0).sample(frac=1)

### OR use the full year

# li = []
# for filename in glob.glob("/home/luckierdodge/repos/jupyter-notebooks/data/autoperf-201*.csv"):
#     frame = pd.read_csv(filename, header=0)
#     li.append(frame)
# df = pd.concat(li, axis=0, ignore_index=True)

###

In [5]:
### Filtering and cleaning the data before processing ###
df.dropna(inplace=True);

###

counts = df.zero_execName.value_counts().to_dict()
df_filtered = pd.DataFrame()
for name in counts:
    if counts[name] >= 1000:
        df_filtered = df_filtered.append(df[df["zero_execName"] == name])

###

# df_filtered = df

###
df_filtered = df_filtered.sample(frac=1)

In [16]:
### Use this cell to explore columns ###

# for col in df_filtered.columns:
# #     if 'time' in col or 'Time' in col:
#         print(col)


# df_filtered.zero_numRanks.describe()
# df_filtered.zero_execName.value_counts().to_dict()

In [37]:
### Use a particular subset of features:

selected_features = df_filtered[[
    "zero_stdMpiTime",
#     "min_stdMpiTime",
#     "max_stdMpiTime",
#     "av_stdMpiTime",
#     "zero_numRanks",
#     "av_numProcessesOnNode",
#     "zero_elapsedTime",
#     "zero_numRanks",
#     "zero_elapsedCycles",
#     "zero_MPI_Barrier_callCount",
]]
features = selected_features.copy()

### OR use all of the features

# features = df.copy()
# features.drop(['zero_execName', 'run_date', 'zero_userName', 'zero_threadMode', 'min_userName', 'min_execName', 'min_threadMode', 'max_execName', 'max_userName', 'max_threadMode', 'av_execName', 'av_userName', 'av_threadMode'], axis=1, inplace=True)

###

# Factorize the targets for softmax classification (trying to predict executable name)
targets = pd.DataFrame()
targets['number'] = pd.factorize(df_filtered['zero_execName'], na_sentinel=-1)[0]
# targets['number'] = pd.factorize(df_filtered['zero_userName'], na_sentinel=-1)[0]

In [38]:
# Turn the pandas dataframe into tf.Keras friendly numpy arrays
def create_arrays(dataframe):
    array = []
    for col in dataframe.columns:
        array.append(dataframe[col].to_numpy())
    return array

In [39]:
# Separate our data into training and test sets

train_size = .8

train_examples = create_arrays(features.head(int(df_filtered.size * train_size)))
train_targets = create_arrays(targets.head(int(df_filtered.size * train_size)))
test_examples = create_arrays(features.tail(int(df_filtered.size * (1-train_size))))
test_targets = create_arrays(targets.tail(int(df_filtered.size * (1-train_size))))

In [40]:
# Create a TF.Keras model, taking into account the number of unique executables we're targetting in our softmax layer

### More complex model to tackle a full year

# model = tf.keras.models.Sequential([
#   tf.keras.layers.Dense(2048, activation='relu', input_shape=(1,)),
#   tf.keras.layers.Dropout(.2),
#   tf.keras.layers.Dense(1024, activation='relu'),
#   tf.keras.layers.Dropout(.2),
#   tf.keras.layers.Dense(targets.number.unique().size, activation='softmax')
# ])

### Single month, simplistic neural network

model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(128, activation='relu', input_shape=(1,)),
  tf.keras.layers.Dropout(.2),
  tf.keras.layers.Dense(targets.number.unique().size, activation='softmax')
])

###

In [44]:
model.compile(optimizer=tf.train.AdagradOptimizer(0.0005),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'],
            )

In [45]:
model.fit(train_examples, train_targets, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f3bfcf1a320>

In [46]:
test_loss, test_acc = model.evaluate(test_examples, test_targets)
print('Test accuracy:', test_acc)

Test accuracy: 0.89708334
