In [1]:
import pandas as pd
import numpy as np
import glob
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn import metrics
from tensorflow.python.data import Dataset
pd.options.display.max_columns = 1200
pd.options.display.max_rows = 1200
tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
### Use a particular month:

# df = pd.read_csv('data/autoperf-2019-01.csv', header=0).sample(frac=1)

### OR use the full year

# li = []
# for filename in glob.glob("/home/luckierdodge/repos/jupyter-notebooks/data/autoperf-201*.csv"):
#     frame = pd.read_csv(filename, header=0)
#     li.append(frame)
# df = pd.concat(li, axis=0, ignore_index=True)

###

In [3]:
# ### Filtering and cleaning the data before processing ###
df.dropna(inplace=True);

###

counts = df.zero_execName.value_counts().to_dict()
df_filtered = pd.DataFrame()
for name in counts:
    if counts[name] >= 1500:
        df_filtered = df_filtered.append(df[df["zero_execName"] == name])

###

df_filtered = df

###
df_filtered = df_filtered.sample(frac=1)

In [4]:
### Use a particular subset of features:

selected_features = df_filtered[[
    "zero_stdMpiTime",
#     "min_stdMpiTime",
#     "max_stdMpiTime",
#     "av_stdMpiTime",
#     "zero_numRanks",
#     "av_numProcessesOnNode",
#     "zero_elapsedTime",
#     "zero_numRanks",
#     "zero_elapsedCycles",
#     "zero_MPI_Barrier_callCount",
]]
features = selected_features.copy()

### OR use all of the features

# features = df.copy()
# features.drop(['zero_execName', 'run_date', 'zero_userName', 'zero_threadMode', 'min_userName', 'min_execName', 'min_threadMode', 'max_execName', 'max_userName', 'max_threadMode', 'av_execName', 'av_userName', 'av_threadMode'], axis=1, inplace=True)

###

# Convert Ints to floats
features = features.astype(np.float32)

# Factorize the targets for softmax classification (trying to predict executable name)
targets = pd.DataFrame()
targets['number'] = pd.factorize(df_filtered['zero_execName'], na_sentinel=1000)[0]

In [5]:
# Turn the pandas dataframe into tf.Keras friendly numpy arrays
def create_arrays(dataframe):
    array = []
    for col in dataframe.columns:
        array.append(dataframe[col].to_numpy())
    return array

In [17]:
# Separate our data into training and test sets

train_size = np.float32(.8)

train_examples = features.head(int(df_filtered.size * train_size)).values
train_targets = targets.head(int(df_filtered.size * train_size)).values
test_examples = features.tail(int(df_filtered.size * (1-train_size))).values
test_targets = targets.tail(int(df_filtered.size * (1-train_size))).values

In [7]:
# Try K-means clustering first

tf.reset_default_graph()

def input_fn():
    return tf.train.limit_epochs(tf.convert_to_tensor(train_examples, dtype=tf.float32), num_epochs=10)

num_clusters = targets.number.unique().size
kmeans = tf.contrib.factorization.KMeansClustering(num_clusters=num_clusters, use_mini_batch=False)

# train
num_iterations = 10
previous_centers = None
for _ in range(num_iterations):
    kmeans.train(input_fn)
    cluster_centers = kmeans.cluster_centers()
#     if previous_centers is not None:
#         print('delta: ', cluster_centers - previous_centers)
#     previous_centers = cluster_centers
    print('score: ', kmeans.score(input_fn))
# print('cluster centers: ', cluster_centers)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

score:  8463070000.0
score:  8429363700.0
score:  8401987600.0
score:  8347183000.0
score:  8336473600.0
score:  8329433000.0
score:  8321448000.0
score:  8314694000.0
score:  8313807400.0
score:  8309520000.0


In [8]:
def input_fn():
    return tf.train.limit_epochs(tf.convert_to_tensor(test_examples, dtype=tf.float32), num_epochs=10)
kmeans.evaluate(input_fn=input_fn)    

{'loss': 8309520000.0, 'score': 8309520000.0, 'global_step': 109}

In [None]:
# Look at predictions on test data

cluster_indices = list(kmeans.predict_cluster_index(input_fn))
clusters = [{} for _ in range(len(cluster_indices))]

for i, point in enumerate(test_examples):
    cluster_index = cluster_indices[i]
    center = cluster_centers[cluster_index]
    if test_targets[i][0] in clusters[cluster_index]:
        clusters[cluster_index][test_targets[i][0]] += 1
    else:
        clusters[cluster_index][test_targets[i][0]] = 1

for cluster in clusters:
    print("Cluster: " + str(cluster))

Cluster: {5: 236, 91: 13, 8: 101, 73: 9, 49: 6, 72: 1, 3: 4, 7: 12, 6: 16, 32: 10, 140: 1, 35: 13, 56: 7, 23: 2, 1: 7, 46: 1, 19: 3, 68: 2, 0: 2, 25: 1, 101: 1, 145: 1, 12: 1, 42: 1, 78: 1}
Cluster: {7: 11, 11: 10, 46: 4, 5: 267, 6: 21, 8: 47, 91: 7, 35: 19, 141: 3, 23: 5, 73: 9, 126: 1, 29: 1, 1: 7, 32: 4, 19: 2, 42: 3, 0: 1, 108: 1, 3: 2, 56: 1, 9: 1, 432: 2, 21: 1, 72: 1, 149: 1, 49: 2, 140: 2, 12: 1}
Cluster: {6: 475, 2: 243, 5: 101, 15: 251, 11: 23, 4: 4, 143: 3, 13: 19, 105: 1, 173: 1, 41: 2, 24: 2, 84: 6, 44: 3, 50: 6, 99: 2, 89: 2, 77: 1, 302: 1, 175: 4, 301: 1, 321: 1, 9: 13, 72: 4, 126: 1, 45: 6, 134: 1, 217: 1, 22: 3, 277: 1, 399: 1, 136: 5, 67: 1, 259: 1, 55: 1, 174: 1, 270: 1, 20: 1, 151: 1}
Cluster: {15: 220, 4: 898, 2: 172, 5: 6, 6: 40, 1: 5, 35: 1, 163: 1, 11: 2}
Cluster: {7: 31, 1: 403, 6: 15, 0: 59, 5: 12, 11: 1, 23: 2, 4: 1}
Cluster: {0: 50, 1: 303, 20: 2, 19: 1, 11: 37, 7: 12, 5: 49, 4: 36, 6: 75, 176: 1, 35: 10, 252: 1, 46: 10, 32: 11, 60: 1, 74: 2, 145: 1, 42: 8, 