In [1]:
import sys
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Make the local prismio checkout importable. The second component is an
# absolute path, so on POSIX os.path.join ignores sys.path[0] and returns
# the checkout directory unchanged.
prismio_checkout = os.path.join(sys.path[0], '/Users/henryxu/Desktop/Research/prismio')
sys.path.insert(1, prismio_checkout)
from prismio.io_frame import IOFrame
# from prismio.io_frame_plotter import IOFramePlotter
import seaborn as sns
import prismio.models
import prismio

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [2]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import prismio.models

## Using average io_time per rank as the metric

In [3]:
# Fixed seed for the estimators that draw random numbers while fitting, so
# that a Restart & Run All reproduces the same scores.
# NOTE(review): prismio.models.get_model may shuffle its folds as well;
# confirm whether it needs its own seed.
RANDOM_STATE = 0

# Candidate classifiers. Each entry is [estimator, feature-set label]; the
# evaluation loop uses the label to choose the features the estimator is
# trained and scored on: 'MinMaxScaler' -> min-max normalised features,
# 'StandardScaler' -> standardised features, anything else -> raw features.
models = [
    [LogisticRegression(), 'MinMaxScaler'],
    [LogisticRegression(), 'Original data'],
    [DecisionTreeClassifier(random_state=RANDOM_STATE), 'Original data'],
    [KNeighborsClassifier(n_neighbors=3), 'MinMaxScaler'],
    [SVC(kernel="linear", C=0.025), 'MinMaxScaler'],
    [SVC(gamma=2, C=1), 'MinMaxScaler'],
    [GaussianProcessClassifier(1.0 * RBF(1.0)), 'MinMaxScaler'],
    [
        RandomForestClassifier(
            max_depth=5, n_estimators=10, max_features=1, random_state=RANDOM_STATE
        ),
        'Original data',
    ],
    [
        MLPClassifier(
            hidden_layer_sizes=(100, 100), max_iter=1000, alpha=0.1, random_state=RANDOM_STATE
        ),
        'MinMaxScaler',
    ],
    [AdaBoostClassifier(random_state=RANDOM_STATE), 'MinMaxScaler'],
    [GaussianNB(), 'Original data'],
    [QuadraticDiscriminantAnalysis(), 'MinMaxScaler']
]

In [22]:
# Columns used as independent variables for every dataset in this cell.
parameters = ['num_proc', 'avg_transfer_size_per_read/write']
iorData = pd.read_csv("/Users/henryxu/Desktop/paper/papers-2021-prismio/data/model/ior-io-info.csv")

# Target definition shared by the three IOR preparation calls below: the
# label is chosen by minimising 'avg_io_time_over_rank'.
ior_target = {'metric': 'avg_io_time_over_rank', 'optimization': 'min'}

# Raw features, then min-max normalised and standardised variants. Each call
# also returns the labels; `iory` keeps the value from the last call.
iorX, iory = prismio.models.prepare_data_select_io_subsystem(
    parameters, iorData, **ior_target
)
minmax_normalized_iorX, iory = prismio.models.prepare_data_select_io_subsystem(
    parameters, iorData, normalize_independent_variables=MinMaxScaler, **ior_target
)
standardized_iorX, iory = prismio.models.prepare_data_select_io_subsystem(
    parameters, iorData, normalize_independent_variables=StandardScaler, **ior_target
)

# Show the raw features and labels, followed by a separator and a blank line.
print(iorX, iory, "=================================", "", sep="\n")

haccioData = pd.read_csv("/Users/henryxu/Desktop/io-project/data/analysis/haccio-io-info.csv")
# Round the transfer size down to a multiple of 100000. The right-hand side
# must read from haccioData itself: the previous code read from `testData`,
# which is only defined in a later cell and is undefined on a fresh kernel.
haccioData['avg_transfer_size_per_read/write'] = (
    haccioData['avg_transfer_size_per_read/write'] // 100000 * 100000
)

# Raw, min-max normalised and standardised HACC-IO features, with the label
# chosen by minimising 'avg_io_time_over_rank'. `haccioy` keeps the value
# returned by the last call.
# NOTE(review): the scaler class is passed per call, so the HACC-IO features
# are presumably scaled with statistics from this dataset rather than from
# the IOR training data; confirm that this is intended.
haccioX, haccioy = prismio.models.prepare_data_select_io_subsystem(
    parameters, 
    haccioData, 
    metric='avg_io_time_over_rank', 
    optimization='min'
)
minmax_normalized_haccioX, haccioy = prismio.models.prepare_data_select_io_subsystem(
    parameters, 
    haccioData, 
    metric='avg_io_time_over_rank', 
    optimization='min',
    normalize_independent_variables=MinMaxScaler
)
standardized_haccioX, haccioy = prismio.models.prepare_data_select_io_subsystem(
    parameters, 
    haccioData, 
    metric='avg_io_time_over_rank', 
    optimization='min',
    normalize_independent_variables=StandardScaler
)

print(haccioX)
print(haccioy)
print("=================================")
print()

paradisData = pd.read_csv("/Users/henryxu/Desktop/io-project/data/analysis/paradis-io-info.csv")

# Round the transfer size down to a multiple of 10.
size_column = 'avg_transfer_size_per_read/write'
paradisData[size_column] = paradisData[size_column] // 10 * 10

# Target definition shared by the three ParaDiS preparation calls below.
paradis_target = {'metric': 'avg_io_time_over_rank', 'optimization': 'min'}

# Raw features first, then the two scaled variants; `paradisy` keeps the
# labels returned by the last call.
paradisX, paradisy = prismio.models.prepare_data_select_io_subsystem(
    parameters, paradisData, **paradis_target
)
minmax_normalized_paradisX, paradisy = prismio.models.prepare_data_select_io_subsystem(
    parameters, paradisData, normalize_independent_variables=MinMaxScaler, **paradis_target
)
standardized_paradisX, paradisy = prismio.models.prepare_data_select_io_subsystem(
    parameters, paradisData, normalize_independent_variables=StandardScaler, **paradis_target
)

# Raw features, labels, a separator and a trailing blank line.
for chunk in (paradisX, paradisy, "=================================", ""):
    print(chunk)

# Train every candidate on the IOR data and report its accuracy on the IOR
# (training), HACC-IO and ParaDiS datasets.
for model in models:
    estimator, scaling = model[0], model[1]
    print(type(estimator).__name__ + ", " + scaling + ":")

    # Select the feature variant named by the model's scaling label; any
    # label other than the two scaler names means the raw features.
    if scaling == 'MinMaxScaler':
        finalIorX = minmax_normalized_iorX
        finalHaccioX = minmax_normalized_haccioX
        finalParadisX = minmax_normalized_paradisX
    elif scaling == 'StandardScaler':
        finalIorX = standardized_iorX
        finalHaccioX = standardized_haccioX
        finalParadisX = standardized_paradisX
    else:
        finalIorX = iorX
        finalHaccioX = haccioX
        finalParadisX = paradisX

    mymodel = prismio.models.get_model(finalIorX, iory, estimator, k=5)

    print()
    print()
    # The parentheses were previously written as "\(" and "\)", which are
    # invalid escape sequences and printed stray backslashes.
    print("prediction on traning (IOR) data:")
    print(mymodel.predict(finalIorX))
    print(mymodel.score(finalIorX, iory))
    print()
    print()

    print("prediction on Haccio data:")
    print(mymodel.predict(finalHaccioX))
    print(mymodel.score(finalHaccioX, haccioy))
    print()
    print()

    print("prediction on Paradis data:")
    print(mymodel.predict(finalParadisX))
    print(mymodel.score(finalParadisX, paradisy))
    print()
    print()
    print("=================================")
    print()
    print()

   

[[     64   65536]
 [    128   65536]
 [    256   65536]
 [    512   65536]
 [   1024   65536]
 [     64    4096]
 [    128    4096]
 [    256    4096]
 [    512    4096]
 [   1024    4096]
 [     64 1048576]
 [    128 1048576]
 [    256 1048576]
 [    512 1048576]
 [   1024 1048576]
 [     64 4194304]
 [    128 4194304]
 [    256 4194304]
 [    512 4194304]
 [   1024 4194304]]
['summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'lassen node-local bb' 'lassen node-local bb'
 'lassen node-local bb' 'lassen node-local bb' 'lassen node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb']

[[6.400e+01 4.400e+06]
 [1.280e+02 3.400e+06]
 [2.560e+02 3.000e+06]
 [5.120e+02 2.700e+06]
 [1.024e+03 2.600e+06]
 [6.400e+01 2.900e+06]
 [1.280e+02 

Validation Accuracy -- avg: 0.800, max: 1.000, min: 0.500, std: 0.245


prediction on traning \(IOR\) data:
['summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'lassen node-local bb' 'lassen node-local bb'
 'lassen node-local bb' 'lassen node-local bb' 'lassen node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb']
1.0


prediction on Haccio data:
['summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'lassen node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'lassen node-local bb']
0.4


prediction on Paradis data:
['summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summ

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.d

## Using average io_bandwidth per read/write as the metric

In [15]:
# Fixed seed for the estimators that draw random numbers while fitting, so
# that a Restart & Run All reproduces the same scores.
# NOTE(review): prismio.models.get_model may shuffle its folds as well;
# confirm whether it needs its own seed.
RANDOM_STATE = 0

# Fresh, unfitted candidates for the bandwidth experiment. Each entry is
# [estimator, feature-set label]; the evaluation loop uses the label to
# choose between min-max normalised ('MinMaxScaler'), standardised
# ('StandardScaler') and raw (any other label) features.
models = [
    [LogisticRegression(), 'MinMaxScaler'],
    [LogisticRegression(), 'Original data'],
    [DecisionTreeClassifier(random_state=RANDOM_STATE), 'Original data'],
    [KNeighborsClassifier(n_neighbors=3), 'MinMaxScaler'],
    [SVC(kernel="linear", C=0.025), 'MinMaxScaler'],
    [SVC(gamma=2, C=1), 'MinMaxScaler'],
    [GaussianProcessClassifier(1.0 * RBF(1.0)), 'MinMaxScaler'],
    [
        RandomForestClassifier(
            max_depth=5, n_estimators=10, max_features=1, random_state=RANDOM_STATE
        ),
        'Original data',
    ],
    [
        MLPClassifier(
            hidden_layer_sizes=(100, 100), max_iter=1000, alpha=0.1, random_state=RANDOM_STATE
        ),
        'MinMaxScaler',
    ],
    [AdaBoostClassifier(random_state=RANDOM_STATE), 'MinMaxScaler'],
    [GaussianNB(), 'Original data'],
    [QuadraticDiscriminantAnalysis(), 'MinMaxScaler']
]

In [18]:
# Columns used as independent variables in the bandwidth experiment.
parameters = ['num_proc', 'avg_transfer_size_per_read/write']
data = pd.read_csv("/Users/henryxu/Desktop/paper/papers-2021-prismio/data/model/ior-io-info.csv")

# Target definition shared by the three calls below: the label is chosen by
# maximising 'avg_io_bandwidth_per_read/write'.
bandwidth_target = {'metric': 'avg_io_bandwidth_per_read/write', 'optimization': 'max'}

# Raw training features, then the min-max normalised and standardised
# variants; `y` keeps the labels returned by the last call.
X, y = prismio.models.prepare_data_select_io_subsystem(
    parameters, data, **bandwidth_target
)
minmax_normalized_X, y = prismio.models.prepare_data_select_io_subsystem(
    parameters, data, normalize_independent_variables=MinMaxScaler, **bandwidth_target
)
standardized_X, y = prismio.models.prepare_data_select_io_subsystem(
    parameters, data, normalize_independent_variables=StandardScaler, **bandwidth_target
)

# Raw features and labels, followed by a separator and a blank line.
print(X, y, "=================================", "", sep="\n")

testData = pd.read_csv("/Users/henryxu/Desktop/io-project/data/analysis/haccio-io-info.csv")

# Round the transfer size down to a multiple of 100000.
transfer_size = 'avg_transfer_size_per_read/write'
testData[transfer_size] = testData[transfer_size] // 100000 * 100000

# Target definition shared by the three test-set preparation calls below.
test_target = {'metric': 'avg_io_bandwidth_per_read/write', 'optimization': 'max'}

# Raw test features, then the two scaled variants; `testy` keeps the labels
# returned by the last call.
testX, testy = prismio.models.prepare_data_select_io_subsystem(
    parameters, testData, **test_target
)
minmax_normalized_testX, testy = prismio.models.prepare_data_select_io_subsystem(
    parameters, testData, normalize_independent_variables=MinMaxScaler, **test_target
)
standardized_testX, testy = prismio.models.prepare_data_select_io_subsystem(
    parameters, testData, normalize_independent_variables=StandardScaler, **test_target
)

# Raw features, labels, a separator and a trailing blank line.
for chunk in (testX, testy, "=================================", ""):
    print(chunk)

# Train every candidate on the training data and report its accuracy on the
# training and test sets.
for model in models:
    estimator, scaling = model[0], model[1]
    print(type(estimator).__name__ + ", " + scaling + ":")

    # Select the feature variant named by the model's scaling label. Every
    # branch must assign the same name, `finalTestX`: the previous code
    # assigned `finaltestX` in two branches, so models using the raw features
    # were scored on min-max features left over from an earlier iteration.
    if scaling == 'MinMaxScaler':
        finalX = minmax_normalized_X
        finalTestX = minmax_normalized_testX
    elif scaling == 'StandardScaler':
        finalX = standardized_X
        finalTestX = standardized_testX
    else:
        finalX = X
        finalTestX = testX

    mymodel = prismio.models.get_model(finalX, y, estimator, k=5)

    print()
    print()
    print("prediction on traning data:")
    print(mymodel.predict(finalX))
    print(mymodel.score(finalX, y))
    print()
    print()

    print("prediction on testing data:")
    print(mymodel.predict(finalTestX))
    print(mymodel.score(finalTestX, testy))
    print()
    print()
    print("=================================")
    print()
    print()
    

[[     64   65536]
 [    128   65536]
 [    256   65536]
 [    512   65536]
 [   1024   65536]
 [     64    4096]
 [    128    4096]
 [    256    4096]
 [    512    4096]
 [   1024    4096]
 [     64 1048576]
 [    128 1048576]
 [    256 1048576]
 [    512 1048576]
 [   1024 1048576]
 [     64 4194304]
 [    128 4194304]
 [    256 4194304]
 [    512 4194304]
 [   1024 4194304]]
['summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'lassen gpfs' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'lassen gpfs' 'lassen gpfs' 'summit gpfs' 'summit gpfs' 'summit gpfs']

[[6.400e+01 4.400e+06]
 [1.280e+02 3.400e+06]
 [2.560e+02 3.000e+06]
 [5.120e+02 2.700e+06]
 [1.024e+03 2.600e+06]
 [6.400e+01 2.900e+06]
 [1.280e+02 2.900e+06]
 [5.120e+02 3.000e+06]
 [1.024e+03 3.000e+06

['summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'lassen gpfs' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'lassen gpfs' 'lassen gpfs' 'summit gpfs' 'summit gpfs' 'summit gpfs']
1.0


prediction on testing data:
['lassen gpfs' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'lassen gpfs'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb']
0.0


GaussianNB, Original data:
Validation Accuracy -- avg: 0.800, max: 1.000, min: 0.500, std: 0.187


prediction on traning data:
['summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-local bb'
 'summit node-local bb' 'summit node-local bb' 'summit node-loc

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
2 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/henryxu/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/henryxu/miniconda3/lib/python3.9/site-packages/sklearn/discriminant_analysis.py", line 869, in fit
    raise ValueError(
ValueError: y has only 1 sample in class lassen gpfs, covariance is ill defined.

------------------------------------------------------