In [1]:
import numpy as np
from matplotlib import pyplot as plt
import glob, operator, time, shutil, scipy, sys, pickle
import pandas as pd

from sklearn import preprocessing
from sklearn.manifold import TSNE
import xgboost as xgb
from sklearn.model_selection import train_test_split
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV

In [37]:
tlog = pd.read_csv('../data/mira_task_logs/ANL-ALCF-TH-MIRA_20160101_20161231.csv').dropna()
jid2exe = {}
jid2tasks = tlog.COBALT_JOBID.value_counts()
for jid, exe in zip(tlog.COBALT_JOBID, tlog.EXECUTABLE_GENID):
    if jid2tasks[jid] > 1:continue
    jid2exe[jid] = exe
print(len(jid2exe), tlog.shape)
with open('jid2exe-17.pkl', 'wb') as handle:
    pickle.dump(jid2exe, handle)

145 (161931, 22)


In [61]:
df19 = pd.read_csv('../data/mira_djc_logs/ANL-ALCF-DJC-MIRA_20160101_20161231.csv')
df19.describe()

Unnamed: 0,COBALT_JOBID,QUEUED_DATE_ID,START_DATE_ID,END_DATE_ID,USERNAME_GENID,PROJECT_NAME_GENID,WALLTIME_SECONDS,RUNTIME_SECONDS,NODES_USED,NODES_REQUESTED,...,IS_SUBBLOCK,IS_SUBBLOCK_ONLY,IS_MULTILOCATION_ONLY,IS_MULTILOCATION_SUBBLOCK,IS_CONSECUTIVE_ONLY,IS_SINGLE_ONLY,IS_NO_TASKS,IS_OTHER,OVERBURN_CORE_HOURS,IS_OVERBURN
count,58975.0,58975.0,58975.0,58975.0,58975.0,58975.0,58975.0,58975.0,58975.0,58975.0,...,58975.0,58975.0,58975.0,58975.0,58975.0,58975.0,58975.0,58975.0,58975.0,58975.0
mean,831084.182993,20160560.0,20160590.0,20160590.0,45892780000000.0,40277100000000.0,11763.958627,6980.900517,2328.408444,2292.433031,...,0.066723,0.038881,0.014582,0.018025,0.089496,0.783145,0.0,0.055871,24436.94,0.032573
std,88438.897719,638.5715,355.1358,342.4079,30246980000000.0,30943070000000.0,14813.604632,11638.633525,4743.42525,4726.841897,...,0.249544,0.193313,0.119875,0.133041,0.28546,0.412106,0.0,0.229675,274114.6,0.177518
min,640192.0,20151030.0,20151230.0,20160100.0,24105980000.0,415492000000.0,300.0,23.0,512.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,761660.5,20160310.0,20160310.0,20160310.0,20363760000000.0,6994811000000.0,2400.0,689.0,512.0,512.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,823273.0,20160520.0,20160520.0,20160520.0,39882120000000.0,34258860000000.0,3600.0,2250.0,1024.0,1024.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,909772.5,20160900.0,20160900.0,20160900.0,70757820000000.0,65676530000000.0,18000.0,7464.5,2048.0,2048.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,989960.0,20161230.0,20161230.0,20161230.0,99839380000000.0,99455720000000.0,86400.0,114464.0,49152.0,49152.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,12591070.0,1.0


In [39]:
# with open('jid2exe-19.pkl', 'rb') as handle:
#     jid2exe = pickle.load(handle)

In [40]:
def select_job_logs(jlog, top_q=5, top_u=20, top_p=20, top_e=10):
    jlog_sel = jlog[jlog.EXIT_CODE == 0]
    sel_cols = ['QUEUED_TIMESTAMP', 'USERNAME_GENID', 'PROJECT_NAME_GENID', 'QUEUE_NAME', 'WALLTIME_SECONDS',\
                'NODES_REQUESTED', 'START_TIMESTAMP', 'END_TIMESTAMP', 'COBALT_JOBID', 'EXECUTABLE_GENID',\
                'RUNTIME_SECONDS']
    jlog_sel = jlog_sel[sel_cols]
    
    top_queues = jlog_sel.QUEUE_NAME.value_counts().head(top_q).keys()
    _q_mask = jlog_sel.QUEUE_NAME == top_queues[0]
    for _q in top_queues[1:]:
        _q_mask = _q_mask | (jlog_sel.QUEUE_NAME == _q)
    jlog_sel = jlog_sel[_q_mask]
    
    top_usr = jlog_sel.USERNAME_GENID.value_counts().head(top_u).keys()
    _u_mask = jlog_sel.USERNAME_GENID == top_usr[0]
    for _u in top_usr[1:]:
        _u_mask = _u_mask | (jlog_sel.USERNAME_GENID == _u)
    jlog_sel = jlog_sel[_u_mask]
    
    top_prj = jlog_sel.PROJECT_NAME_GENID.value_counts().head(top_p).keys()
    _p_mask = jlog_sel.PROJECT_NAME_GENID == top_prj[0]
    for _p in top_prj[1:]:
        _p_mask = _p_mask | (jlog_sel.PROJECT_NAME_GENID == _p)
    jlog_sel = jlog_sel[_p_mask]
    
    top_exe = jlog_sel.EXECUTABLE_GENID.value_counts().head(top_e).keys()
    _e_mask = jlog_sel.EXECUTABLE_GENID == top_exe[0]
    for _e in top_exe[1:]:
        _e_mask = _e_mask | (jlog_sel.EXECUTABLE_GENID == _e)
    jlog_sel = jlog_sel[_e_mask]
    
#     jlog_sel['RUNTIME_SECONDS'] = [(pd.to_datetime(_e) - pd.to_datetime(_s)).total_seconds() for \
#                                    _s, _e in zip(jlog_sel.START_TIMESTAMP, jlog_sel.END_TIMESTAMP)]
    jlog_sel['QUEUED_HOUR'] = [pd.to_datetime(_dts).round('2h').hour for _dts in jlog_sel.QUEUED_TIMESTAMP.values]
    
    return jlog_sel


def insert_exe_id(jlog, ):
    with open('jid2exe-17.pkl', 'rb') as handle:
        jid2exe = pickle.load(handle)
    _jid_mask = np.array([jid2exe.get(_jid) is not None for _jid in jlog.COBALT_JOBID])
    jlog_exeid = jlog[_jid_mask].copy()
    jlog_exeid['EXECUTABLE_GENID'] = [jid2exe[_jid] for _jid in jlog_exeid.COBALT_JOBID]
    return jlog_exeid

jlog_exeid = insert_exe_id(df19)
sel_jlog = select_job_logs(jlog_exeid, top_q=5, top_u=10, top_p=10, top_e=20)
print(sel_jlog.shape)
sel_jlog.head()

(77, 12)


Unnamed: 0,QUEUED_TIMESTAMP,USERNAME_GENID,PROJECT_NAME_GENID,QUEUE_NAME,WALLTIME_SECONDS,NODES_REQUESTED,START_TIMESTAMP,END_TIMESTAMP,COBALT_JOBID,EXECUTABLE_GENID,RUNTIME_SECONDS,QUEUED_HOUR
7019,2016-02-03 20:58:02,23737846325960,61001179162879,prod-short,21600.0,512.0,2016-02-03 20:58:44,2016-02-04 02:23:46,717583,87380555825019,19502.0,20
10526,2016-02-24 17:44:36,99839380010887,26847220504119,prod-short,14400.0,512.0,2016-02-24 17:45:20,2016-02-24 19:39:29,736988,98084828768896,6849.0,18
10932,2016-02-26 17:12:39,99839380010887,26847220504119,prod-short,3600.0,512.0,2016-02-26 18:27:16,2016-02-26 18:43:10,738692,98084828768896,954.0,18
20997,2016-04-10 21:56:14,21691304129404,67960307499375,prod-short,3600.0,512.0,2016-04-10 21:59:57,2016-04-10 22:03:25,786982,20521633802224,208.0,22
20998,2016-04-10 22:01:08,21691304129404,67960307499375,prod-short,3600.0,512.0,2016-04-10 22:04:27,2016-04-10 22:08:12,786985,20521633802224,225.0,22


In [41]:
sel_jlog.EXECUTABLE_GENID.value_counts().head(10)

20521633802224    60
98084828768896     7
99206348647925     4
4635989155916      1
87380555825019     1
94492462414546     1
21978844107118     1
29672964536432     1
29106894102462     1
Name: EXECUTABLE_GENID, dtype: int64

In [42]:
sel_jlog[sel_jlog.EXECUTABLE_GENID=='6495151625245'].WALLTIME_SECONDS.unique(), sel_jlog[sel_jlog.EXECUTABLE_GENID=='6495151625245'].NODES_REQUESTED.unique()

(array([], dtype=float64), array([], dtype=float64))

In [43]:
def onehot_enc(val):
    u_val = sorted(set(val))
    tbl = {v:i for i, v in enumerate(u_val)}
    ret = np.zeros((len(val), len(u_val)), dtype=np.float32)
    for i, v in enumerate(val):
        ret[i][tbl[v]] = 1
        
    return ret, u_val

def extract_feature(jlog, with_exe=True):
    _qsub_h_enc, _qsub_h_key = onehot_enc(jlog.QUEUED_HOUR.values)
    _usr_enc, _usr_key       = onehot_enc(jlog.USERNAME_GENID.values)
    _proj_enc, _proj_key     = onehot_enc(jlog.PROJECT_NAME_GENID.values)
    _exe_enc, _exe_key       = onehot_enc(jlog.EXECUTABLE_GENID.values)
    _queue_enc, _queue_key   = onehot_enc(jlog.QUEUE_NAME.values)
    _nodes_enc, _nodes_key   = onehot_enc(jlog.NODES_REQUESTED.values.astype(np.uint))

    _qsub_h_key = ['qh%02d' % h for h in _qsub_h_key]
    _usr_key    = ['u%d' % h for h in _usr_key]
    _proj_key   = ['p%d' % h for h in _proj_key]
    _exe_key    = ['e%s' % h for h in _exe_key]
    _nodes_key  = ['nd%05d' % n for n in _nodes_key]
    if with_exe:
        pd_ret      = pd.DataFrame(np.column_stack([_qsub_h_enc, _usr_enc, _proj_enc, _exe_enc, _queue_enc, _nodes_enc]), \
                                   columns=_qsub_h_key + _usr_key + _proj_key + _exe_key + _queue_key + _nodes_key)
    else:
        pd_ret      = pd.DataFrame(np.column_stack([_qsub_h_enc, _usr_enc, _proj_enc, _queue_enc, _nodes_enc]), \
                                   columns=_qsub_h_key + _usr_key + _proj_key + _queue_key + _nodes_key)
    pd_ret['WALLTIME_SECONDS'] = jlog.WALLTIME_SECONDS.values
    pd_ret['RUNTIME_SECONDS']  = jlog.RUNTIME_SECONDS.values
    return pd_ret.astype(np.float32)

pdf_Xy = extract_feature(sel_jlog, with_exe=True)
print(pdf_Xy.shape)
pdf_Xy.head()

(77, 36)


Unnamed: 0,qh00,qh02,qh10,qh14,qh16,qh18,qh20,qh22,u2707863181227,u21691304129404,...,e29672964536432,e87380555825019,e94492462414546,e98084828768896,e99206348647925,prod-long,prod-short,nd00512,WALLTIME_SECONDS,RUNTIME_SECONDS
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,21600.0,19502.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,14400.0,6849.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,3600.0,954.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3600.0,208.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3600.0,225.0


In [44]:
train_x, test_x, train_y, test_y = train_test_split(pdf_Xy.values[:,:-1], pdf_Xy['RUNTIME_SECONDS'].values, \
                                                    test_size=0.2, random_state=2020)

In [45]:
paras = {'n_estimators'    :[100, 200, 300, 400, 500, 600, 1000][-3:],\
         'max_depth'       :[5, 10, 20, 30, 40, 50][-3:],} 

xgb_mdl = xgb.XGBRegressor()
grid = GridSearchCV(xgb_mdl, paras, n_jobs=16, cv=3)

grid.fit(train_x, train_y)

GridSearchCV(cv=3, error_score=nan,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estima...
                                    objective='reg:squarederror',
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
         

In [46]:
pred    = grid.best_estimator_.predict(test_x)
error   = pred - test_y
abs_err = np.abs(error)
rel_err = 100. * abs_err / test_y
print(np.percentile(rel_err, 25), np.percentile(rel_err, 50), np.percentile(rel_err, 75), np.percentile(rel_err, 95))
print(np.percentile(abs_err, 25), np.percentile(abs_err, 50), np.percentile(abs_err, 75), np.percentile(abs_err, 95))

3.035722553730011 4.847411155700684 30.18291139602661 634.3084945678711
6.85693359375 11.0 215.84185028076172 8029.5


In [75]:
((pred) - test_x[:,-1]).tolist()

[-3377.3681640625,
 -3350.001220703125,
 -3377.3681640625,
 -3375.45458984375,
 27.371337890625,
 -3377.3681640625,
 -3375.45458984375,
 -3375.45458984375,
 -16315.0,
 -19684.0,
 -3377.3681640625,
 -3375.45458984375,
 -3377.3681640625,
 -3377.3681640625,
 -3376.705810546875,
 -16315.0]

In [35]:
top_feat_idx = np.argsort(grid.best_estimator_.feature_importances_)[-15:]
pdf_Xy.columns[top_feat_idx]

Index(['e29672964536432', 'qh08', 'qh22', 'qh00', 'qh20', 'nd01024',
       'u42354498776772', 'u40298432363336', 'qh04', 'qh16', 'u11144888639051',
       'qh14', 'u21691304129404', 'prod-capability', 'WALLTIME_SECONDS'],
      dtype='object')

In [36]:
grid.best_estimator_

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=30,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=500, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

## NN based model

In [304]:
class dense(torch.nn.Module):
    def __init__(self, in_sz):
        super().__init__()
        self.dense_ops = []
        dense_out_chs = (2048, 1024, 512, 256)
        dense_in_chs  = (in_sz, ) + dense_out_chs[:-1]
        for ic, oc in zip(dense_in_chs, dense_out_chs):
            self.dense_ops += [
                            torch.nn.Linear(ic, oc),
                            torch.nn.LeakyReLU(negative_slope=0.01),                
            ]
        self.dense_ops += [torch.nn.Linear(dense_out_chs[-1], 1), ]
                
        self.dense_layers = torch.nn.Sequential(*self.dense_ops)
        
    def forward(self, x):
        _out = x
        for layer in self.dense_layers:
            _out = layer(_out)
        return _out


dense_mdl = dense(in_sz=pdf_Xy.shape[1]-1)
# summary(dense_mdl, input_size=(pdf_Xy.shape[1]-1, ), device=torch.device('cpu'), )
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(dense_mdl.parameters(), lr=3e-4)
    
mb_sz = 32
for epoch in range(10001):
    time_it_st = time.time()
    mb_idx = np.random.randint(0, train_x.shape[0], mb_sz)
    X_mb   = torch.from_numpy(train_x[mb_idx, :])
    y_mb   = torch.from_numpy(np.expand_dims(train_y[mb_idx], -1))
    
    optimizer.zero_grad()
    pred = dense_mdl.forward(X_mb)
    loss = criterion(pred, y_mb)
    loss.backward()
    optimizer.step() 
    
    if epoch % 1000 == 0:
        itr_prints = '[Info] @ %.1f Epoch: %05d, gloss: %.2f, elapse: %.2fs/itr' % (\
                     time.time(), epoch, loss.cpu().detach().numpy(), (time.time() - time_it_st), )
        print(itr_prints)

[Info] @ 1582745045.4 Epoch: 00000, gloss: 52490992.00, elapse: 0.03s/itr
[Info] @ 1582745067.1 Epoch: 01000, gloss: 11567170.00, elapse: 0.02s/itr
[Info] @ 1582745090.3 Epoch: 02000, gloss: 4958344.00, elapse: 0.03s/itr
[Info] @ 1582745117.8 Epoch: 03000, gloss: 15598517.00, elapse: 0.03s/itr
[Info] @ 1582745145.3 Epoch: 04000, gloss: 14756550.00, elapse: 0.03s/itr
[Info] @ 1582745170.7 Epoch: 05000, gloss: 13463319.00, elapse: 0.02s/itr
[Info] @ 1582745196.7 Epoch: 06000, gloss: 8976085.00, elapse: 0.03s/itr
[Info] @ 1582745222.9 Epoch: 07000, gloss: 7664085.50, elapse: 0.03s/itr
[Info] @ 1582745249.1 Epoch: 08000, gloss: 11935786.00, elapse: 0.02s/itr
[Info] @ 1582745275.6 Epoch: 09000, gloss: 4264433.00, elapse: 0.03s/itr
[Info] @ 1582745302.7 Epoch: 10000, gloss: 9340953.00, elapse: 0.03s/itr


In [305]:
with torch.no_grad():
    pred= dense_mdl.forward(torch.from_numpy(test_x))
abs_err = np.abs(pred - test_y)
rel_err = 100. * abs_err / test_y
np.percentile(rel_err, 25), np.percentile(rel_err, 50), np.percentile(rel_err, 75)

(26.816526412963867, 54.243309020996094, 89.8796615600586)

## t-SNE embedding study

In [22]:
# tsne_obj = TSNE(n_components=2, random_state=2019)
# X_embedded = tsne_obj.fit_transform(preprocessing.scale(pdf_Xy.values[:, :-1]))
# X_embedded.shape, tsne_obj.kl_divergence_, tsne_obj.n_iter_

pdf_Xy_no_exe = extract_feature(sel_jlog, with_exe=False)
tsne_obj = TSNE(n_components=2, random_state=2019)
X_embedded = tsne_obj.fit_transform(preprocessing.scale(pdf_Xy_no_exe.values[:, :-1]))
X_embedded.shape, tsne_obj.kl_divergence_, tsne_obj.n_iter_



((43, 2), 0.3916412591934204, 999)

In [23]:
def t_sne_vis_by_group(x_emb, exe_idn, topn=3, ofn=None):
    idn_unique, idn_count = np.unique(exe_idn, return_counts=True)
    top_idn = idn_unique[np.argsort(idn_count)[-topn:]]
    plt.figure(figsize=(20, 20))
    colors = ('g', 'b', 'gold', 'yellow', 'tan', 'cyan', 'magenta', 'black', 'orange', 'darkgreen')
    markers= ('x', 'o', '>', '<', 's', 'v', 'H', 'D', '3', '1', '2')
    _other_grp = np.zeros(exe_idn.shape[0], dtype=np.bool)
    for _idx, _idn in enumerate(top_idn):
        _emb_grp = x_emb[exe_idn == _idn]
        plt.plot(_emb_grp[:, 0], _emb_grp[:, 1], markers[_idx % len(markers)], alpha=.8, color=colors[_idx % len(colors)], markersize=6, label = _idn[:])
        _other_grp |= (exe_idn == _idn)
#         print("%s is marked by %s and %s" % (_idn, colors[_idx], markers[_idx]))
    _uncat = x_emb[~_other_grp]
    plt.plot(_uncat[:, 0], _uncat[:, 1], markers[-1], alpha=.8, color='r', markersize=6, label = 'Others')
    
    plt.xlim(left=X_embedded[:, 0].min()*1.05, right=X_embedded[:, 0].max()*1.05)
    plt.ylim(bottom=X_embedded[:, 1].min()*1.05, top=X_embedded[:, 1].max()*1.05)
    plt.xticks([])
    plt.yticks([])
#     plt.legend(bbox_to_anchor=(0., 1.0, 1., .102), ncol=4, loc=3, fancybox=False, framealpha=0.5, fontsize=14)
    if ofn is not None:
        plt.savefig('figure/' + ofn, bbox_inches='tight', quality=99, dpi=300)
    plt.show()
    plt.close()

t_sne_vis_by_group(X_embedded, sel_jlog.EXECUTABLE_GENID, topn=20, ofn='tsne-exename.png')

IndexError: invalid index to scalar variable.

<Figure size 1440x1440 with 0 Axes>