In [2]:
import pandas as pd
import numpy as np
import os
import pickle as pkl
import h5py
import sys
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy.stats import kstest,ks_2samp,pearsonr
import scipy
import glob

  from ._conv import register_converters as _register_converters


In [3]:
# Root directory of the trace data; later cells append sub-paths with plain
# string concatenation, so the trailing slash is required.
data_dir = "./data/clusterdata-2011-2/"

# Column names assigned to the raw, headerless task_events shards.
task_events_header = [
    'timestamp',
    'missing_info',
    'job_id',
    'task_index',
    'machine_id',
    'event_type',
    'user_name',
    'sched_class',
    'priority',
    'cpu_req',
    'ram_req',
    'space_req',
    'diff_machine',
]

# Presumably the column names for the machine_events files -- not used by any
# cell visible here; confirm against the loader that consumes it.
machine_events_header = [
    'timestamp',
    'machine_id',
    'event_type',
    'platform_id',
    'cpu_cap',
    'mem_cap',
]

# NOTE(review): the filtered shards written by the filtering cell keep the
# original names 'space_req' / 'diff_machine' and contain no 'time_taken'
# column, so this list does not match those files as written. Confirm how
# (and whether) it is applied downstream before relying on it.
task_events_header_filtered = [
    'timestamp',
    'event_type',
    'priority',
    'cpu_req',
    'ram_req',
    'disk_space_req',
    'different_machine',
    'task_key',
    'time_taken',
]

# Presumably the column names for the task attribute/constraint files -- not
# used by any cell visible here.
task_attr_header = [
    'timestamp',
    'job_id',
    'task_index',
    'attr_name',
    'attr_val',
    'comparison',
]

In [8]:
# Reduce each raw task_events shard to the rows and columns needed downstream
# and write the result next to the input as "<shard>.filtered.csv.gz".
max_files = 200  # upper bound on the number of raw shards processed per run
raw_suffix = ".csv.gz"
filtered_suffix = ".filtered.csv.gz"

count = 0
final_df = pd.DataFrame()  # not used in this cell; kept so the cell still resets it

# "*.csv.gz" also matches the "*.filtered.csv.gz" files this cell writes, so a
# re-run would try to filter its own output and fail when the 13 raw column
# names are assigned. Skip those files, and sort so that "the first max_files
# shards" is the same set on every platform.
raw_paths = sorted(
    path for path in glob.glob(data_dir + "task_events/*" + raw_suffix)
    if not path.endswith(filtered_suffix)
)

for name in tqdm(raw_paths[:max_files]):
    df = pd.read_csv(name, header=None)
    df.columns = task_events_header

    # One key per task: "<job_id>-<task_index>".
    df['task_key'] = df['job_id'].astype(str) + '-' + df['task_index'].astype(str)

    # Keep rows whose missing_info is null and whose event_type is 0 or 1.
    keep = df['missing_info'].isnull() & df['event_type'].isin([0, 1])

    # Non-inplace drop: builds a new frame instead of mutating a filtered slice.
    df = df.loc[keep].drop(
        [
            "user_name", "machine_id", "job_id", "task_index",
            "sched_class", "missing_info"
        ],
        axis=1)

    # Swap only the trailing suffix; str.replace(".csv", ...) would also
    # rewrite any other ".csv" occurring earlier in the path.
    out_name = name[:-len(raw_suffix)] + filtered_suffix
    df.to_csv(out_name, index=False, compression='gzip')

    count += 1

print (count, "files processed")


  0%|                                                                                          | 0/500 [00:00<?, ?it/s]
 40%|███████████████████████████████▊                                                | 199/500 [22:48<32:35,  6.50s/it]

200 files processed


In [12]:
# Concatenate the filtered task_events shards into one DataFrame and persist
# it to an HDF5 store under the key 'df'.
max_files = 200  # upper bound on the number of filtered shards combined
count = 0

# TODO: a better idea than iterating: group by [task_key, event_type] and take
# the max timestamp, then separate the event==0 and event==4 rows, then join
# them and calculate the time taken.
# NOTE(review): the filtering cell keeps event_type 0 and 1 only, so no
# event==4 rows are present in the filtered shards -- confirm which pair of
# events is intended.

# Sorted so that "the first max_files shards" is deterministic.
filtered_paths = sorted(glob.glob(data_dir + "task_events/*.filtered.csv.gz"))

# Collect the per-file frames and concatenate once. Growing final_df with
# DataFrame.append inside the loop re-copies everything accumulated so far on
# every iteration, and DataFrame.append no longer exists in pandas >= 2.0.
frames = []
for name in tqdm(filtered_paths[:max_files]):
    frames.append(pd.read_csv(name))
    count += 1

# pd.concat raises on an empty list; fall back to an empty frame, which is
# what the original loop produced when no file matched.
final_df = pd.concat(frames) if frames else pd.DataFrame()

print (count, "files appended")

storename = data_dir+"task_events/"+"appended.h5"
print ("Writing the data to ", storename)

# Context manager closes the store even if the write raises.
with pd.HDFStore(storename) as store:
    store['df'] = final_df

print ("Task Completed")


  0%|                                                                                          | 0/200 [00:00<?, ?it/s]

./data/clusterdata-2011-2/task_events\part-00000-of-00500.filtered.csv.gz



  0%|▍                                                                                 | 1/200 [00:00<02:34,  1.29it/s]

./data/clusterdata-2011-2/task_events\part-00001-of-00500.filtered.csv.gz



  1%|▊                                                                                 | 2/200 [00:00<01:57,  1.69it/s]

./data/clusterdata-2011-2/task_events\part-00002-of-00500.filtered.csv.gz



  2%|█▏                                                                                | 3/200 [00:01<01:40,  1.96it/s]

./data/clusterdata-2011-2/task_events\part-00003-of-00500.filtered.csv.gz



  2%|█▋                                                                                | 4/200 [00:01<01:25,  2.29it/s]

./data/clusterdata-2011-2/task_events\part-00004-of-00500.filtered.csv.gz



  2%|██                                                                                | 5/200 [00:01<01:15,  2.57it/s]

./data/clusterdata-2011-2/task_events\part-00005-of-00500.filtered.csv.gz



  3%|██▍                                                                               | 6/200 [00:02<01:18,  2.48it/s]

./data/clusterdata-2011-2/task_events\part-00006-of-00500.filtered.csv.gz



  4%|██▊                                                                               | 7/200 [00:02<01:19,  2.41it/s]

./data/clusterdata-2011-2/task_events\part-00007-of-00500.filtered.csv.gz



  4%|███▎                                                                              | 8/200 [00:03<01:25,  2.26it/s]

./data/clusterdata-2011-2/task_events\part-00008-of-00500.filtered.csv.gz


  4%|███▋                                                                              | 9/200 [00:03<01:23,  2.28it/s]

./data/clusterdata-2011-2/task_events\part-00009-of-00500.filtered.csv.gz


  5%|████                                                                             | 10/200 [00:04<01:34,  2.01it/s]

./data/clusterdata-2011-2/task_events\part-00010-of-00500.filtered.csv.gz


  6%|████▍                                                                            | 11/200 [00:04<01:39,  1.91it/s]

./data/clusterdata-2011-2/task_events\part-00011-of-00500.filtered.csv.gz


  6%|████▊                                                                            | 12/200 [00:05<01:33,  2.01it/s]

./data/clusterdata-2011-2/task_events\part-00012-of-00500.filtered.csv.gz


  6%|█████▎                                                                           | 13/200 [00:05<01:32,  2.01it/s]

./data/clusterdata-2011-2/task_events\part-00013-of-00500.filtered.csv.gz


  7%|█████▋                                                                           | 14/200 [00:06<01:34,  1.97it/s]

./data/clusterdata-2011-2/task_events\part-00014-of-00500.filtered.csv.gz


  8%|██████                                                                           | 15/200 [00:07<01:54,  1.62it/s]

./data/clusterdata-2011-2/task_events\part-00015-of-00500.filtered.csv.gz


  8%|██████▍                                                                          | 16/200 [00:07<01:57,  1.56it/s]

./data/clusterdata-2011-2/task_events\part-00016-of-00500.filtered.csv.gz


  8%|██████▉                                                                          | 17/200 [00:08<02:01,  1.51it/s]

./data/clusterdata-2011-2/task_events\part-00017-of-00500.filtered.csv.gz


  9%|███████▎                                                                         | 18/200 [00:09<02:05,  1.45it/s]

./data/clusterdata-2011-2/task_events\part-00018-of-00500.filtered.csv.gz


 10%|███████▋                                                                         | 19/200 [00:11<03:04,  1.02s/it]

./data/clusterdata-2011-2/task_events\part-00019-of-00500.filtered.csv.gz


 10%|████████                                                                         | 20/200 [00:14<05:29,  1.83s/it]

./data/clusterdata-2011-2/task_events\part-00020-of-00500.filtered.csv.gz


 10%|████████▌                                                                        | 21/200 [00:19<07:46,  2.61s/it]

./data/clusterdata-2011-2/task_events\part-00021-of-00500.filtered.csv.gz


 11%|████████▉                                                                        | 22/200 [00:24<10:13,  3.45s/it]

./data/clusterdata-2011-2/task_events\part-00022-of-00500.filtered.csv.gz


 12%|█████████▎                                                                       | 23/200 [00:28<10:14,  3.47s/it]

./data/clusterdata-2011-2/task_events\part-00023-of-00500.filtered.csv.gz


 12%|█████████▋                                                                       | 24/200 [00:30<08:52,  3.02s/it]

./data/clusterdata-2011-2/task_events\part-00024-of-00500.filtered.csv.gz


 12%|██████████▏                                                                      | 25/200 [00:32<07:48,  2.68s/it]

./data/clusterdata-2011-2/task_events\part-00025-of-00500.filtered.csv.gz


 13%|██████████▌                                                                      | 26/200 [00:34<07:10,  2.47s/it]

./data/clusterdata-2011-2/task_events\part-00026-of-00500.filtered.csv.gz


 14%|██████████▉                                                                      | 27/200 [00:36<06:50,  2.37s/it]

./data/clusterdata-2011-2/task_events\part-00027-of-00500.filtered.csv.gz


 14%|███████████▎                                                                     | 28/200 [00:37<06:18,  2.20s/it]

./data/clusterdata-2011-2/task_events\part-00028-of-00500.filtered.csv.gz


 14%|███████████▋                                                                     | 29/200 [00:39<05:54,  2.07s/it]

./data/clusterdata-2011-2/task_events\part-00029-of-00500.filtered.csv.gz


 15%|████████████▏                                                                    | 30/200 [00:41<05:41,  2.01s/it]

./data/clusterdata-2011-2/task_events\part-00030-of-00500.filtered.csv.gz


 16%|████████████▌                                                                    | 31/200 [00:43<05:40,  2.01s/it]

./data/clusterdata-2011-2/task_events\part-00031-of-00500.filtered.csv.gz


 16%|████████████▉                                                                    | 32/200 [00:45<05:45,  2.06s/it]

./data/clusterdata-2011-2/task_events\part-00032-of-00500.filtered.csv.gz


 16%|█████████████▎                                                                   | 33/200 [00:47<05:40,  2.04s/it]

./data/clusterdata-2011-2/task_events\part-00033-of-00500.filtered.csv.gz


 17%|█████████████▊                                                                   | 34/200 [00:49<05:40,  2.05s/it]

./data/clusterdata-2011-2/task_events\part-00034-of-00500.filtered.csv.gz


 18%|██████████████▏                                                                  | 35/200 [00:52<06:02,  2.20s/it]

./data/clusterdata-2011-2/task_events\part-00035-of-00500.filtered.csv.gz


 18%|██████████████▌                                                                  | 36/200 [00:54<06:12,  2.27s/it]

./data/clusterdata-2011-2/task_events\part-00036-of-00500.filtered.csv.gz


 18%|██████████████▉                                                                  | 37/200 [00:57<06:11,  2.28s/it]

./data/clusterdata-2011-2/task_events\part-00037-of-00500.filtered.csv.gz


 19%|███████████████▍                                                                 | 38/200 [00:59<06:14,  2.31s/it]

./data/clusterdata-2011-2/task_events\part-00038-of-00500.filtered.csv.gz


 20%|███████████████▊                                                                 | 39/200 [01:01<06:17,  2.35s/it]

./data/clusterdata-2011-2/task_events\part-00039-of-00500.filtered.csv.gz


 20%|████████████████▏                                                                | 40/200 [01:04<06:32,  2.45s/it]

./data/clusterdata-2011-2/task_events\part-00040-of-00500.filtered.csv.gz


 20%|████████████████▌                                                                | 41/200 [01:07<06:36,  2.49s/it]

./data/clusterdata-2011-2/task_events\part-00041-of-00500.filtered.csv.gz


 21%|█████████████████                                                                | 42/200 [01:09<06:24,  2.44s/it]

./data/clusterdata-2011-2/task_events\part-00042-of-00500.filtered.csv.gz


 22%|█████████████████▍                                                               | 43/200 [01:11<06:11,  2.37s/it]

./data/clusterdata-2011-2/task_events\part-00043-of-00500.filtered.csv.gz


 22%|█████████████████▊                                                               | 44/200 [01:14<06:13,  2.40s/it]

./data/clusterdata-2011-2/task_events\part-00044-of-00500.filtered.csv.gz


 22%|██████████████████▏                                                              | 45/200 [01:16<06:02,  2.34s/it]

./data/clusterdata-2011-2/task_events\part-00045-of-00500.filtered.csv.gz


 23%|██████████████████▋                                                              | 46/200 [01:18<05:56,  2.31s/it]

./data/clusterdata-2011-2/task_events\part-00046-of-00500.filtered.csv.gz


 24%|███████████████████                                                              | 47/200 [01:20<05:46,  2.27s/it]

./data/clusterdata-2011-2/task_events\part-00047-of-00500.filtered.csv.gz


 24%|███████████████████▍                                                             | 48/200 [01:23<05:43,  2.26s/it]

./data/clusterdata-2011-2/task_events\part-00048-of-00500.filtered.csv.gz


 24%|███████████████████▊                                                             | 49/200 [01:25<05:41,  2.26s/it]

./data/clusterdata-2011-2/task_events\part-00049-of-00500.filtered.csv.gz


 25%|████████████████████▎                                                            | 50/200 [01:27<05:45,  2.30s/it]

./data/clusterdata-2011-2/task_events\part-00050-of-00500.filtered.csv.gz


 26%|████████████████████▋                                                            | 51/200 [01:30<06:06,  2.46s/it]

./data/clusterdata-2011-2/task_events\part-00051-of-00500.filtered.csv.gz


 26%|█████████████████████                                                            | 52/200 [01:33<06:28,  2.63s/it]

./data/clusterdata-2011-2/task_events\part-00052-of-00500.filtered.csv.gz


 26%|█████████████████████▍                                                           | 53/200 [01:36<06:28,  2.64s/it]

./data/clusterdata-2011-2/task_events\part-00053-of-00500.filtered.csv.gz


 27%|█████████████████████▊                                                           | 54/200 [01:38<06:18,  2.59s/it]

./data/clusterdata-2011-2/task_events\part-00054-of-00500.filtered.csv.gz


 28%|██████████████████████▎                                                          | 55/200 [01:41<06:13,  2.58s/it]

./data/clusterdata-2011-2/task_events\part-00055-of-00500.filtered.csv.gz


 28%|██████████████████████▋                                                          | 56/200 [01:43<06:12,  2.59s/it]

./data/clusterdata-2011-2/task_events\part-00056-of-00500.filtered.csv.gz


 28%|███████████████████████                                                          | 57/200 [01:46<06:11,  2.60s/it]

./data/clusterdata-2011-2/task_events\part-00057-of-00500.filtered.csv.gz


 29%|███████████████████████▍                                                         | 58/200 [01:49<06:05,  2.58s/it]

./data/clusterdata-2011-2/task_events\part-00058-of-00500.filtered.csv.gz


 30%|███████████████████████▉                                                         | 59/200 [01:51<06:00,  2.56s/it]

./data/clusterdata-2011-2/task_events\part-00059-of-00500.filtered.csv.gz


 30%|████████████████████████▎                                                        | 60/200 [01:54<06:01,  2.58s/it]

./data/clusterdata-2011-2/task_events\part-00060-of-00500.filtered.csv.gz


 30%|████████████████████████▋                                                        | 61/200 [01:56<05:58,  2.58s/it]

./data/clusterdata-2011-2/task_events\part-00061-of-00500.filtered.csv.gz


 31%|█████████████████████████                                                        | 62/200 [01:59<06:05,  2.65s/it]

./data/clusterdata-2011-2/task_events\part-00062-of-00500.filtered.csv.gz


 32%|█████████████████████████▌                                                       | 63/200 [02:02<06:01,  2.64s/it]

./data/clusterdata-2011-2/task_events\part-00063-of-00500.filtered.csv.gz


 32%|█████████████████████████▉                                                       | 64/200 [02:04<05:57,  2.63s/it]

./data/clusterdata-2011-2/task_events\part-00064-of-00500.filtered.csv.gz


 32%|██████████████████████████▎                                                      | 65/200 [02:07<06:03,  2.69s/it]

./data/clusterdata-2011-2/task_events\part-00065-of-00500.filtered.csv.gz


 33%|██████████████████████████▋                                                      | 66/200 [02:10<05:58,  2.68s/it]

./data/clusterdata-2011-2/task_events\part-00066-of-00500.filtered.csv.gz


 34%|███████████████████████████▏                                                     | 67/200 [02:12<05:55,  2.67s/it]

./data/clusterdata-2011-2/task_events\part-00067-of-00500.filtered.csv.gz


 34%|███████████████████████████▌                                                     | 68/200 [02:15<05:49,  2.65s/it]

./data/clusterdata-2011-2/task_events\part-00068-of-00500.filtered.csv.gz


 34%|███████████████████████████▉                                                     | 69/200 [02:18<05:48,  2.66s/it]

./data/clusterdata-2011-2/task_events\part-00069-of-00500.filtered.csv.gz


 35%|████████████████████████████▎                                                    | 70/200 [02:20<05:45,  2.65s/it]

./data/clusterdata-2011-2/task_events\part-00070-of-00500.filtered.csv.gz


 36%|████████████████████████████▊                                                    | 71/200 [02:23<05:46,  2.69s/it]

./data/clusterdata-2011-2/task_events\part-00071-of-00500.filtered.csv.gz


 36%|█████████████████████████████▏                                                   | 72/200 [02:26<05:50,  2.74s/it]

./data/clusterdata-2011-2/task_events\part-00072-of-00500.filtered.csv.gz


 36%|█████████████████████████████▌                                                   | 73/200 [02:29<05:52,  2.78s/it]

./data/clusterdata-2011-2/task_events\part-00073-of-00500.filtered.csv.gz


 37%|█████████████████████████████▉                                                   | 74/200 [02:32<05:51,  2.79s/it]

./data/clusterdata-2011-2/task_events\part-00074-of-00500.filtered.csv.gz


 38%|██████████████████████████████▍                                                  | 75/200 [02:34<05:47,  2.78s/it]

./data/clusterdata-2011-2/task_events\part-00075-of-00500.filtered.csv.gz


 38%|██████████████████████████████▊                                                  | 76/200 [02:37<05:46,  2.79s/it]

./data/clusterdata-2011-2/task_events\part-00076-of-00500.filtered.csv.gz


 38%|███████████████████████████████▏                                                 | 77/200 [02:40<05:46,  2.82s/it]

./data/clusterdata-2011-2/task_events\part-00077-of-00500.filtered.csv.gz


 39%|███████████████████████████████▌                                                 | 78/200 [02:43<05:45,  2.84s/it]

./data/clusterdata-2011-2/task_events\part-00078-of-00500.filtered.csv.gz


 40%|███████████████████████████████▉                                                 | 79/200 [02:46<05:46,  2.86s/it]

./data/clusterdata-2011-2/task_events\part-00079-of-00500.filtered.csv.gz


 40%|████████████████████████████████▍                                                | 80/200 [02:49<05:47,  2.90s/it]

./data/clusterdata-2011-2/task_events\part-00080-of-00500.filtered.csv.gz


 40%|████████████████████████████████▊                                                | 81/200 [02:52<05:43,  2.88s/it]

./data/clusterdata-2011-2/task_events\part-00081-of-00500.filtered.csv.gz


 41%|█████████████████████████████████▏                                               | 82/200 [02:55<05:43,  2.91s/it]

./data/clusterdata-2011-2/task_events\part-00082-of-00500.filtered.csv.gz


 42%|█████████████████████████████████▌                                               | 83/200 [02:58<05:43,  2.94s/it]

./data/clusterdata-2011-2/task_events\part-00083-of-00500.filtered.csv.gz


 42%|██████████████████████████████████                                               | 84/200 [03:01<05:40,  2.93s/it]

./data/clusterdata-2011-2/task_events\part-00084-of-00500.filtered.csv.gz


 42%|██████████████████████████████████▍                                              | 85/200 [03:04<05:39,  2.95s/it]

./data/clusterdata-2011-2/task_events\part-00085-of-00500.filtered.csv.gz


 43%|██████████████████████████████████▊                                              | 86/200 [03:07<05:37,  2.96s/it]

./data/clusterdata-2011-2/task_events\part-00086-of-00500.filtered.csv.gz


 44%|███████████████████████████████████▏                                             | 87/200 [03:10<05:42,  3.03s/it]

./data/clusterdata-2011-2/task_events\part-00087-of-00500.filtered.csv.gz


 44%|███████████████████████████████████▋                                             | 88/200 [03:13<05:50,  3.13s/it]

./data/clusterdata-2011-2/task_events\part-00088-of-00500.filtered.csv.gz


 44%|████████████████████████████████████                                             | 89/200 [03:16<05:48,  3.14s/it]

./data/clusterdata-2011-2/task_events\part-00089-of-00500.filtered.csv.gz


 45%|████████████████████████████████████▍                                            | 90/200 [03:19<05:43,  3.13s/it]

./data/clusterdata-2011-2/task_events\part-00090-of-00500.filtered.csv.gz


 46%|████████████████████████████████████▊                                            | 91/200 [03:23<05:38,  3.11s/it]

./data/clusterdata-2011-2/task_events\part-00091-of-00500.filtered.csv.gz


 46%|█████████████████████████████████████▎                                           | 92/200 [03:26<05:41,  3.16s/it]

./data/clusterdata-2011-2/task_events\part-00092-of-00500.filtered.csv.gz


 46%|█████████████████████████████████████▋                                           | 93/200 [03:29<05:42,  3.20s/it]

./data/clusterdata-2011-2/task_events\part-00093-of-00500.filtered.csv.gz


 47%|██████████████████████████████████████                                           | 94/200 [03:32<05:43,  3.24s/it]

./data/clusterdata-2011-2/task_events\part-00094-of-00500.filtered.csv.gz


 48%|██████████████████████████████████████▍                                          | 95/200 [03:36<05:39,  3.23s/it]

./data/clusterdata-2011-2/task_events\part-00095-of-00500.filtered.csv.gz


 48%|██████████████████████████████████████▉                                          | 96/200 [03:39<05:34,  3.21s/it]

./data/clusterdata-2011-2/task_events\part-00096-of-00500.filtered.csv.gz


 48%|███████████████████████████████████████▎                                         | 97/200 [03:42<05:27,  3.18s/it]

./data/clusterdata-2011-2/task_events\part-00097-of-00500.filtered.csv.gz


 49%|███████████████████████████████████████▋                                         | 98/200 [03:45<05:27,  3.22s/it]

./data/clusterdata-2011-2/task_events\part-00098-of-00500.filtered.csv.gz


 50%|████████████████████████████████████████                                         | 99/200 [03:48<05:22,  3.19s/it]

./data/clusterdata-2011-2/task_events\part-00099-of-00500.filtered.csv.gz


 50%|████████████████████████████████████████                                        | 100/200 [03:51<05:17,  3.17s/it]

./data/clusterdata-2011-2/task_events\part-00100-of-00500.filtered.csv.gz


 50%|████████████████████████████████████████▍                                       | 101/200 [03:55<05:18,  3.22s/it]

./data/clusterdata-2011-2/task_events\part-00101-of-00500.filtered.csv.gz


 51%|████████████████████████████████████████▊                                       | 102/200 [03:58<05:16,  3.23s/it]

./data/clusterdata-2011-2/task_events\part-00102-of-00500.filtered.csv.gz


 52%|█████████████████████████████████████████▏                                      | 103/200 [04:01<05:14,  3.24s/it]

./data/clusterdata-2011-2/task_events\part-00103-of-00500.filtered.csv.gz


 52%|█████████████████████████████████████████▌                                      | 104/200 [04:05<05:22,  3.36s/it]

./data/clusterdata-2011-2/task_events\part-00104-of-00500.filtered.csv.gz


 52%|██████████████████████████████████████████                                      | 105/200 [04:08<05:15,  3.32s/it]

./data/clusterdata-2011-2/task_events\part-00105-of-00500.filtered.csv.gz


 53%|██████████████████████████████████████████▍                                     | 106/200 [04:11<05:07,  3.27s/it]

./data/clusterdata-2011-2/task_events\part-00106-of-00500.filtered.csv.gz


 54%|██████████████████████████████████████████▊                                     | 107/200 [04:15<05:07,  3.30s/it]

./data/clusterdata-2011-2/task_events\part-00107-of-00500.filtered.csv.gz


 54%|███████████████████████████████████████████▏                                    | 108/200 [04:18<05:08,  3.35s/it]

./data/clusterdata-2011-2/task_events\part-00108-of-00500.filtered.csv.gz


 55%|███████████████████████████████████████████▌                                    | 109/200 [04:22<05:03,  3.34s/it]

./data/clusterdata-2011-2/task_events\part-00109-of-00500.filtered.csv.gz


 55%|████████████████████████████████████████████                                    | 110/200 [04:25<05:01,  3.35s/it]

./data/clusterdata-2011-2/task_events\part-00110-of-00500.filtered.csv.gz


 56%|████████████████████████████████████████████▍                                   | 111/200 [04:28<04:54,  3.31s/it]

./data/clusterdata-2011-2/task_events\part-00111-of-00500.filtered.csv.gz


 56%|████████████████████████████████████████████▊                                   | 112/200 [04:31<04:50,  3.30s/it]

./data/clusterdata-2011-2/task_events\part-00112-of-00500.filtered.csv.gz


 56%|█████████████████████████████████████████████▏                                  | 113/200 [04:35<04:49,  3.33s/it]

./data/clusterdata-2011-2/task_events\part-00113-of-00500.filtered.csv.gz


 57%|█████████████████████████████████████████████▌                                  | 114/200 [04:38<04:46,  3.33s/it]

./data/clusterdata-2011-2/task_events\part-00114-of-00500.filtered.csv.gz


 57%|██████████████████████████████████████████████                                  | 115/200 [04:41<04:42,  3.33s/it]

./data/clusterdata-2011-2/task_events\part-00115-of-00500.filtered.csv.gz


 58%|██████████████████████████████████████████████▍                                 | 116/200 [04:45<04:40,  3.34s/it]

./data/clusterdata-2011-2/task_events\part-00116-of-00500.filtered.csv.gz


 58%|██████████████████████████████████████████████▊                                 | 117/200 [04:48<04:36,  3.34s/it]

./data/clusterdata-2011-2/task_events\part-00117-of-00500.filtered.csv.gz


 59%|███████████████████████████████████████████████▏                                | 118/200 [04:52<04:39,  3.40s/it]

./data/clusterdata-2011-2/task_events\part-00118-of-00500.filtered.csv.gz


 60%|███████████████████████████████████████████████▌                                | 119/200 [04:56<04:48,  3.57s/it]

./data/clusterdata-2011-2/task_events\part-00119-of-00500.filtered.csv.gz


 60%|████████████████████████████████████████████████                                | 120/200 [04:59<04:41,  3.52s/it]

./data/clusterdata-2011-2/task_events\part-00120-of-00500.filtered.csv.gz


 60%|████████████████████████████████████████████████▍                               | 121/200 [05:03<04:38,  3.52s/it]

./data/clusterdata-2011-2/task_events\part-00121-of-00500.filtered.csv.gz


 61%|████████████████████████████████████████████████▊                               | 122/200 [05:06<04:33,  3.51s/it]

./data/clusterdata-2011-2/task_events\part-00122-of-00500.filtered.csv.gz


 62%|█████████████████████████████████████████████████▏                              | 123/200 [05:09<04:28,  3.49s/it]

./data/clusterdata-2011-2/task_events\part-00123-of-00500.filtered.csv.gz


 62%|█████████████████████████████████████████████████▌                              | 124/200 [05:13<04:29,  3.55s/it]

./data/clusterdata-2011-2/task_events\part-00124-of-00500.filtered.csv.gz


 62%|██████████████████████████████████████████████████                              | 125/200 [05:17<04:28,  3.57s/it]

./data/clusterdata-2011-2/task_events\part-00125-of-00500.filtered.csv.gz


 63%|██████████████████████████████████████████████████▍                             | 126/200 [05:20<04:22,  3.54s/it]

./data/clusterdata-2011-2/task_events\part-00126-of-00500.filtered.csv.gz


 64%|██████████████████████████████████████████████████▊                             | 127/200 [05:24<04:15,  3.51s/it]

./data/clusterdata-2011-2/task_events\part-00127-of-00500.filtered.csv.gz


 64%|███████████████████████████████████████████████████▏                            | 128/200 [05:27<04:12,  3.51s/it]

./data/clusterdata-2011-2/task_events\part-00128-of-00500.filtered.csv.gz


 64%|███████████████████████████████████████████████████▌                            | 129/200 [05:31<04:09,  3.51s/it]

./data/clusterdata-2011-2/task_events\part-00129-of-00500.filtered.csv.gz


 65%|████████████████████████████████████████████████████                            | 130/200 [05:34<04:05,  3.51s/it]

./data/clusterdata-2011-2/task_events\part-00130-of-00500.filtered.csv.gz


 66%|████████████████████████████████████████████████████▍                           | 131/200 [05:38<04:00,  3.49s/it]

./data/clusterdata-2011-2/task_events\part-00131-of-00500.filtered.csv.gz


 66%|████████████████████████████████████████████████████▊                           | 132/200 [05:41<03:57,  3.49s/it]

./data/clusterdata-2011-2/task_events\part-00132-of-00500.filtered.csv.gz


 66%|█████████████████████████████████████████████████████▏                          | 133/200 [05:45<03:57,  3.54s/it]

./data/clusterdata-2011-2/task_events\part-00133-of-00500.filtered.csv.gz


 67%|█████████████████████████████████████████████████████▌                          | 134/200 [05:48<03:55,  3.57s/it]

./data/clusterdata-2011-2/task_events\part-00134-of-00500.filtered.csv.gz


 68%|██████████████████████████████████████████████████████                          | 135/200 [05:52<03:52,  3.58s/it]

./data/clusterdata-2011-2/task_events\part-00135-of-00500.filtered.csv.gz


 68%|██████████████████████████████████████████████████████▍                         | 136/200 [05:56<03:53,  3.65s/it]

./data/clusterdata-2011-2/task_events\part-00136-of-00500.filtered.csv.gz


 68%|██████████████████████████████████████████████████████▊                         | 137/200 [06:00<03:51,  3.67s/it]

./data/clusterdata-2011-2/task_events\part-00137-of-00500.filtered.csv.gz


 69%|███████████████████████████████████████████████████████▏                        | 138/200 [06:04<03:53,  3.76s/it]

./data/clusterdata-2011-2/task_events\part-00138-of-00500.filtered.csv.gz


 70%|███████████████████████████████████████████████████████▌                        | 139/200 [06:07<03:47,  3.73s/it]

./data/clusterdata-2011-2/task_events\part-00139-of-00500.filtered.csv.gz


 70%|████████████████████████████████████████████████████████                        | 140/200 [06:11<03:48,  3.81s/it]

./data/clusterdata-2011-2/task_events\part-00140-of-00500.filtered.csv.gz


 70%|████████████████████████████████████████████████████████▍                       | 141/200 [06:15<03:45,  3.83s/it]

./data/clusterdata-2011-2/task_events\part-00141-of-00500.filtered.csv.gz


 71%|████████████████████████████████████████████████████████▊                       | 142/200 [06:19<03:42,  3.83s/it]

./data/clusterdata-2011-2/task_events\part-00142-of-00500.filtered.csv.gz


 72%|█████████████████████████████████████████████████████████▏                      | 143/200 [06:23<03:40,  3.87s/it]

./data/clusterdata-2011-2/task_events\part-00143-of-00500.filtered.csv.gz


 72%|█████████████████████████████████████████████████████████▌                      | 144/200 [06:27<03:34,  3.83s/it]

./data/clusterdata-2011-2/task_events\part-00144-of-00500.filtered.csv.gz


 72%|██████████████████████████████████████████████████████████                      | 145/200 [06:31<03:34,  3.90s/it]

./data/clusterdata-2011-2/task_events\part-00145-of-00500.filtered.csv.gz


 73%|██████████████████████████████████████████████████████████▍                     | 146/200 [06:35<03:34,  3.96s/it]

./data/clusterdata-2011-2/task_events\part-00146-of-00500.filtered.csv.gz


 74%|██████████████████████████████████████████████████████████▊                     | 147/200 [06:39<03:30,  3.97s/it]

./data/clusterdata-2011-2/task_events\part-00147-of-00500.filtered.csv.gz


 74%|███████████████████████████████████████████████████████████▏                    | 148/200 [06:43<03:25,  3.95s/it]

./data/clusterdata-2011-2/task_events\part-00148-of-00500.filtered.csv.gz


 74%|███████████████████████████████████████████████████████████▌                    | 149/200 [06:47<03:22,  3.98s/it]

./data/clusterdata-2011-2/task_events\part-00149-of-00500.filtered.csv.gz


 75%|████████████████████████████████████████████████████████████                    | 150/200 [06:51<03:18,  3.96s/it]

./data/clusterdata-2011-2/task_events\part-00150-of-00500.filtered.csv.gz


 76%|████████████████████████████████████████████████████████████▍                   | 151/200 [06:55<03:16,  4.01s/it]

./data/clusterdata-2011-2/task_events\part-00151-of-00500.filtered.csv.gz


 76%|████████████████████████████████████████████████████████████▊                   | 152/200 [06:59<03:14,  4.05s/it]

./data/clusterdata-2011-2/task_events\part-00152-of-00500.filtered.csv.gz


 76%|█████████████████████████████████████████████████████████████▏                  | 153/200 [07:03<03:09,  4.03s/it]

./data/clusterdata-2011-2/task_events\part-00153-of-00500.filtered.csv.gz


 77%|█████████████████████████████████████████████████████████████▌                  | 154/200 [07:07<03:07,  4.07s/it]

./data/clusterdata-2011-2/task_events\part-00154-of-00500.filtered.csv.gz


 78%|██████████████████████████████████████████████████████████████                  | 155/200 [07:12<03:07,  4.17s/it]

./data/clusterdata-2011-2/task_events\part-00155-of-00500.filtered.csv.gz


 78%|██████████████████████████████████████████████████████████████▍                 | 156/200 [07:16<03:04,  4.20s/it]

./data/clusterdata-2011-2/task_events\part-00156-of-00500.filtered.csv.gz


 78%|██████████████████████████████████████████████████████████████▊                 | 157/200 [07:20<03:03,  4.28s/it]

./data/clusterdata-2011-2/task_events\part-00157-of-00500.filtered.csv.gz


 79%|███████████████████████████████████████████████████████████████▏                | 158/200 [07:25<03:01,  4.31s/it]

./data/clusterdata-2011-2/task_events\part-00158-of-00500.filtered.csv.gz


 80%|███████████████████████████████████████████████████████████████▌                | 159/200 [07:29<02:57,  4.33s/it]

./data/clusterdata-2011-2/task_events\part-00159-of-00500.filtered.csv.gz


 80%|████████████████████████████████████████████████████████████████                | 160/200 [07:34<03:01,  4.53s/it]

./data/clusterdata-2011-2/task_events\part-00160-of-00500.filtered.csv.gz


 80%|████████████████████████████████████████████████████████████████▍               | 161/200 [07:41<03:27,  5.31s/it]

./data/clusterdata-2011-2/task_events\part-00161-of-00500.filtered.csv.gz


 81%|████████████████████████████████████████████████████████████████▊               | 162/200 [07:51<04:17,  6.78s/it]

./data/clusterdata-2011-2/task_events\part-00162-of-00500.filtered.csv.gz


 82%|█████████████████████████████████████████████████████████████████▏              | 163/200 [08:00<04:29,  7.29s/it]

./data/clusterdata-2011-2/task_events\part-00163-of-00500.filtered.csv.gz


 82%|█████████████████████████████████████████████████████████████████▌              | 164/200 [08:06<04:13,  7.03s/it]

./data/clusterdata-2011-2/task_events\part-00164-of-00500.filtered.csv.gz


 82%|██████████████████████████████████████████████████████████████████              | 165/200 [08:12<03:53,  6.67s/it]

./data/clusterdata-2011-2/task_events\part-00165-of-00500.filtered.csv.gz


 83%|██████████████████████████████████████████████████████████████████▍             | 166/200 [08:18<03:37,  6.41s/it]

./data/clusterdata-2011-2/task_events\part-00166-of-00500.filtered.csv.gz


 84%|██████████████████████████████████████████████████████████████████▊             | 167/200 [08:24<03:24,  6.19s/it]

./data/clusterdata-2011-2/task_events\part-00167-of-00500.filtered.csv.gz


 84%|███████████████████████████████████████████████████████████████████▏            | 168/200 [08:30<03:21,  6.28s/it]

./data/clusterdata-2011-2/task_events\part-00168-of-00500.filtered.csv.gz


 84%|███████████████████████████████████████████████████████████████████▌            | 169/200 [08:36<03:09,  6.11s/it]

./data/clusterdata-2011-2/task_events\part-00169-of-00500.filtered.csv.gz


 85%|████████████████████████████████████████████████████████████████████            | 170/200 [08:41<02:57,  5.91s/it]

./data/clusterdata-2011-2/task_events\part-00170-of-00500.filtered.csv.gz


 86%|████████████████████████████████████████████████████████████████████▍           | 171/200 [08:47<02:50,  5.88s/it]

./data/clusterdata-2011-2/task_events\part-00171-of-00500.filtered.csv.gz


 86%|████████████████████████████████████████████████████████████████████▊           | 172/200 [08:54<02:51,  6.12s/it]

./data/clusterdata-2011-2/task_events\part-00172-of-00500.filtered.csv.gz


 86%|█████████████████████████████████████████████████████████████████████▏          | 173/200 [09:00<02:46,  6.16s/it]

./data/clusterdata-2011-2/task_events\part-00173-of-00500.filtered.csv.gz


 87%|█████████████████████████████████████████████████████████████████████▌          | 174/200 [09:06<02:41,  6.21s/it]

./data/clusterdata-2011-2/task_events\part-00174-of-00500.filtered.csv.gz


 88%|██████████████████████████████████████████████████████████████████████          | 175/200 [09:13<02:35,  6.22s/it]

./data/clusterdata-2011-2/task_events\part-00175-of-00500.filtered.csv.gz


 88%|██████████████████████████████████████████████████████████████████████▍         | 176/200 [09:19<02:34,  6.44s/it]

./data/clusterdata-2011-2/task_events\part-00176-of-00500.filtered.csv.gz


 88%|██████████████████████████████████████████████████████████████████████▊         | 177/200 [09:26<02:30,  6.53s/it]

./data/clusterdata-2011-2/task_events\part-00177-of-00500.filtered.csv.gz


 89%|███████████████████████████████████████████████████████████████████████▏        | 178/200 [09:33<02:25,  6.60s/it]

./data/clusterdata-2011-2/task_events\part-00178-of-00500.filtered.csv.gz


 90%|███████████████████████████████████████████████████████████████████████▌        | 179/200 [09:40<02:20,  6.68s/it]

./data/clusterdata-2011-2/task_events\part-00179-of-00500.filtered.csv.gz


 90%|████████████████████████████████████████████████████████████████████████        | 180/200 [09:46<02:10,  6.53s/it]

./data/clusterdata-2011-2/task_events\part-00180-of-00500.filtered.csv.gz


 90%|████████████████████████████████████████████████████████████████████████▍       | 181/200 [09:52<02:02,  6.43s/it]

./data/clusterdata-2011-2/task_events\part-00181-of-00500.filtered.csv.gz


 91%|████████████████████████████████████████████████████████████████████████▊       | 182/200 [09:59<01:55,  6.41s/it]

./data/clusterdata-2011-2/task_events\part-00182-of-00500.filtered.csv.gz


 92%|█████████████████████████████████████████████████████████████████████████▏      | 183/200 [10:05<01:48,  6.37s/it]

./data/clusterdata-2011-2/task_events\part-00183-of-00500.filtered.csv.gz


 92%|█████████████████████████████████████████████████████████████████████████▌      | 184/200 [10:11<01:41,  6.37s/it]

./data/clusterdata-2011-2/task_events\part-00184-of-00500.filtered.csv.gz


 92%|██████████████████████████████████████████████████████████████████████████      | 185/200 [10:18<01:36,  6.46s/it]

./data/clusterdata-2011-2/task_events\part-00185-of-00500.filtered.csv.gz


 93%|██████████████████████████████████████████████████████████████████████████▍     | 186/200 [10:25<01:33,  6.69s/it]

./data/clusterdata-2011-2/task_events\part-00186-of-00500.filtered.csv.gz


 94%|██████████████████████████████████████████████████████████████████████████▊     | 187/200 [10:32<01:27,  6.76s/it]

./data/clusterdata-2011-2/task_events\part-00187-of-00500.filtered.csv.gz


 94%|███████████████████████████████████████████████████████████████████████████▏    | 188/200 [10:38<01:19,  6.65s/it]

./data/clusterdata-2011-2/task_events\part-00188-of-00500.filtered.csv.gz


 94%|███████████████████████████████████████████████████████████████████████████▌    | 189/200 [10:45<01:14,  6.76s/it]

./data/clusterdata-2011-2/task_events\part-00189-of-00500.filtered.csv.gz


 95%|████████████████████████████████████████████████████████████████████████████    | 190/200 [10:52<01:07,  6.79s/it]

./data/clusterdata-2011-2/task_events\part-00190-of-00500.filtered.csv.gz


 96%|████████████████████████████████████████████████████████████████████████████▍   | 191/200 [10:59<01:00,  6.77s/it]

./data/clusterdata-2011-2/task_events\part-00191-of-00500.filtered.csv.gz


 96%|████████████████████████████████████████████████████████████████████████████▊   | 192/200 [11:06<00:53,  6.70s/it]

./data/clusterdata-2011-2/task_events\part-00192-of-00500.filtered.csv.gz


 96%|█████████████████████████████████████████████████████████████████████████████▏  | 193/200 [11:12<00:46,  6.60s/it]

./data/clusterdata-2011-2/task_events\part-00193-of-00500.filtered.csv.gz


 97%|█████████████████████████████████████████████████████████████████████████████▌  | 194/200 [11:18<00:39,  6.55s/it]

./data/clusterdata-2011-2/task_events\part-00194-of-00500.filtered.csv.gz


 98%|██████████████████████████████████████████████████████████████████████████████  | 195/200 [11:25<00:32,  6.51s/it]

./data/clusterdata-2011-2/task_events\part-00195-of-00500.filtered.csv.gz


 98%|██████████████████████████████████████████████████████████████████████████████▍ | 196/200 [11:31<00:26,  6.51s/it]

./data/clusterdata-2011-2/task_events\part-00196-of-00500.filtered.csv.gz


 98%|██████████████████████████████████████████████████████████████████████████████▊ | 197/200 [11:38<00:19,  6.50s/it]

./data/clusterdata-2011-2/task_events\part-00197-of-00500.filtered.csv.gz


 99%|███████████████████████████████████████████████████████████████████████████████▏| 198/200 [11:44<00:13,  6.53s/it]

./data/clusterdata-2011-2/task_events\part-00198-of-00500.filtered.csv.gz


100%|███████████████████████████████████████████████████████████████████████████████▌| 199/200 [11:51<00:06,  6.54s/it]

./data/clusterdata-2011-2/task_events\part-00199-of-00500.filtered.csv.gz
200 files appended
Writing the data to  ./data/clusterdata-2011-2/task_events/appended.h5
Task Completed


In [17]:
# Display every event row recorded for one sample task key.
final_df.loc[final_df["task_key"] == '1005190908-4']

Unnamed: 0,timestamp,event_type,priority,cpu_req,ram_req,space_req,diff_machine,task_key
1165,0,0,10,0.04376,0.003975,1.9e-05,1.0,1005190908-4
145297,0,1,10,0.04376,0.003975,1.9e-05,1.0,1005190908-4
68874,354150400311,0,10,0.04376,0.003975,1.9e-05,1.0,1005190908-4
68938,354152925043,1,10,0.04376,0.003975,1.9e-05,1.0,1005190908-4
49026,693364066812,0,10,0.04376,0.003975,1.9e-05,1.0,1005190908-4
49034,693365736659,1,10,0.04376,0.003975,1.9e-05,1.0,1005190908-4
49343,703111693103,0,10,0.04376,0.003975,1.9e-05,1.0,1005190908-4
49355,703113545279,1,10,0.04376,0.003975,1.9e-05,1.0,1005190908-4
93397,824243905383,0,10,0.04376,0.003975,1.9e-05,1.0,1005190908-4
93822,824274506192,1,10,0.04376,0.003975,1.9e-05,1.0,1005190908-4


In [4]:
# Reload the appended task-event frame from its HDF5 file under data_dir.
final_df = pd.read_hdf(data_dir+"task_events/appended.h5")

In [5]:
# Keep only the tasks that have exactly two non-null event_type rows.
# A vectorised per-row group count replaces groupby(...).filter(lambda ...),
# which invokes a Python callback once per task_key and is very slow when
# there are millions of groups. The selected rows and their order are the same.
events_per_task = final_df.groupby('task_key')['event_type'].transform('count')
grouped_df = final_df[events_per_task == 2]

In [7]:
# Drop the reference to the full event table and force a garbage-collection
# pass; the cell output is the value returned by gc.collect().
del final_df
gc.collect()

3082

In [9]:
# task_key of every remaining row whose event_type equals 1.
keys = grouped_df.loc[grouped_df['event_type'] == 1, 'task_key']

In [10]:
# Keep only the rows whose task_key occurs in `keys`.
has_key = grouped_df["task_key"].isin(keys)
grouped_df = grouped_df.loc[has_key]

In [11]:
# Discard individual rows whose timestamp is exactly 0.
grouped_df = grouped_df.loc[grouped_df['timestamp'].ne(0)]

In [12]:
# Persist grouped_df under the key 'df' in task_events/grouped.h5.
storename = data_dir+"task_events/"+"grouped.h5"
print ("Writing the data to ", storename)

# The context manager closes the HDF5 file even when the write raises,
# so a failed write does not leave the store open.
with pd.HDFStore(storename) as store:
    store['df'] = grouped_df

print ("Task Completed")

Writing the data to  ./data/clusterdata-2011-2/task_events/grouped.h5
Task Completed


In [None]:
# Reload grouped_df from task_events/grouped.h5 under data_dir.
grouped_df = pd.read_hdf(data_dir+"task_events/grouped.h5")

In [13]:
# Split the events by type: rows with event_type 0 and rows with event_type 1.
# NOTE(review): in the published Google cluster-usage trace schema, task event
# type 1 is SCHEDULE and FINISH is 4 - confirm that `finished` (and the
# time_taken derived from it) measures what the analysis assumes.
event_type = grouped_df['event_type']
submitted = grouped_df.loc[event_type == 0]
finished = grouped_df.loc[event_type == 1]

In [14]:
# Pair each type-0 row with the type-1 row sharing its task_key; overlapping
# columns receive the default "_x" (submitted) / "_y" (finished) suffixes.
finished_jobs = pd.merge(submitted, finished, how='inner', on=['task_key'])

In [15]:
# Elapsed time between the two events, in the timestamp column's own unit.
finished_jobs['time_taken'] = finished_jobs['timestamp_y'].sub(finished_jobs['timestamp_x'])

In [21]:
# Display the current column labels of finished_jobs.
finished_jobs.columns

Index(['timestamp_x', 'priority_x', 'cpu_req_x', 'ram_req_x', 'space_req_x',
       'diff_machine_x', 'task_key', 'time_taken'],
      dtype='object')

In [19]:
# Remove, in place, both event_type columns and every "_y" column that came
# from the right-hand side of the merge.
finished_jobs.drop(
    columns=[
        'event_type_x', 'timestamp_y', 'event_type_y',
        'priority_y', 'cpu_req_y', 'ram_req_y', 'space_req_y',
        'diff_machine_y',
    ],
    inplace=True,
)

In [20]:
# Scale time_taken down by a factor of one million
# (presumably microseconds -> seconds; confirm against the trace format).
finished_jobs['time_taken'] = finished_jobs['time_taken'].div(1000000)

In [22]:
# Positional relabelling of the eight remaining columns: the "_x" merge
# suffixes disappear and the space request column becomes disk_space_req.
finished_jobs.columns = [
    'timestamp',
    'priority',
    'cpu_req',
    'ram_req',
    'disk_space_req',
    'diff_machine',
    'task_key',
    'time_taken',
]

In [23]:
# Persist finished_jobs under the key 'df' in features.h5. The context manager
# closes the HDF5 file even when the write raises.
with pd.HDFStore('features.h5') as store:
    store['df'] = finished_jobs


In [24]:
# Drop the reference to finished_jobs and force a garbage-collection pass;
# the cell output is the value returned by gc.collect().
del finished_jobs
gc.collect()

1563

In [27]:
# Load the frame stored in features.h5 before rescaling. Without this the cell
# raises NameError on a fresh top-to-bottom run (no earlier cell defines
# `features`), and re-running it would divide an already-rescaled column again.
features = pd.read_hdf('features.h5')

# Divide time_taken by 60 (presumably seconds -> minutes; confirm the unit).
features['time_taken'] = features["time_taken"]/60

In [28]:
# Tukey-style fences: 1.5 interquartile ranges below the first quartile and
# above the third quartile of time_taken.
q1, q3 = np.percentile(features.time_taken, [25, 75])
IQR = q3 - q1

whisker = IQR*1.5
min_time = q1 - whisker
max_time = q3 + whisker

In [29]:
# Keep only the rows whose time_taken lies strictly between the two fences.
inside_fences = features["time_taken"].gt(min_time) & features["time_taken"].lt(max_time)
features = features.loc[inside_fences]

In [31]:
# Display the (rows, columns) shape of features.
features.shape

(6926993, 8)

In [45]:
# Pearson correlation between priority and time_taken on the rows whose index
# label is at most 10000. `.loc` slices by label and includes the end label,
# so this is not necessarily the first 10000 rows by position.
head_rows = features.loc[:10000, ['priority', 'time_taken']]
pearsonr(head_rows['priority'], head_rows['time_taken'])

(0.12648419148414072, 6.697983896542973e-37)

## Since we get a positive correlation ($r \approx 0.126$) with a p-value of $6.7 \times 10^{-37}$, far below $\alpha = 0.05$, we conclude that the correlation is statistically significant, although its magnitude is small

## The number of constraints is positively correlated with job execution time

In [11]:
# Load the per-task feature table from features.h5.
features = pd.read_hdf('features.h5')

In [14]:
# Read the first task_constraints shard (no header row in the file), label its
# columns with task_attr_header, and keep the task key plus the three
# constraint columns.
# NOTE(review): the published trace schema appears to order the last three
# columns as comparison operator, attribute name, attribute value - confirm
# that task_attr_header matches the file layout.
df = pd.read_csv(data_dir+"task_constraints/part-00000-of-00500.csv.gz", header=None)
df.columns = task_attr_header
# Same "<job_id>-<task_index>" string key as used for the task events.
df['task_key'] = df['job_id'].astype(str).str.cat(df['task_index'].astype(str), sep="-")
df = df.loc[:, ['task_key', 'attr_name', 'attr_val', 'comparison']]

In [16]:
# Read shards 1..499 of task_constraints and append them to the frame already
# held in `df`. The shards are collected in a list and concatenated once;
# calling pd.concat inside the loop re-copies the whole accumulated frame on
# every iteration, which makes the load quadratic in the total row count.
parts = [df]
for i in tqdm(range(1,500)):
    temp = pd.read_csv(data_dir + "task_constraints/part-%05d-of-00500.csv.gz" % i,header=None)
    temp.columns = task_attr_header
    temp['task_key'] = temp['job_id'].astype(str) + "-" + temp['task_index'].astype(str)
    temp = temp[['task_key','attr_name', 'attr_val', 'comparison']]
    parts.append(temp)

# Same row order and index as the incremental concat produced.
df = pd.concat(parts)

100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [21:38<00:00,  4.96s/it]


In [32]:
# Persist the combined constraints frame under the key 'df' in last.h5. The
# context manager closes the HDF5 file even when the write raises.
with pd.HDFStore('last.h5') as store:
    store['df'] = df


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['task_key', 'attr_val', 'comparison']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Reload the combined constraints frame from last.h5.
df = pd.read_hdf('last.h5')

In [34]:
# Only the task key and the attr_val column are needed for the per-task count.
df = df.loc[:, ['task_key', 'attr_val']]

In [37]:
# Number of non-null attr_val rows per task_key, with task_key restored as an
# ordinary column.
temp = df.groupby('task_key').count().reset_index()

In [38]:
# Preview the first rows of the per-task counts.
temp.head()

Unnamed: 0,task_key,attr_val
0,1005190908-0,17
1,1005190908-1,17
2,1005190908-2,20
3,1005190908-3,17
4,1005190908-4,23


In [39]:
# Preview the first rows of the feature table.
features.head()

Unnamed: 0,timestamp,priority,cpu_req,ram_req,disk_space_req,different_machine,task_key,time_taken
0,604046280,9,0.0625,0.0159,3.815e-06,0.0,6251639646-0,71.205425
1,612141654,0,0.0625,0.004662,7.629e-06,0.0,6251668759-0,50.12482
2,617115446,1,0.03125,0.0159,3.815e-05,0.0,6251668917-0,36.169218
3,619396393,2,0.006248,0.001554,9.537e-07,0.0,6251668761-0,29.633532
4,633214031,2,0.006248,0.003109,9.537e-06,0.0,6251668915-0,54.648233


In [42]:
# Inner-join the per-task counts with the feature table on task_key.
merged = temp.merge(features, on='task_key')

In [44]:
# Preview the first rows of the joined table.
merged.head()

Unnamed: 0,task_key,attr_val,timestamp,priority,cpu_req,ram_req,disk_space_req,different_machine,time_taken
0,6251632096-0,3,13566340509,9,0.000625,0.004662,1.907e-05,0.0,11485.593187
1,6251660967-0,3,13948667869,2,0.0625,0.0159,0.0001154,0.0,20227.707711
2,6251668761-0,3,619396393,2,0.006248,0.001554,9.537e-07,0.0,29.633532
3,6251668915-0,3,633214031,2,0.006248,0.003109,9.537e-06,0.0,54.648233
4,6251669132-0,3,633230954,2,0.03125,0.01125,7.629e-06,0.0,35.345963


In [46]:
# Display the (rows, columns) shape of the joined table.
merged.shape

(257939, 9)

In [49]:
# Keep the three columns used for the correlation, drop rows containing any
# missing value, and display the resulting shape.
merged = merged[['task_key', 'attr_val', 'time_taken']].dropna()
merged.shape

(257939, 3)

In [62]:
# Pearson correlation between the per-task count (attr_val) and time_taken
# divided by 60.
pearsonr(merged['attr_val'], merged['time_taken'].div(60))

(0.052646977425951334, 1.034319389113024e-157)