In [51]:
import os
import numpy as np
import pandas as pd
from pandas._libs.tslibs import dtypes, timestamps
import subprocess
from functools import reduce
import matplotlib
matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt
# Using matplotlib backend: QtAgg

In [52]:
def printRow(row):
    """Print one log record as three lines: timestamp, action, separator.

    `row` is anything indexable by the keys 'system_time' and 'action'
    (e.g. a row yielded by DataFrame.iterrows()).
    """
    print("TIME:", row['system_time'])
    # One call, newline-separated: the action text followed by the divider.
    print(row['action'], "==============", sep="\n")

# --- Experiment selection ---------------------------------------------------
# These values pick which run's log is fetched. They are concatenated with "_"
# to build the remote directory name, hence they are kept as strings.
system = "slurm"
model = "alexnet"
n_nodes = "4"
n_epochs = "2"
batch_size = "64"
save_every = "1"
log = "true"

stat_test = "eBPFs_subset"

# Per-host statistics root and test-name layout (only the slurm layout
# includes the node count).
if system == "aurora01":
    stat_dir = "/home/gsd/andrelucena/statistics"
    test_name = model + "_" + n_epochs + "_" + batch_size + "_" + save_every + "_" + log
elif system == "slurm":
    stat_dir = "/projects/a97485/statistics"
    test_name = model + "_" + n_nodes + "_" + n_epochs + "_" + batch_size + "_" + save_every + "_" + log
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f"Unsupported system: {system!r} (expected 'aurora01' or 'slurm')")

full_test_path = stat_dir + "/" + stat_test + "/" + test_name
if system == "slurm":
    full_test_path += "/aurora03"

# --- Fetch and load the raw log ---------------------------------------------
local_copy = "./file"
try:
    # check=True raises CalledProcessError when scp fails, so a failed
    # transfer can no longer fall through to parsing a missing/stale file.
    subprocess.run(["scp", f"{system}:{full_test_path}/out.out", local_copy], check=True)

    # The first five lines are skipped and every remaining line is loaded into
    # the single 'system_time' column.
    # NOTE(review): '/' looks like it was picked as a delimiter that never
    # occurs in the log, so that each line stays in one field — confirm.
    df = pd.read_csv(local_copy, skiprows=[0,1,2,3,4], names=['system_time'], index_col=False, delimiter='/')
finally:
    # Always clean up the temporary copy, even if the transfer or parse failed.
    if os.path.exists(local_copy):
        os.remove(local_copy)

# --- Parse each line into (timestamp, action) --------------------------------
# system_time: text up to (not including) the third ':' after optional leading
# whitespace; action: the rest of the line. Non-matching lines become NaN and
# are dropped.
df = df['system_time'].str.extract(r'[\t\s]*(?P<system_time>[^\t:]*:[^:]*:[^:]*):(?P<action>.*)')
df = df.dropna()
print(df.head())
df['system_time'] = pd.to_datetime(df['system_time'], format='ISO8601')
# Duration of a record = gap until the next retained record (NaT for the last).
df['duration'] = df['system_time'].shift(-1) - df['system_time']

                  system_time                                    action
0  2024-11-04 14:53:56.121752                            Training begin
1  2024-11-04 14:53:56.122318                          Training epoch 1
3  2024-11-04 14:53:56.774763                Start Training Iteration 0
4  2024-11-04 14:53:56.775553   Moving data to the same device as model
5  2024-11-04 14:53:56.776301                          Computing output


In [53]:
# Display the dtype of every column in `df`.
df.dtypes

system_time     datetime64[ns]
action                  object
duration       timedelta64[ns]
dtype: object

In [54]:
print(df['action'].unique())
# Collapse messages that embed an epoch/iteration number into one label per
# kind of action. `\d+` (rather than `\d`) keeps the checkpoint patterns
# matching once the epoch number has more than one digit.
df['action'] = df['action'].replace([r'Training epoch.*', r'Trained epoch.*', r'Epoch \d+ \| Saving.*', r'Epoch \d+ \| Checkpoint.*', r'Ended.*', r'Start.*'],
                                    ['Training Epoch', 'Trained Epoch', 'Saving Checkpoint', 'Saved Checkpoint', 'Ended Training Iteration', 'Start Training Iteration'],
                                    regex=True)
# Drop the accuracy report lines; .copy() makes the filtered frame independent
# so the column assignment below does not write into a slice of the old frame.
df = df[~df['action'].str.contains('Accuracy')].copy()
df['action'] = df['action'].astype(str)
print(df['action'].value_counts())

[' Training begin' ' Training epoch 1' ' Start Training Iteration 0' ...
 " Accuracy top1: tensor([0.3707], device='cuda:0'); Accuracy top5: tensor([2.0060], device='cuda:0')"
 ' Epoch 2 | Saving checkpoint at checkpoint.pt'
 ' Epoch 2 | Checkpoint saved at checkpoint.pt']
action
Moving data to the same device as model    2504
Start Training Iteration                   2504
Computing Loss                             2504
Computing output                           2504
Ended Training Iteration                   2504
SGD step                                   2504
Compute gradient                           2504
Training Epoch                                2
Saving Checkpoint                             2
Trained Epoch                                 2
Saved Checkpoint                              2
Training begin                                1
Name: count, dtype: int64


In [55]:
def concat(series):
    """Join the strings in `series` into a single newline-separated string.

    Parameters
    ----------
    series : iterable of str
        For example a pandas Series of action labels.

    Returns
    -------
    str
        The items joined with "\\n"; an empty string for an empty input
        (the previous reduce-based version raised TypeError in that case).
    """
    # str.join builds the result in one pass instead of repeated concatenation.
    return "\n".join(series)

# Number of records to print one by one below; set to None to print every
# record (the previous behaviour, which emits three lines per row of `df`).
PREVIEW_ROWS = 20

print(df)

#df = df.groupby('system_time', as_index=False).agg({'action':concat})

preview = df if PREVIEW_ROWS is None else df.head(PREVIEW_ROWS)
for _, row in preview.iterrows():
    printRow(row)

                     system_time                                    action  \
0     2024-11-04 14:53:56.121752                            Training begin   
1     2024-11-04 14:53:56.122318                            Training Epoch   
3     2024-11-04 14:53:56.774763                  Start Training Iteration   
4     2024-11-04 14:53:56.775553   Moving data to the same device as model   
5     2024-11-04 14:53:56.776301                          Computing output   
...                          ...                                       ...   
17535 2024-11-04 15:18:34.414500                                  SGD step   
17536 2024-11-04 15:18:34.415726                  Ended Training Iteration   
17537 2024-11-04 15:18:34.473415                             Trained Epoch   
17539 2024-11-04 15:18:35.844379                         Saving Checkpoint   
17540 2024-11-04 15:18:37.049690                          Saved Checkpoint   

                    duration  
0     0 days 00:00:00.000566  
1

In [56]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 17537 entries, 0 to 17540
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype          
---  ------       --------------  -----          
 0   system_time  17537 non-null  datetime64[ns] 
 1   action       17537 non-null  object         
 2   duration     17536 non-null  timedelta64[ns]
dtypes: datetime64[ns](1), object(1), timedelta64[ns](1)
memory usage: 548.0+ KB
None
                 system_time                                    action  \
0 2024-11-04 14:53:56.121752                            Training begin   
1 2024-11-04 14:53:56.122318                            Training Epoch   
3 2024-11-04 14:53:56.774763                  Start Training Iteration   
4 2024-11-04 14:53:56.775553   Moving data to the same device as model   
5 2024-11-04 14:53:56.776301                          Computing output   

                duration  
0 0 days 00:00:00.000566  
1 0 days 00:00:00.652445  
3 0 days 00:00:00.000790  
4 0 days

In [57]:
def plotAction(df_param: pd.DataFrame):
    """Show a marker-only plot of action against time.

    Opens a new figure, draws one '.' marker per row of `df_param` with the
    'system_time' column on the x axis and the 'action' column on the y axis,
    then calls plt.show().
    """
    axes = plt.figure().add_subplot()
    axes.plot(
        df_param['system_time'],
        df_param['action'],
        marker=".",
        markersize=10.0,
        linestyle='None',
    )
    axes.set_xlabel('Time')
    axes.set_ylabel('Action')
    axes.set_title('Action per time')
    plt.show()

In [58]:
# Plot action against time for the current contents of `df`.
plotAction(df)

In [59]:
# Mean duration per action label, longest first.
elapsed_time = df.groupby('action', as_index=False).agg({'duration':'mean'}).sort_values('duration', ascending=False)
print(elapsed_time)
plt.figure()
# Plot the mean as plain seconds. Converting the timedeltas with
# pd.to_datetime(..., unit='ns') turned them into timestamps near the 1970
# epoch, which made the y axis a date axis rather than a duration.
plt.bar(elapsed_time['action'], elapsed_time['duration'].dt.total_seconds())
plt.xlabel('Action')
# The values are means (see the agg above), not totals.
plt.ylabel('Average Time Spent (s)')
plt.title('Average Time per Action')
plt.show()

                                      action                  duration
7                          Saving Checkpoint 0 days 00:00:01.182253500
10                            Training Epoch 0 days 00:00:00.576676500
0                           Compute gradient 0 days 00:00:00.531880249
2                           Computing output 0 days 00:00:00.030375014
1                             Computing Loss 0 days 00:00:00.018066396
5                                   SGD step 0 days 00:00:00.006487561
9                              Trained Epoch 0 days 00:00:00.000943500
3                   Ended Training Iteration 0 days 00:00:00.000936161
4    Moving data to the same device as model 0 days 00:00:00.000860315
8                   Start Training Iteration 0 days 00:00:00.000808331
6                           Saved Checkpoint    0 days 00:00:00.000636
11                            Training begin    0 days 00:00:00.000566
