In [275]:
import pandas as pd
import json

# Show all columns
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Optionally, adjust the display width for better readability
pd.set_option("display.width", 1000)


In [263]:
workflows_file = 'output_data/workflows_7369d7fb-c356-49cd-893a-e4ceb319e588.json'
tasks_file = 'output_data/tasks_ffcd2234-6424-43c6-9014-4f1e145d977b.parquet'
workflows_df = pd.read_json(workflows_file)
tasks_df = pd.read_parquet(tasks_file)
len(tasks_df)

149

In [265]:
merged_df = pd.merge(tasks_df, workflows_df, on="workflow_id", suffixes=("_task", "_workflow"))
print(merged_df.columns)

Index(['task_id', 'telemetry_at_start', 'status', 'address', 'used_task', 'workflow_id', 'adapter_id_task', 'utc_timestamp_task', 'node_name', 'login_name', 'hostname', 'running', 'generated_task', 'telemetry_at_end', 'started_at', 'ended_at', 'finished', 'submitted_at', 'activity_id', 'submitted', 'ancestor_ids', 'depth', 'subtype', 'group_id', 'parent_task_id', 'custom_characterization', 'custom_metadata_task', '_id', 'campaign_id', 'generated_workflow', 'name', 'type', 'used_workflow', 'custom_metadata_workflow', 'environment_id', 'extra_metadata', 'flowcept_settings', 'flowcept_version', 'interceptor_ids', 'machine_info', 'parent_workflow_id', 'sys_name', 'user', 'utc_timestamp_workflow', 'adapter_id_workflow'], dtype='object')


In [266]:
#dict_fields = {"generated_task", "used_task", "custom_metadata_task"}
dict_fields = {"generated_task", "used_task", "custom_metadata_task", "custom_characterization"}
for dict_field in dict_fields:
    normalized_df = pd.json_normalize(merged_df[dict_field].apply(lambda x: json.loads(x)))
    normalized_df = normalized_df.rename(
        columns={col: f"{dict_field}_{col}" for col in normalized_df.columns}
    )

    merged_df = pd.concat([merged_df.reset_index(drop=True), normalized_df], axis=1)


df = merged_df
print(list(df.columns))

['task_id', 'telemetry_at_start', 'status', 'address', 'used_task', 'workflow_id', 'adapter_id_task', 'utc_timestamp_task', 'node_name', 'login_name', 'hostname', 'running', 'generated_task', 'telemetry_at_end', 'started_at', 'ended_at', 'finished', 'submitted_at', 'activity_id', 'submitted', 'ancestor_ids', 'depth', 'subtype', 'group_id', 'parent_task_id', 'custom_characterization', 'custom_metadata_task', '_id', 'campaign_id', 'generated_workflow', 'name', 'type', 'used_workflow', 'custom_metadata_workflow', 'environment_id', 'extra_metadata', 'flowcept_settings', 'flowcept_version', 'interceptor_ids', 'machine_info', 'parent_workflow_id', 'sys_name', 'user', 'utc_timestamp_workflow', 'adapter_id_workflow', 'generated_task_test_loss', 'generated_task_train_loss', 'generated_task_val_loss', 'generated_task_training_time', 'generated_task_best_obj_id', 'generated_task_tensor.id', 'generated_task_tensor.is_sparse', 'generated_task_tensor.shape', 'generated_task_tensor.device', 'generate

In [296]:
characterization_columns = [c for c in df.columns if c.startswith('custom_characterization')][1:]
tensor_columns = [c for c in df.columns if "task_tensor" in c]

In [299]:
print(characterization_columns)

['custom_characterization_model', 'custom_characterization_train_n_batches', 'custom_characterization_train_batch_size', 'custom_characterization_subset_size', 'custom_characterization_tokenizer_type', 'custom_characterization_workflow_id', 'custom_characterization_name', 'custom_characterization_step', 'custom_characterization_batch', 'custom_characterization_data_path', 'custom_characterization_batch_size', 'custom_characterization_epoch', 'custom_characterization_model_train', 'custom_characterization_module']


In [302]:
reordered_df = df[["activity_id"] + ['custom_characterization_step', 'custom_characterization_epoch', 'custom_characterization_batch'] + tensor_columns + ['task_id']] 
reordered_df.sort_values(['custom_characterization_step', 'custom_characterization_epoch', 'custom_characterization_batch'])

Unnamed: 0,activity_id,custom_characterization_step,custom_characterization_epoch,custom_characterization_batch,generated_task_tensor.id,generated_task_tensor.is_sparse,generated_task_tensor.shape,generated_task_tensor.device,generated_task_tensor.nbytes,generated_task_tensor.numel,generated_task_tensor.density,used_task_tensor.id,used_task_tensor.is_sparse,used_task_tensor.shape,used_task_tensor.device,used_task_tensor.nbytes,used_task_tensor.numel,used_task_tensor.density,task_id
11,TransformerModel,eval,1.0,0.0,4631063000.0,False,"[10, 10, 254]",mps:0,101600.0,25400.0,1.0,5300633000.0,False,"[10, 10]",mps:0,800.0,100.0,0.96,1735619423.024822
12,Embedding,eval,1.0,0.0,5300633000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,5300633000.0,False,"[10, 10]",mps:0,800.0,100.0,0.96,1735619423.0352378
13,PositionalEncoding,eval,1.0,0.0,4710267000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,4634547000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,1735619423.049771
14,TransformerEncoder,eval,1.0,0.0,5300708000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,4710267000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,1735619423.055607
15,Linear,eval,1.0,0.0,4631063000.0,False,"[10, 10, 254]",mps:0,101600.0,25400.0,1.0,5300708000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,1735619423.115511
16,TransformerModel,eval,1.0,1.0,5300709000.0,False,"[10, 10, 254]",mps:0,101600.0,25400.0,1.0,5300709000.0,False,"[10, 10]",mps:0,800.0,100.0,0.92,1735619423.1419961
17,Embedding,eval,1.0,1.0,5300633000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,5300709000.0,False,"[10, 10]",mps:0,800.0,100.0,0.92,1735619423.145136
18,PositionalEncoding,eval,1.0,1.0,5300709000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,5300707000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,1735619423.1483781
19,TransformerEncoder,eval,1.0,1.0,5300707000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,5300709000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,1735619423.151233
20,Linear,eval,1.0,1.0,5300709000.0,False,"[10, 10, 254]",mps:0,101600.0,25400.0,1.0,5300707000.0,False,"[10, 10, 200]",mps:0,80000.0,20000.0,1.0,1735619423.1565409


In [279]:
profile = df[(df.activity_id=='TransformerModel')]['custom_metadata_workflow'].values[0]['model_profile']['modules'].copy()

In [280]:
for key in profile:
    if "first_level_child" in profile[key]:
        int_keys = []
        for in_key in profile[key]:
            if isinstance(profile[key][in_key], int) and not isinstance(profile[key][in_key], bool):
                int_keys.append((in_key,profile[key][in_key]))
                
        print(f"{profile[key]['type']}, {int_keys}")
        print()
        

PositionalEncoding, []

TransformerEncoder, [('num_layers', 2)]

Embedding, [('num_embeddings', 254), ('embedding_dim', 200)]

Linear, [('in_features', 200), ('out_features', 254)]



In [281]:
activities = reordered_df['activity_id'].unique()
for activity in activities:
    if activity not in {"model_train"}:
        #for step in {"train", "test"}:
        _df = reordered_df[(reordered_df["activity_id"] == activity)]
        
        display(_df[['activity_id', 'task_id', 'parent_task_id', 'used_task_tensor.shape', 'used_task_tensor.density']])
        print()

Unnamed: 0,activity_id,task_id,parent_task_id,used_task_tensor.shape,used_task_tensor.density
1,TransformerModel,1735619422.384991,1735619422.3836532,"[20, 20]",0.94
6,TransformerModel,1735619422.8408608,1735619422.838924,"[6, 20]",0.933333
11,TransformerModel,1735619423.024822,1735619423.0235431,"[10, 10]",0.96
16,TransformerModel,1735619423.141996,1735619423.1403918,"[10, 10]",0.92
21,TransformerModel,1735619423.1629992,1735619423.161933,"[10, 10]",0.93
26,TransformerModel,1735619423.183827,1735619423.182692,"[10, 10]",0.96
31,TransformerModel,1735619423.5910249,1735619423.589657,,
36,TransformerModel,1735619423.6211,1735619423.6190772,,
41,TransformerModel,1735619423.775104,1735619423.7189848,,
46,TransformerModel,1735619423.8533518,1735619423.85154,,





Unnamed: 0,activity_id,task_id,parent_task_id,used_task_tensor.shape,used_task_tensor.density
2,Embedding,1735619422.45859,1735619422.384991,"[20, 20]",0.94
7,Embedding,1735619422.851376,1735619422.8408608,"[6, 20]",0.933333
12,Embedding,1735619423.0352378,1735619423.024822,"[10, 10]",0.96
17,Embedding,1735619423.145136,1735619423.141996,"[10, 10]",0.92
22,Embedding,1735619423.166184,1735619423.1629992,"[10, 10]",0.93
27,Embedding,1735619423.184326,1735619423.183827,"[10, 10]",0.96
32,Embedding,1735619423.592518,1735619423.5910249,,
37,Embedding,1735619423.621748,1735619423.6211,,
42,Embedding,1735619423.8383129,1735619423.775104,,
47,Embedding,1735619423.853365,1735619423.8533518,,





Unnamed: 0,activity_id,task_id,parent_task_id,used_task_tensor.shape,used_task_tensor.density
3,PositionalEncoding,1735619422.490431,1735619422.384991,"[20, 20, 200]",1.0
8,PositionalEncoding,1735619422.8681228,1735619422.8408608,"[6, 20, 200]",1.0
13,PositionalEncoding,1735619423.049771,1735619423.024822,"[10, 10, 200]",1.0
18,PositionalEncoding,1735619423.148378,1735619423.141996,"[10, 10, 200]",1.0
23,PositionalEncoding,1735619423.169297,1735619423.1629992,"[10, 10, 200]",1.0
28,PositionalEncoding,1735619423.187089,1735619423.183827,"[10, 10, 200]",1.0
33,PositionalEncoding,1735619423.5941482,1735619423.5910249,,
38,PositionalEncoding,1735619423.623206,1735619423.6211,,
43,PositionalEncoding,1735619423.840831,1735619423.775104,,
48,PositionalEncoding,1735619423.854932,1735619423.8533518,,





Unnamed: 0,activity_id,task_id,parent_task_id,used_task_tensor.shape,used_task_tensor.density
4,TransformerEncoder,1735619422.518063,1735619422.384991,"[20, 20, 200]",0.800625
9,TransformerEncoder,1735619422.885477,1735619422.8408608,"[6, 20, 200]",0.805792
14,TransformerEncoder,1735619423.055607,1735619423.024822,"[10, 10, 200]",1.0
19,TransformerEncoder,1735619423.151233,1735619423.141996,"[10, 10, 200]",1.0
24,TransformerEncoder,1735619423.171925,1735619423.1629992,"[10, 10, 200]",1.0
29,TransformerEncoder,1735619423.189974,1735619423.183827,"[10, 10, 200]",1.0
34,TransformerEncoder,1735619423.595535,1735619423.5910249,,
39,TransformerEncoder,1735619423.6246452,1735619423.6211,,
44,TransformerEncoder,1735619423.842046,1735619423.775104,,
49,TransformerEncoder,1735619423.856357,1735619423.8533518,,





Unnamed: 0,activity_id,task_id,parent_task_id,used_task_tensor.shape,used_task_tensor.density
5,Linear,1735619422.618046,1735619422.384991,"[20, 20, 200]",1.0
10,Linear,1735619422.94244,1735619422.8408608,"[6, 20, 200]",1.0
15,Linear,1735619423.115511,1735619423.024822,"[10, 10, 200]",1.0
20,Linear,1735619423.1565409,1735619423.141996,"[10, 10, 200]",1.0
25,Linear,1735619423.176995,1735619423.1629992,"[10, 10, 200]",1.0
30,Linear,1735619423.1951969,1735619423.183827,"[10, 10, 200]",1.0
35,Linear,1735619423.602164,1735619423.5910249,,
40,Linear,1735619423.630666,1735619423.6211,,
45,Linear,1735619423.847615,1735619423.775104,,
50,Linear,1735619423.8609228,1735619423.8533518,,





In [15]:
len(tasks_df)

158

In [19]:
tasks_df[tasks_df.activity_id == 'train_batch'][['task_id', 'parent_task_id', 'used']]

Unnamed: 0,task_id,parent_task_id,used
6,5094809456,1735502223.258385,
44,5316297360,1735502222.938216,
82,5071481712,1735502222.29404,
120,5078006240,1735502220.758522,


In [22]:
tasks_df[tasks_df.activity_id == 'train_batch_iteration'][['task_id', 'parent_task_id', 'used']]

Unnamed: 0,task_id,parent_task_id,used
32,1735502223.261112,5094809456,"{""i"": 0, ""batch"": [0, 0]}"
33,1735502223.2893682,5094809456,"{""i"": 1, ""batch"": [1, 20]}"
70,1735502222.940667,5316297360,"{""i"": 0, ""batch"": [0, 0]}"
71,1735502222.974849,5316297360,"{""i"": 1, ""batch"": [1, 20]}"
108,1735502222.2968051,5071481712,"{""i"": 0, ""batch"": [0, 0]}"
109,1735502222.349918,5071481712,"{""i"": 1, ""batch"": [1, 20]}"
146,1735502220.760619,5078006240,"{""i"": 0, ""batch"": [0, 0]}"
147,1735502221.447211,5078006240,"{""i"": 1, ""batch"": [1, 20]}"


In [32]:
tasks_df[tasks_df.activity_id == 'TransformerModel'][['task_id', 'parent_task_id', 'used']]

Unnamed: 0,task_id,parent_task_id,used
12,1735502223.527418,1735502223.525253,
17,1735502223.3520389,1735502223.348997,
22,1735502223.337213,1735502223.3353548,
27,1735502223.321942,1735502223.320519,
34,1735502223.291469,1735502223.2893682,
39,1735502223.263368,1735502223.261112,
50,1735502223.119127,1735502223.116868,
55,1735502223.105198,1735502223.103198,
60,1735502223.0897129,1735502223.0878289,
65,1735502223.007659,1735502223.00581,


In [47]:
tasks_df[tasks_df.activity_id=='model_train']['used'][0]

'{"batch_size": 20, "eval_batch_size": 10, "emsize": 200, "nhid": 200, "nlayers": 2, "nhead": 2, "dropout": 0.2, "epochs": 4, "lr": 0.1, "pos_encoding_max_len": 5000, "ntokens": 254, "train_data_path": "/Users/rsr/Documents/GDrive/ORNL/dev/flowcept/examples/llm_complex/input_data/train_data.tensor", "val_data_path": "/Users/rsr/Documents/GDrive/ORNL/dev/flowcept/examples/llm_complex/input_data/val_data.tensor", "test_data_path": "/Users/rsr/Documents/GDrive/ORNL/dev/flowcept/examples/llm_complex/input_data/test_data.tensor", "campaign_id": "4831fd04-d480-41c5-ad07-362f963e0b1c"}'

In [None]:
# semantic_id_val(batch) = {
#     **dataprep_workflow.used,
#     workflow__generate_wikitext_dataset.generated.train_data_path,
#     train_batch_iteration.batch,
#     dataprep_workflow.generated.train_n_batches,
# }

In [None]:
#semantic_id(epoch_task_id) = epoch_task.epoch+search_task_id

In [34]:
#semantic_id(train_batch_iteration) = {
#     "workflow__generate_wikitext_dataset": {workflow_id, **dataprep_workflow.used},
#     "train_data_path": workflow__generate_wikitext_dataset.generated.train_data_path
#     "batch": train_batch_iteration.batch,
#     "step": train_batch_iteration.parent_epoch.step,
#     **{semantic_id(train_batch_iteration.parent_task_id)}
# } | { workflow__generate_wikitext_dataset = workflow, where workflow.generated.train_data_path == train_batch_iteration.parent_epoch.parent_search_task.used.train_data_path }


In [None]:
#semantic_id(parent_forward) = parent_forward.activity_id+semantic_id(parent_forward.parent_task_id)

In [None]:
#semantic_id(child_forward) = child_forward.activity_id+semantic_id(child_forward.parent_task_id) 

In [59]:
tasks_df.activity_id.unique()

array(['model_train', 'epochs_loop_iteration', 'train_batch_iteration',
       'eval_batch_iteration', 'TransformerModel', 'Embedding',
       'PositionalEncoding', 'TransformerEncoder', 'Linear'], dtype=object)

In [254]:
json.loads(tasks_df.ancestor_ids[40])

[{'model_train': 'model_train-35ede61e154e240c9d1a8802bb37dd22'},
 {'epochs_loop_iteration': '1735613700.0566132'},
 {'eval_batch_iteration': '1735613700.8849669'},
 {'TransformerModel': '1735613700.886549'}]

In [164]:
from llm_model import get_batch
from llm_dataprep import yield_tokens

In [255]:
prov = json.loads(tasks_df.custom_characterization[40])
prov

[{'module': 'Linear'},
 {'model': 'TransformerModel'},
 {'step': 'eval,',
  'batch': 3,
  'data_path': '/Users/rsr/Documents/GDrive/ORNL/dev/flowcept/examples/llm_complex/input_data/val_data.tensor',
  'batch_size': 10},
 {'epoch': 1},
 {'model_train': 'model_train-35ede61e154e240c9d1a8802bb37dd22'}]

In [239]:
import torch
processed_dataset = torch.load(prov[2]['data_path'])
batch_i = prov[2]['batch']
batch_size = prov[2]['batch_size']

TypeError: list indices must be integers or slices, not str

In [213]:
val_mapping = torch.load('/Users/rsr/Documents/GDrive/ORNL/dev/flowcept/examples/llm_complex/input_data/val_data_mapping.tensor')

In [130]:
batch_X, batch_y = get_batch(processed_dataset, batch_i, batch_size)

In [155]:
len(processed_dataset)

41

In [237]:
import os
from torch.utils.data import Subset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from datasets import load_dataset, load_from_disk
from llm_dataprep import get_raw_batch

ImportError: cannot import name 'get_raw_batch' from 'llm_dataprep' (/Users/rsr/Documents/GDrive/ORNL/dev/flowcept/examples/llm_complex/llm_dataprep.py)

In [216]:
def get_raw_batch(raw_dataset, mapping, i, batch_size):
    start_idx = i
    end_idx = i + batch_size
    raw_indices = mapping[start_idx:end_idx]  # Indices of raw items in this batch
    raw_batch = [raw_dataset[idx]["text"] for idx in set(raw_indices)]  # Unique raw items
    return raw_batch



In [109]:
dataset_path = os.path.join('input_data', "wikitext-2-v1.data")

In [112]:
raw_dataset = load_dataset("wikitext", "wikitext-2-v1")

In [224]:
raw_validation_dataset = raw_dataset["validation"]
subset_size = 10
raw_validation_dataset = Subset(raw_validation_dataset, range(subset_size))

In [225]:
d = get_raw_batch(raw_validation_dataset, val_mapping, batch_i, batch_size)

In [226]:
d[1]

' Homarus gammarus , known as the European lobster or common lobster , is a species of <unk> lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H. americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming " lobster red " on cooking . Mating occurs in the summer , producing eggs which are carried by the females for up to a year before hatching into <unk> larvae . Homarus gammarus is a highly esteemed food , and is widely caught using lobster pots , mostly around the British Isles . \n'

In [158]:
len(raw_validation_dataset)

10

In [208]:
def get_raw_batch(source, i, batch_size=35):
    seq_len = min(batch_size, len(source) - 1 - i)
    raw_data = list(source)
    print(i , i + seq_len)
    data = raw_data[i : i + seq_len]
    return data

In [212]:
raw_batch = get_raw_batch(raw_validation_dataset, i=1, batch_size=10)

1 9


In [205]:
batch_size

10

In [198]:
len(raw_validation_dataset)

10

In [199]:
seq_len = min(batch_size, len(raw_validation_dataset) - 1 - batch_i)

In [202]:
batch_i,seq_len

(3, 6)

In [192]:
batch_size

10

In [188]:
batch_i

3

In [203]:
raw_batch

[{'text': ' Homarus gammarus , known as the European lobster or common lobster , is a species of <unk> lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H. americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming " lobster red " on cooking . Mating occurs in the summer , producing eggs which are carried by the females for up to a year before hatching into <unk> larvae . Homarus gammarus is a highly esteemed food , and is widely caught using lobster pots , mostly around the British Isles . \n'},
 {'text': ''},
 {'text': ' = = Description = = \n'},
 {'text': ''},
 {'text': ' Homarus gammarus is a large <unk> , with a body length up to 60 centimetres ( 24 in ) and weighing up to 5 – 6 kilograms ( 11 – 13 lb ) , although the lobsters caught in lobster pots are usually 23 – 38 cm ( 

In [131]:
print(len(batch_X), len(batch_y))

10 100


In [132]:
print(len(raw_batch_X),len(raw_batch_y))

1 1


In [135]:
raw_batch_X['text']

[' Homarus gammarus , known as the European lobster or common lobster , is a species of <unk> lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H. americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming " lobster red " on cooking . Mating occurs in the summer , producing eggs which are carried by the females for up to a year before hatching into <unk> larvae . Homarus gammarus is a highly esteemed food , and is widely caught using lobster pots , mostly around the British Isles . \n',
 '',
 ' = = Description = = \n',
 '',
 ' Homarus gammarus is a large <unk> , with a body length up to 60 centimetres ( 24 in ) and weighing up to 5 – 6 kilograms ( 11 – 13 lb ) , although the lobsters caught in lobster pots are usually 23 – 38 cm ( 9 – 15 in ) long and weigh 0 @.@ 7 – 2 @.@ 2 kg (

In [169]:
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(yield_tokens(tokenizer, raw_dataset["train"]))
vocab.set_default_index(vocab["<unk>"])

In [172]:
vocab

Vocab()