# Import Packages

In [1]:
import json
import os
import pandas as pd
from datetime import datetime 

# Example JSON Load

In [7]:
sample = {
    "col1": {
        "Layer2":[{
            "Layer3"
        }]
    }, 
    "col2": "Hello"
}

In [8]:
pd.DataFrame.from_dict(sample)

Unnamed: 0,col1,col2
Layer2,[{Layer3}],Hello


# Load Data

In [10]:
data_folder = "raw_data/"
output_dir = "output_data/"
file = "response1.json"
raw_json = None

#Handy for making an output dir inline if needed. 
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

#I always read JSON in as JSON first and then parse. 
with open(data_folder + file, "r+") as f:
    raw_json = json.load(f)
raw_json

{'logs': [{'_id': 'Bash:cb24ee90-447c-4119-9973-6c2305e12abe',
   '_rev': '292-261a171d16df8e4785c061f2f1ca4dca',
   'course_id': 'Cyverse-RNA-Tutorial',
   'created_timestamp': '2022-09-29 19:20:08.685273',
   'log': {'commands': [{'command': 'ls',
      'file': '',
      'filename': '',
      'full_command': 'ls',
      'stderr': '',
      'stdout': 'data\nhome\nstderr.txt\nSuperShell',
      'time': '2022-09-29:19:19'},
     {'command': 'ls',
      'file': '',
      'filename': '.',
      'full_command': 'ls .',
      'stderr': '',
      'stdout': 'data\nhome\nstderr.txt\nSuperShell',
      'time': '2022-09-29:19:20'},
     {'command': 'clear',
      'file': '',
      'filename': '',
      'full_command': 'clear',
      'stderr': '',
      'stdout': '\x1b[H\x1b[2J\x1b[3J',
      'time': '2022-09-29:19:20'},
     {'command': 'cd',
      'file': '',
      'filename': '',
      'full_command': 'cd Bash',
      'stderr': '/home/jovyan/.offline.sh: line 354: cd: Bash: No such file or dir

In [19]:
#Iterate and load all files in a folder
loaded = []
for i in os.listdir(data_folder):
    with open(data_folder + i, "r+") as f:
        load = json.load(f)
        loaded.append(load)
loaded

[{'logs': [{'_id': 'Bash:cb24ee90-447c-4119-9973-6c2305e12abe',
    '_rev': '292-261a171d16df8e4785c061f2f1ca4dca',
    'course_id': 'Cyverse-RNA-Tutorial',
    'created_timestamp': '2022-09-29 19:20:08.685273',
    'log': {'commands': [{'command': 'ls',
       'file': '',
       'filename': '',
       'full_command': 'ls',
       'stderr': '',
       'stdout': 'data\nhome\nstderr.txt\nSuperShell',
       'time': '2022-09-29:19:19'},
      {'command': 'ls',
       'file': '',
       'filename': '.',
       'full_command': 'ls .',
       'stderr': '',
       'stdout': 'data\nhome\nstderr.txt\nSuperShell',
       'time': '2022-09-29:19:20'},
      {'command': 'clear',
       'file': '',
       'filename': '',
       'full_command': 'clear',
       'stderr': '',
       'stdout': '\x1b[H\x1b[2J\x1b[3J',
       'time': '2022-09-29:19:20'},
      {'command': 'cd',
       'file': '',
       'filename': '',
       'full_command': 'cd Bash',
       'stderr': '/home/jovyan/.offline.sh: line 354:

# Process Data 

## Build Command Records - Array of JSON Objects

In [21]:
what_it_expects = [{"Animal": "Cat", "Size": "Medium"}, {"Animal": "Dog", "Size": "Large"}]
pd.DataFrame.from_records(what_it_expects)

Unnamed: 0,Animal,Size
0,Cat,Medium
1,Dog,Large


In [29]:
raw_records = []
log_list = raw_json['logs']
for i in log_list:
    #Variables to keep at the highest level
    log_id = i['log_id']
    course = i['course_id']
    log_type = i['log_type']
    machine_id = i['machine_id']
    
    #Base unit is the series of commands. 
    for event in i['log']['commands']:
        #Build the common columns for all events
        to_add = {
            "Log ID": log_id, 
            "Course": course, 
            "Log Type": log_type,
        }
        
        #add to dict with equal
        to_add['Machine ID'] = machine_id
        
        #Add each command to array and update 
        to_add.update(event)
        
        #Sometimes processing type prior to building frame is easiest
        #For example, pandas can understand datetime, but right now it is a string. 
        #Convert the time into a string using this datetime function.
        to_add['time'] = datetime.strptime(to_add['time'], "%Y-%m-%d:%H:%M")
        
        raw_records.append(to_add)

#Look at the records we have made
raw_records[0:1]

[{'Log ID': '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  'Course': 'Cyverse-RNA-Tutorial',
  'Log Type': 'Bash',
  'Machine ID': 'a9321',
  'command': 'ls',
  'file': '',
  'filename': '',
  'full_command': 'ls',
  'stderr': '',
  'stdout': 'data\nhome\nstderr.txt\nSuperShell',
  'time': datetime.datetime(2022, 9, 29, 19, 19)}]

In [31]:
## Convert to Pandas Dataframe 
pd_records = pd.DataFrame.from_records(raw_records)
pd_records.head()

Unnamed: 0,Log ID,Course,Log Type,Machine ID,command,file,filename,full_command,stderr,stdout,time
0,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,a9321,ls,,,ls,,data\nhome\nstderr.txt\nSuperShell,2022-09-29 19:19:00
1,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,a9321,ls,,.,ls .,,data\nhome\nstderr.txt\nSuperShell,2022-09-29 19:20:00
2,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,a9321,clear,,,clear,,[H[2J[3J,2022-09-29 19:20:00
3,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,a9321,cd,,,cd Bash,/home/jovyan/.offline.sh: line 354: cd: Bash: ...,,2022-09-29 19:24:00
4,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,a9321,cd,,,cd home,,,2022-09-29 19:24:00


In [33]:
print(max(pd_records['time']))
pd_records.info()

2022-10-14 00:23:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Log ID        27 non-null     object        
 1   Course        27 non-null     object        
 2   Log Type      27 non-null     object        
 3   Machine ID    27 non-null     object        
 4   command       27 non-null     object        
 5   file          27 non-null     object        
 6   filename      27 non-null     object        
 7   full_command  27 non-null     object        
 8   stderr        27 non-null     object        
 9   stdout        27 non-null     object        
 10  time          27 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(10)
memory usage: 2.4+ KB


## Build Command Records - JSON of Arrays 

In [35]:
high_level_json = {
    "Log ID": [],
    "Course": [],
    "Log Type": [], 
    "command": [],
    "file": [],
    "filename": [], 
    "full_command": [],
    "stderr": [],
    "stdout": [],
    "time": []
}
log_list = raw_json['logs']
for i in log_list:
    #Variables to keep at the highest level
    log_id = i['log_id']
    course = i['course_id']
    log_type = i['log_type']
    
    #Base unit is the series of commands. 
    for event in i['log']['commands']:
        #Build the common columns for all events
        high_level_json['Log ID'].append(log_id)
        high_level_json['Course'].append(course)
        high_level_json['Log Type'].append(log_type)
        
        for i in event: 
            high_level_json[i].append(event[i])
        
        #Note it is a little more difficult to convert these strings here.
        #You could do an if in the for i in event statement 
        #We will leverage Pandas datetime function to do this for us. 
        

#Look at the JSON
high_level_json

{'Log ID': ['6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353411be02043e60902e7',
  '3a15d2c

In [36]:
## Convert to Dataframe 
pd_from_json = pd.DataFrame.from_dict(high_level_json)
pd_from_json.head()

Unnamed: 0,Log ID,Course,Log Type,command,file,filename,full_command,stderr,stdout,time
0,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,ls,,,ls,,data\nhome\nstderr.txt\nSuperShell,2022-09-29:19:19
1,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,ls,,.,ls .,,data\nhome\nstderr.txt\nSuperShell,2022-09-29:19:20
2,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,clear,,,clear,,[H[2J[3J,2022-09-29:19:20
3,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,cd,,,cd Bash,/home/jovyan/.offline.sh: line 354: cd: Bash: ...,,2022-09-29:19:24
4,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,cd,,,cd home,,,2022-09-29:19:24


In [37]:
## Notice Types 
pd_from_json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Log ID        27 non-null     object
 1   Course        27 non-null     object
 2   Log Type      27 non-null     object
 3   command       27 non-null     object
 4   file          27 non-null     object
 5   filename      27 non-null     object
 6   full_command  27 non-null     object
 7   stderr        27 non-null     object
 8   stdout        27 non-null     object
 9   time          27 non-null     object
dtypes: object(10)
memory usage: 2.2+ KB


In [40]:
## Time is an object, we want these to be datetimes, can do this in pandas 
pd_from_json['Timestamp'] = pd.to_datetime(pd_from_json['time'], format="%Y-%m-%d:%H:%M")
print(max(pd_from_json['time']))
print(max(pd_from_json['Timestamp']))
pd_from_json.info()

2022-10-14:00:23
2022-10-14 00:23:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Log ID        27 non-null     object        
 1   Course        27 non-null     object        
 2   Log Type      27 non-null     object        
 3   command       27 non-null     object        
 4   file          27 non-null     object        
 5   filename      27 non-null     object        
 6   full_command  27 non-null     object        
 7   stderr        27 non-null     object        
 8   stdout        27 non-null     object        
 9   time          27 non-null     object        
 10  Timestamp     27 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(10)
memory usage: 2.4+ KB


## Build Command Records - DataFrame Hybrid 

In [58]:
base_frame = pd.DataFrame()
log_list = raw_json['logs']
for i in log_list:
    #Variables to keep at the highest level
    log_id = i['log_id']
    course = i['course_id']
    log_type = i['log_type']
    
    #Base unit is the series of commands. 
    for event in i['log']['commands']:
        
        #Build a dataframe for the smallest unit
        subframe = pd.DataFrame.from_records([event])
        
        #Add common fields to subframe 
        subframe['Log ID'] = log_id
        subframe['Course'] = course
        subframe['Log Type'] = log_type
        
        if base_frame.shape[0] == 0:
            base_frame = subframe
            # breaks out of for loop 
            # a nifty command
            continue 
            
        base_frame = pd.concat([base_frame, subframe])
        

base_frame.head()

Unnamed: 0,command,file,filename,full_command,stderr,stdout,time,Log ID,Course,Log Type
0,ls,,,ls,,data\nhome\nstderr.txt\nSuperShell,2022-09-29:19:19,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash
0,ls,,.,ls .,,data\nhome\nstderr.txt\nSuperShell,2022-09-29:19:20,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash
0,clear,,,clear,,[H[2J[3J,2022-09-29:19:20,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash
0,cd,,,cd Bash,/home/jovyan/.offline.sh: line 354: cd: Bash: ...,,2022-09-29:19:24,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash
0,cd,,,cd home,,,2022-09-29:19:24,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash


In [59]:
## Clean up again 
## Time is an object, we want these to be datetimes, can do this in pandas 
base_frame['Timestamp'] = pd.to_datetime(base_frame['time'], format="%Y-%m-%d:%H:%M")
base_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27 entries, 0 to 0
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   command       27 non-null     object        
 1   file          27 non-null     object        
 2   filename      27 non-null     object        
 3   full_command  27 non-null     object        
 4   stderr        27 non-null     object        
 5   stdout        27 non-null     object        
 6   time          27 non-null     object        
 7   Log ID        27 non-null     object        
 8   Course        27 non-null     object        
 9   Log Type      27 non-null     object        
 10  Timestamp     27 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(10)
memory usage: 2.5+ KB


# Export Dataframe

In [68]:
#This will allow you to manipulate what you export and not effect pd_records
to_export = pd_records.copy()

#Some column manipulation can be useful 
column_list = list(to_export.columns)

#You can reorder like so 
column_list.remove('time')
column_list = ["time"] + column_list
print(column_list)

#Rearrange columns, time is now the first column
to_export = to_export[column_list]
to_export.head()

['time', 'Log ID', 'Course', 'Log Type', 'command', 'file', 'filename', 'full_command', 'stderr', 'stdout']


Unnamed: 0,time,Log ID,Course,Log Type,command,file,filename,full_command,stderr,stdout
0,2022-09-29 19:19:00,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,ls,,,ls,,data\nhome\nstderr.txt\nSuperShell
1,2022-09-29 19:20:00,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,ls,,.,ls .,,data\nhome\nstderr.txt\nSuperShell
2,2022-09-29 19:20:00,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,clear,,,clear,,[H[2J[3J
3,2022-09-29 19:24:00,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,cd,,,cd Bash,/home/jovyan/.offline.sh: line 354: cd: Bash: ...,
4,2022-09-29 19:24:00,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,cd,,,cd home,,


In [69]:
# Make sure to add the suffix to the file name. You can also give path 
to_export.to_csv(output_dir + "response1.csv")

In [70]:
# If you are going to import to pandas later, I suggest pickle file 
# This will preserve types such as datetimes 
to_export.to_pickle(output_dir + "response1.pki")

# Read Back to Check 

In [71]:
reloaded = pd.read_pickle(output_dir + "response1.pki")
reloaded.head()

Unnamed: 0,time,Log ID,Course,Log Type,command,file,filename,full_command,stderr,stdout
0,2022-09-29 19:19:00,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,ls,,,ls,,data\nhome\nstderr.txt\nSuperShell
1,2022-09-29 19:20:00,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,ls,,.,ls .,,data\nhome\nstderr.txt\nSuperShell
2,2022-09-29 19:20:00,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,clear,,,clear,,[H[2J[3J
3,2022-09-29 19:24:00,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,cd,,,cd Bash,/home/jovyan/.offline.sh: line 354: cd: Bash: ...,
4,2022-09-29 19:24:00,6c940b1fbdd022d6db2d5840dfd7207c30a060b05b4353...,Cyverse-RNA-Tutorial,Bash,cd,,,cd home,,


In [72]:
reloaded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   time          27 non-null     datetime64[ns]
 1   Log ID        27 non-null     object        
 2   Course        27 non-null     object        
 3   Log Type      27 non-null     object        
 4   command       27 non-null     object        
 5   file          27 non-null     object        
 6   filename      27 non-null     object        
 7   full_command  27 non-null     object        
 8   stderr        27 non-null     object        
 9   stdout        27 non-null     object        
dtypes: datetime64[ns](1), object(9)
memory usage: 2.2+ KB
