In [1]:
import json
import pandas as pd
from datetime import date


def read_log(log_path):
    """
    Opens the log file and transforms json to pandas dataframe
    and persisting data in a csv
    
    Args:
        log_path (str): path of input file
        
    Returns
        df : pandas dataframe
    """
    pair_list = []
    count = 0
    number_of_threads = 10
    list_job_run_df = []
    
    with open(log_path, 'r') as f:
        Lines = f.readlines()
    
    for line in Lines:
        line_json = json.loads(line)
        if "pair" in line_json:
            pair_list.append(line_json)
        else:
            count += 1
            pair_list.append(line_json)
            if count == 20:
                # covert line_json to df
                job_run_df = pd.DataFrame.from_records(pair_list)
                # add df to list_of_job_df
                list_job_run_df.append(job_run_df)
                # clear pair_list for next run
                pair_list = []
                count = 0
                
    # previous version df_pair_list
    # return len(list_job_run_df)
    return list_job_run_df


def calculate_thread_runtime(df_pair_list):
    """
    Calculates the runtime for thread grouped by MetacacheTread number
    
    Args:
        df_pairs_list (Dataframe) - df_pairs_list per run
        
    Returns
        df :  dataframe with original columns with calculated run_times
    """
    
    list_of_df = []

    for i in range(10):
        thread = f'MetacacheThread-{i}'
        a = df_pair_list[df_pair_list['thread']== thread]
        a['timestamp'] = pd.to_datetime(a['timestamp'], infer_datetime_format=True)
        a['run_time']=a['timestamp'].diff()
        a.loc[:, 'run_time'] = a.run_time.shift(-1)
        list_of_df.append(a)

    big_df = pd.concat(list_of_df)

    return big_df


def process_each_job(list_job_run_df):
    """
    iterates through list_job_run_df 
    - calculates runtime
    - adds column for run number

    Args:
        list_job_run_df (list(dataframes))

    Return
        dataframe with additional column denoting run number    
    """
    list_processed_df = []
    # iterate thru list_job_run_df
    for i in range(len(list_job_run_df)):
        # run calculate_thread_runtime(df)
        list_of_df = calculate_thread_runtime(list_job_run_df[i])
        # add column to df['run_number'] = i
        list_of_df['run_number'] = i
        list_processed_df.append(list_of_df)
    # combine all df
    all_df = pd.concat(list_processed_df)

    return all_df

In [2]:
list_job_run_df = read_log('input/metadata_update-extension.log')



In [5]:
# testing this!
all_df = process_each_job(list_job_run_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['timestamp'] = pd.to_datetime(a['timestamp'], infer_datetime_format=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['run_time']=a['timestamp'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a.loc[:, 'run_time'] = a.run_time.shift(-1)
  a.loc[:, 'run_time'] = a.run_time.shift(-1)
A v

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['run_time']=a['timestamp'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a.loc[:, 'run_time'] = a.run_time.shift(-1)
  a.loc[:, 'run_time'] = a.run_time.shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['timestamp'] = pd.to_datetime(a['timestamp'], infer_datetime_format=True)
A v

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['timestamp'] = pd.to_datetime(a['timestamp'], infer_datetime_format=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['run_time']=a['timestamp'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a.loc[:, 'run_time'] = a.run_time.shift(-1)
  a.loc[:, 'run_time'] = a.run_time.shift(-1)
A v

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['timestamp'] = pd.to_datetime(a['timestamp'], infer_datetime_format=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['run_time']=a['timestamp'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a.loc[:, 'run_time'] = a.run_time.shift(-1)
  a.loc[:, 'run_time'] = a.run_time.shift(-1)
A v

In [8]:
# testing this!
print(all_df[all_df['run_number']==2].head(20))

                               event                  logger level  \
5   no jobs left, thread terminating  compute_covidcast_meta  info   
13                   starting thread  compute_covidcast_meta  info   
19                     starting pair  compute_covidcast_meta  info   
4   no jobs left, thread terminating  compute_covidcast_meta  info   
14                   starting thread  compute_covidcast_meta  info   
15                   starting thread  compute_covidcast_meta  info   
21                     starting pair  compute_covidcast_meta  info   
1   no jobs left, thread terminating  compute_covidcast_meta  info   
16                   starting thread  compute_covidcast_meta  info   
20                     starting pair  compute_covidcast_meta  info   
6   no jobs left, thread terminating  compute_covidcast_meta  info   
17                   starting thread  compute_covidcast_meta  info   
18                   starting thread  compute_covidcast_meta  info   
22                  

In [9]:
all_df

Unnamed: 0,event,logger,level,timestamp,thread,pair,run_time,run_number,metadata_calculation_interval_in_seconds,metadata_update_interval_in_seconds,total_runtime_in_seconds
1,starting thread,compute_covidcast_meta,info,2022-01-01 04:00:37.116891+00:00,MetacacheThread-0,,0 days 00:00:00.004137,0,,,
9,starting pair,compute_covidcast_meta,info,2022-01-01 04:00:37.121028+00:00,MetacacheThread-0,"(chng, smoothed_adj_outpatient_covid)",0 days 00:10:24.751151,0,,,
388,starting pair,compute_covidcast_meta,info,2022-01-01 04:11:01.872179+00:00,MetacacheThread-0,"(indicator-combination, nmf_day_doc_fbs_ght)",0 days 00:00:03.934740,0,,,
389,starting pair,compute_covidcast_meta,info,2022-01-01 04:11:05.806919+00:00,MetacacheThread-0,"(jhu-csse, confirmed_7dav_cumulative_num)",0 days 00:02:19.613091,0,,,
397,starting pair,compute_covidcast_meta,info,2022-01-01 04:13:25.420010+00:00,MetacacheThread-0,"(jhu-csse, deaths_7dav_cumulative_num)",0 days 00:02:19.707665,0,,,
...,...,...,...,...,...,...,...,...,...,...,...
447,starting pair,compute_covidcast_meta,info,2022-01-31 04:52:22.577096+00:00,MetacacheThread-9,"(safegraph, median_home_dwell_time_7dav)",0 days 00:01:48.561806,39,,,
453,starting pair,compute_covidcast_meta,info,2022-01-31 04:54:11.138902+00:00,MetacacheThread-9,"(usa-facts, confirmed_7dav_cumulative_prop)",0 days 00:00:51.405843,39,,,
460,starting pair,compute_covidcast_meta,info,2022-01-31 04:55:02.544745+00:00,MetacacheThread-9,"(usa-facts, deaths_7dav_cumulative_num)",0 days 00:00:38.906423,39,,,
466,starting pair,compute_covidcast_meta,info,2022-01-31 04:55:41.451168+00:00,MetacacheThread-9,"(usa-facts, deaths_incidence_num)",0 days 00:00:46.730763,39,,,
