## Github Analysis notebook
This notebook contains scripts to process github experiment results and plot them out (for single-threaded run)

In [3]:
import pandas as pd
import json
import zipfile

In [4]:
experimental_zip_path = '../results-general-dict.zip'

In [41]:
# load logic + pre-processing
def extract_per_file_stats_tuplex(job_stats):
        n_requests = len(job_stats['responses'])
        L =  [job_stats['responses'][i]['stats'] for i in range(n_requests)]
        for i in range(n_requests):
            req_uri = job_stats['responses'][i]['request'][0]
            L[i]['input_path'] = req_uri[:req_uri.rfind(':')]
            L[i]['req_uri'] = req_uri
            L[i]['duration'] = L[i]['request_total_time']
            L[i]['num_input_rows'] = L[i]['input']['total_input_row_count']
            L[i]['num_output_rows'] = L[i]['output']['normal']
        return L

def load_tuplex_path(fp):
    lines = fp.readlines()
    rows = [json.loads(line) for line in lines]

    data = []
    for row in rows:
        if 'python-baseline' not in path:
            ans = {'benchmark': 'github'}
            for k in ['benchmark', 'input_path', 'job_time_in_s', 'metrics', 'mode', 'options', 'output_path', 'scratch_path', 'startup_time_in_s']:
                ans[k] = row.get(k)
            try:
                ans['per_file_stats'] = extract_per_file_stats_tuplex(row['detailed_job_stats'])
            except:
                print('--- ERR: extract failed')
            row = ans
        data.append(row)
    return pd.DataFrame(data)

def load_tuplex_runs(zf):
    df_tplx = pd.DataFrame()
    for path in filter(lambda name: name.endswith('.ndjson'), zf.namelist()):
        print(f'loading {path}')
        df_tplx = pd.concat((df_tplx, load_tuplex_path(zf.open(path))))
    return df_tplx
        
def load_cc_baselines(zf):
    df_cc = pd.DataFrame()
    # the C++ baseline are directly stored as csv
    for path in filter(lambda name: name.endswith('.csv'), zf.namelist()):
        print(f'loading {path}')
        df_cc = pd.concat((df_cc, pd.read_csv(zf.open(path))))
    df_cc = df_cc[df_cc['mode'] != 'mode']

    for name in ['time_in_s', 'loading_time_in_s', 'total_time_in_s']:
        df_cc[name] = df_cc[name].astype(float)
    for name in ['input_row_count', 'output_row_count']:
        df_cc[name] = df_cc[name].astype(int)
    return df_cc

with zipfile.ZipFile(experimental_zip_path, 'r') as zf:
    # print(zf.namelist())
    df_cc = load_cc_baselines(zf)
   
    df_tplx = load_tuplex_runs(zf)
    
#     with myzip.open('document.txt') as myfile:
#         print(myfile.read())

loading results-general-dict/local-exp-llvm16/c++-baseline/github/best_results.csv
loading results-general-dict/local-exp-llvm16/c++-baseline/github/cjson_results.csv
loading results-general-dict/local-exp-llvm16/c++-baseline/github/yyjson_results.csv
loading results-general-dict/local-exp-llvm16/c++-baseline/github/cstruct_results.csv
loading results-general-dict/local-exp-llvm9/tuplex/github/hyper/results.ndjson
loading results-general-dict/local-exp-llvm9/tuplex/github/hyper-noopt/results.ndjson
loading results-general-dict/local-exp-llvm9/tuplex/github/nohyper/results.ndjson
loading results-general-dict/local-exp-llvm9/python-baseline/github/results.ndjson
--- ERR: extract failed
--- ERR: extract failed
--- ERR: extract failed
--- ERR: extract failed
loading results-general-dict/local-exp-llvm16/tuplex/github/hyper/results.ndjson
loading results-general-dict/local-exp-llvm16/tuplex/github/hyper-noopt/results.ndjson
loading results-general-dict/local-exp-llvm16/tuplex/github/nohyper

In [42]:
df_cc

Unnamed: 0,mode,input_path,output_path,time_in_s,loading_time_in_s,total_time_in_s,input_row_count,output_row_count
0,best,/hot/data/github_daily/2011-10-15.json,./local-exp/c++-baseline/github/best/output/pa...,0.060073,0.020326,21.773260,48899,1418
1,best,/hot/data/github_daily/2012-10-15.json,./local-exp/c++-baseline/github/best/output/pa...,0.193989,0.051573,21.773260,165692,5907
2,best,/hot/data/github_daily/2013-10-15.json,./local-exp/c++-baseline/github/best/output/pa...,0.411413,0.130098,21.773260,296456,10628
3,best,/hot/data/github_daily/2014-10-15.json,./local-exp/c++-baseline/github/best/output/pa...,1.542250,0.621012,21.773260,476393,16014
4,best,/hot/data/github_daily/2015-10-15.json,./local-exp/c++-baseline/github/best/output/pa...,1.385090,0.521614,21.773260,737714,25301
...,...,...,...,...,...,...,...,...
66,cstruct,/hot/data/github_daily/2017-10-15.json,./local-exp/c++-baseline/github/cstruct/output...,1.802470,0.529669,25.819719,910100,26107
67,cstruct,/hot/data/github_daily/2018-10-15.json,./local-exp/c++-baseline/github/cstruct/output...,3.494740,1.091120,25.819719,1522655,44404
68,cstruct,/hot/data/github_daily/2019-10-15.json,./local-exp/c++-baseline/github/cstruct/output...,5.122230,1.661090,25.819719,2134789,61337
69,cstruct,/hot/data/github_daily/2020-10-15.json,./local-exp/c++-baseline/github/cstruct/output...,7.403400,2.612160,25.819719,2963694,62354


In [43]:
df_tplx = df_tplx.reset_index().rename(columns={'index':'run', 'job_time_in_s':'total_time_in_s'})
df_tplx['run'] = df_tplx['run'] + 1
df_tplx.head()

Unnamed: 0,run,benchmark,input_path,total_time_in_s,metrics,mode,options,output_path,scratch_path,startup_time_in_s,per_file_stats
0,1,github,/hot/data/github_daily/*.json,171.377957,"{'generate_llvm_time_s': 0, 'llvm_compilation_...",tuplex,"{'tuplex.allowUndefinedBehavior': False, 'tupl...",./local-exp/tuplex/github/hyper/output,./local-exp/scratch,0.019958,"[{'hyper_active': True, 'input': {'fallback': ..."
1,2,github,/hot/data/github_daily/*.json,169.666286,"{'generate_llvm_time_s': 0, 'llvm_compilation_...",tuplex,"{'tuplex.allowUndefinedBehavior': False, 'tupl...",./local-exp/tuplex/github/hyper/output,./local-exp/scratch,0.020359,"[{'hyper_active': True, 'input': {'fallback': ..."
2,3,github,/hot/data/github_daily/*.json,170.050273,"{'generate_llvm_time_s': 0, 'llvm_compilation_...",tuplex,"{'tuplex.allowUndefinedBehavior': False, 'tupl...",./local-exp/tuplex/github/hyper/output,./local-exp/scratch,0.02002,"[{'hyper_active': True, 'input': {'fallback': ..."
3,4,github,/hot/data/github_daily/*.json,171.278456,"{'generate_llvm_time_s': 0, 'llvm_compilation_...",tuplex,"{'tuplex.allowUndefinedBehavior': False, 'tupl...",./local-exp/tuplex/github/hyper/output,./local-exp/scratch,0.020803,"[{'hyper_active': True, 'input': {'fallback': ..."
4,1,github,/hot/data/github_daily/*.json,170.569332,"{'generate_llvm_time_s': 0, 'llvm_compilation_...",tuplex,"{'tuplex.allowUndefinedBehavior': False, 'tupl...",./local-exp/tuplex/github/hyper-noopt/output,./local-exp/scratch,0.020239,"[{'hyper_active': True, 'input': {'fallback': ..."


In [27]:
df_cc.dtypes

mode                  object
input_path            object
output_path           object
time_in_s            float64
loading_time_in_s    float64
total_time_in_s      float64
input_row_count        int64
output_row_count       int64
dtype: object

In [27]:
!ls /home/leonhards/projects/tuplex-public/benchmarks/nextconf/hyperspecialization/github/local-exp/c++-baseline/github

best		  cjson		     cstruct		  yyjson
best_results.csv  cjson_results.csv  cstruct_results.csv  yyjson_results.csv


In [28]:
df = pd.DataFrame()

In [29]:
import glob

In [30]:
rpath = '/home/leonhards/projects/tuplex-public/benchmarks/nextconf/hyperspecialization/github/local-exp/c++-baseline/github'

In [31]:
for path in glob.glob(rpath + '/*.csv'):
    df = pd.concat((df, pd.read_csv(path)))

In [32]:
df.groupby('mode').mean(numeric_only=True)

best
cjson
cstruct
mode
yyjson


In [33]:
df['total_time_in_s'] = pd.to_numeric(df['total_time_in_s'], errors='coerce')

In [34]:
df.dropna().groupby('mode').mean()

  df.dropna().groupby('mode').mean()


Unnamed: 0_level_0,total_time_in_s
mode,Unnamed: 1_level_1
best,22.072186
cjson,155.714987
cstruct,25.833117
yyjson,73.006668


In [35]:
def load_tuplex():
    paths = [
             '/home/leonhards/projects/tuplex-public/benchmarks/nextconf/hyperspecialization/github/local-exp/python-baseline/github/results.ndjson',
             '/home/leonhards/projects/tuplex-public/benchmarks/nextconf/hyperspecialization/github/local-exp/tuplex/github/hyper/results.ndjson',
             '/home/leonhards/projects/tuplex-public/benchmarks/nextconf/hyperspecialization/github/local-exp/tuplex/github/hyper-noopt/results.ndjson',
             '/home/leonhards/projects/tuplex-public/benchmarks/nextconf/hyperspecialization/github/local-exp/tuplex/github/nohyper/results.ndjson',
            ]
    def extract_per_file_stats_tuplex(job_stats):
        n_requests = len(job_stats['responses'])
        L =  [job_stats['responses'][i]['stats'] for i in range(n_requests)]
        for i in range(n_requests):
            req_uri = job_stats['responses'][i]['request'][0]
            L[i]['input_path'] = req_uri[:req_uri.rfind(':')]
            L[i]['req_uri'] = req_uri
            L[i]['duration'] = L[i]['request_total_time']
            L[i]['num_input_rows'] = L[i]['input']['total_input_row_count']
            L[i]['num_output_rows'] = L[i]['output']['normal']
        return L
    data = []
    
    for path in paths:
        print(f'>>> processing {path}')
        with open(path) as fp:
            lines = fp.readlines()
            rows = [json.loads(line) for line in lines]
    
            print(len(lines))
            # row = rows[-1]
            for row in rows:
                if 'python-baseline' not in path:
                    ans = {'benchmark': 'github'}
                    for k in ['benchmark', 'input_path', 'job_time_in_s', 'metrics', 'mode', 'options', 'output_path', 'scratch_path', 'startup_time_in_s']:
                        ans[k] = row.get(k)
                    try:
                        ans['per_file_stats'] = extract_per_file_stats_tuplex(row['detailed_job_stats'])
                    except:
                        print('--- ERR: extract failed')
                    row = ans
                data.append(row)
                
            del row
            del rows
            del lines
    df = pd.DataFrame(data)        
    return df

In [36]:
df_tplx= load_tuplex()

>>> processing /home/leonhards/projects/tuplex-public/benchmarks/nextconf/hyperspecialization/github/local-exp/python-baseline/github/results.ndjson
5
>>> processing /home/leonhards/projects/tuplex-public/benchmarks/nextconf/hyperspecialization/github/local-exp/tuplex/github/hyper/results.ndjson
4
>>> processing /home/leonhards/projects/tuplex-public/benchmarks/nextconf/hyperspecialization/github/local-exp/tuplex/github/hyper-noopt/results.ndjson
4
>>> processing /home/leonhards/projects/tuplex-public/benchmarks/nextconf/hyperspecialization/github/local-exp/tuplex/github/nohyper/results.ndjson
4


In [37]:
df_tplx.groupby('output_path').mean()

  df_tplx.groupby('output_path').mean()


Unnamed: 0_level_0,job_time_in_s,startup_time_in_s,total_input_paths_size_in_bytes,total_input_rows,total_output_rows
output_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
./local-exp/python-baseline/github/output,138.595526,0.0,38076020000.0,11012665.0,294195.0
./local-exp/tuplex/github/hyper-noopt/output,170.382859,0.02013,,,
./local-exp/tuplex/github/hyper/output,171.051636,0.020238,,,
./local-exp/tuplex/github/nohyper/output,182.090354,0.020099,,,


In [38]:
df.dropna().groupby('mode').mean()

  df.dropna().groupby('mode').mean()


Unnamed: 0_level_0,total_time_in_s
mode,Unnamed: 1_level_1
best,22.072186
cjson,155.714987
cstruct,25.833117
yyjson,73.006668
