# Reformat the SC18 Mira dataset CSV

In [None]:
%matplotlib inline

In [None]:
import datetime
import pytz
import pandas

In [None]:
# Re-summarized dataset; column dtypes are inferred by pandas.
new_df = pandas.read_csv('mira-summaries_2017-02-14_2018-02-15.csv.gz')
# Original dataset; every column is read as a string so that values keep
# their exact on-disk text (float_precision='high' was the alternative tried).
orig_df = pandas.read_csv(
    'alcf-tokio-results-2_14_17-2_15_18.csv.gz',
    dtype=str,
)

In [None]:
print("Which columns are inconsistent?\n")

# Columns that appear in only one of the two frames, reported in the order
# they occur in the frame that has them.
only_in_orig = [col for col in orig_df.columns if col not in new_df.columns]
only_in_new = [col for col in new_df.columns if col not in orig_df.columns]

for col in only_in_orig:
    print("%s in orig_df; not in new_df" % col)

for col in only_in_new:
    print("%s in new_df; not in orig_df" % col)

The original CSV stores its timestamps as strings expressed in Chicago local time.

In [None]:
# Show the first few `_datetime_start` values from the original CSV.
orig_df['_datetime_start'].head()

The new CSV's epoch timestamps were generated without accounting for the fact that the date strings in the original CSV are expressed in a different time zone than the one used by the system that performed the conversion. As a result, the timestamps are off by a few hours.

In [None]:
# Show the first few `_datetime_start` values from the new CSV for comparison.
new_df['_datetime_start'].head()

This is how we localize the string-based timestamps and convert them into true (UTC-based) epoch timestamps.

In [None]:
# Demonstrate the conversion on the first few rows: parse the naive string,
# attach the Chicago time zone, then print the string next to its epoch value.
tz_chicago = pytz.timezone("America/Chicago")
for raw_stamp in orig_df['_datetime_start'].head():
    naive_dt = datetime.datetime.strptime(raw_stamp, "%Y-%m-%d %H:%M:%S")
    epoch_seconds = int(tz_chicago.localize(naive_dt).timestamp())
    print(raw_stamp, epoch_seconds)

In [None]:
# Compare a single row (position `iloc`) of the original frame against the
# same row of the new frame, key by key, and tally how many keys match.
#
# NOTE(review): orig_df is loaded with dtype=str while new_df uses inferred
# dtypes, so numeric columns compare a string against a number here and are
# reported as differing -- confirm this is the intended check.
iloc = 0
ndiffrows = 0
nsamerows = 0
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# equivalent that works on both old and new pandas releases.
for key, val in orig_df.iloc[iloc].items():
    if key not in new_df.columns:
        print("%s does not exist" % key)
        continue
    # Keys with a leading underscore are skipped; startswith() is also safe
    # for an empty column name, unlike key[0].
    if key.startswith("_"):
        continue

    new_val = new_df[key].iloc[iloc]
    if val != new_val:
        print("%s differs (orig=%s, new=%s)" % (key, val, new_val))
        ndiffrows += 1
    else:
        nsamerows += 1

print("%d keys differ" % ndiffrows)
print("%d keys same" % nsamerows)

In [None]:
tz_chicago = pytz.timezone("America/Chicago")

# For each timestamp column, parse every string as a naive datetime, localize
# it to Chicago time and keep the resulting integer epoch seconds.
new_vals = {}
for col_name in ('_datetime_start', '_datetime_end'):
    new_vals[col_name] = [
        int(
            tz_chicago.localize(
                datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
            ).timestamp()
        )
        for stamp in orig_df[col_name].values
    ]

In [None]:
# Overwrite the string timestamp columns of orig_df with the epoch values
# computed above (modifies orig_df in place).
for name, epochs in new_vals.items():
    orig_df[name] = epochs

In [None]:
# Write the corrected frame to disk without the 'Unnamed: 0' and 'index'
# columns (presumably leftover index columns from earlier exports -- verify).
(
    orig_df
    .drop(columns=['Unnamed: 0', 'index'])
    .to_csv('alcf_fixed.csv')
)

In [None]:
# Re-read the original CSV with inferred dtypes; this rebinds orig_df, so the
# string-typed frame used by the cells above is no longer available.
orig_df = pandas.read_csv('alcf-tokio-results-2_14_17-2_15_18.csv.gz')
# Read back the file written above, using its first column as the index.
fixed_df = pandas.read_csv(
    'alcf_fixed.csv',
    index_col=0,
)

In [None]:
# Verify the rewritten CSV column by column against the original frame.
# For every column with at least one mismatching row, print the column name
# and the mismatch count. For columns whose name does not start with an
# underscore, also print each row where the values differ and neither side
# is NaN (NaN never compares equal to NaN, so those rows would be noise).
for col in fixed_df.columns:
    orig_col = orig_df[col]
    fixed_col = fixed_df[col]

    mismatch = orig_col != fixed_col
    if not mismatch.any():
        continue
    print(col, mismatch.sum())

    # startswith() is also safe for an empty column name, unlike col[0].
    if col.startswith("_"):
        continue

    # Build the mask once instead of doing two .loc lookups per row.
    # Series.items() replaces Series.iteritems(), which was removed in
    # pandas 2.0.
    real_diff = mismatch & orig_col.notna() & fixed_col.notna()
    for index, value in orig_col[real_diff].items():
        print(index, value, fixed_col.loc[index])

In [None]:
# Element-wise difference between the original and re-read values of one
# column, to inspect how large any discrepancy is.
orig_df['darshan_agg_perf_by_slowest_posix'] - fixed_df['darshan_agg_perf_by_slowest_posix']

In [None]:
import numpy

In [None]:
# NaN does not compare equal to itself, so this evaluates to False; this is
# why the comparison loop above has to skip NaN values explicitly.
numpy.nan == numpy.nan