Commit

Merge 909f7c6 into e6bffcb

Ocre42 committed Mar 19, 2019
2 parents e6bffcb + 909f7c6 commit ab452c0
Showing 2 changed files with 48 additions and 50 deletions.
43 changes: 21 additions & 22 deletions statsrunner/gitaggregate-publisher.py
@@ -9,10 +9,10 @@

GITOUT_DIR = os.environ.get('GITOUT_DIR') or 'gitout'

# Only aggregate certain json stats files at publisher level
# These should be small stats files that will not consume large amounts of
# memory/disk space if aggregated over time
whitelisted_stats_files = [
    'activities',
    'activity_files',
    'bottom_hierarchy',
@@ -29,56 +29,55 @@
    'latest_transaction_date',
    'transaction_dates_hash',
    'most_recent_transaction_date'
]

# Set bool if the 'dated' argument has been used in calling this script
dated = len(sys.argv) > 1 and sys.argv[1] == 'dated'

# Load the reference of commits to dates
if dated:
    gitdates = json.load(open('gitdate.json'))

# Loop over folders in the 'commits' directory
# Variable commit will be the commit hash
for commit in os.listdir(os.path.join(GITOUT_DIR, 'commits')):
    print "gitaggregate-publisher for commit {}".format(commit)

    for publisher in os.listdir(os.path.join(GITOUT_DIR, 'commits', commit, 'aggregated-publisher')):
        print "{0} Currently looping over publisher {1}".format(str(datetime.datetime.now()), publisher)

        # Set output directory for this publisher and attempt to make the directory. Pass if it already exists
-        git_out_dir = os.path.join(GITOUT_DIR,'gitaggregate-publisher-dated' if dated else 'gitaggregate-publisher', publisher)
+        git_out_dir = os.path.join(GITOUT_DIR, 'gitaggregate-publisher-dated' if dated else 'gitaggregate-publisher', publisher)
        try:
            os.makedirs(git_out_dir)
        except OSError:
            pass

        # Set an output dictionary for this publisher
        total = defaultdict(dict)

        if os.path.isdir(git_out_dir):
            # Loop over the existing files in the output directory for this publisher and load them into the 'total' dictionary
            for fname in os.listdir(git_out_dir):
                if fname.endswith('.json'):
-                    with open(os.path.join(git_out_dir, fname)) as fp:
-                        total[fname[:-5]] = json.load(fp, parse_float=decimal.Decimal)
+                    with open(os.path.join(git_out_dir, fname)) as filepath:
+                        total[fname[:-5]] = json.load(filepath, parse_float=decimal.Decimal)

        # Loop over the whitelisted stats files and add current values to the 'total' dictionary
        for statname in whitelisted_stats_files:
            path = os.path.join(GITOUT_DIR, 'commits', commit, 'aggregated-publisher', publisher, statname+'.json')
            if os.path.isfile(path):
-                with open(path) as fp:
-                    k = statname
-                    if not commit in total[k]:
-                        v = json.load(fp, parse_float=decimal.Decimal)
+                with open(path) as filepath:
+                    if commit not in total[statname]:
+                        statfile = json.load(filepath, parse_float=decimal.Decimal)
                        if dated:
                            if commit in gitdates:
-                                total[k][gitdates[commit]] = v
+                                total[statname][gitdates[commit]] = statfile
                        else:
-                            total[k][commit] = v
+                            total[statname][commit] = statfile

        # Write data from the 'total' dictionary to a temporary file, then rename
-        for k,v in total.items():
-            with open(os.path.join(git_out_dir, k+'.json.new'), 'w') as fp:
-                json.dump(v, fp, sort_keys=True, indent=2, default=decimal_default)
-            os.rename(os.path.join(git_out_dir, k+'.json.new'), os.path.join(git_out_dir, k+'.json'))
+        for statname, statfile in total.items():
+            with open(os.path.join(git_out_dir, statname + '.json.new'), 'w') as filepath:
+                json.dump(statfile, filepath, sort_keys=True, indent=2, default=decimal_default)
+            os.rename(os.path.join(git_out_dir, statname + '.json.new'), os.path.join(git_out_dir, statname+'.json'))
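
Both scripts persist each aggregate by writing to a temporary '<name>.json.new' file and then renaming it over '<name>.json'. The following is a minimal, self-contained sketch of that write-then-rename pattern; the function name and arguments are illustrative, not from the repository:

import json
import os


def write_json_atomically(out_dir, name, data):
    # Illustrative helper, not part of the repository. Dump `data` to
    # '<name>.json.new', then rename it over '<name>.json'. On POSIX,
    # os.rename() replaces the destination in one step, so a reader never
    # sees a half-written stats file even if this process dies mid-dump.
    tmp_path = os.path.join(out_dir, name + '.json.new')
    final_path = os.path.join(out_dir, name + '.json')
    with open(tmp_path, 'w') as fp:
        json.dump(data, fp, sort_keys=True, indent=2)
    os.rename(tmp_path, final_path)


write_json_atomically('.', 'example', {'activities': 123})

Note that os.rename raises OSError on Windows when the destination already exists; a portable version would use os.replace (Python 3.3+). The scripts here assume POSIX semantics.
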
55 changes: 27 additions & 28 deletions statsrunner/gitaggregate.py
@@ -1,9 +1,8 @@
from collections import defaultdict
+from common import decimal_default
import decimal
import json
import os
import sys
-from common import decimal_default

# Set value for the gitout directory
GITOUT_DIR = os.environ.get('GITOUT_DIR') or 'gitout'
Expand All @@ -14,8 +13,8 @@
git_out_dir = os.path.join(GITOUT_DIR, 'gitaggregate-dated' if dated else 'gitaggregate')

# Exclude some json stats files from being aggregated
-# These are typically the largest stats files that would consume large amounts of
-# memory/disk space if aggregated over time
+# These are typically the largest stats files that would consume large amounts
+# of memory/disk space if aggregated over time
whitelisted_stats_files = [
    'activities',
    'activity_files',
@@ -31,10 +30,10 @@
    'unique_identifiers',
    'validation',
    'versions',
-    'teststat' # Extra 'stat' added as the test_gitaggregate.py assumes a file with this name is present
-]
+    'teststat'  # Extra 'stat' added as the test_gitaggregate.py assumes a file with this name is present
+]

# Load the reference of commits to dates
if dated:
    gitdates = json.load(open('gitdate.json'))

@@ -54,38 +53,38 @@
    for fname in os.listdir(os.path.join(GITOUT_DIR, 'commits', commit, 'aggregated')):
        if not fname.endswith('.json'):
            continue
-        k = fname[:-5] # remove '.json' from the filename
+
+        trimmed_name = fname[:-5]  # remove '.json' from the filename
        # Ignore certain files
-        if k not in whitelisted_stats_files:
-            continue
+        if trimmed_name not in whitelisted_stats_files:
+            continue

        print 'Adding to {} for file: {}'.format('gitaggregate-dated' if dated else 'gitaggregate', fname)

        commit_json_fname = os.path.join(GITOUT_DIR, 'commits', commit, 'aggregated', fname)

        # Load the current file contents into memory, or set as an empty dictionary
        if fname in git_out_files:
            # FIXME: This is a possible cause of a memory issue in future, as the size of the aggregate file
            # increases each time there is a new commit
-            with open(os.path.join(git_out_dir, fname)) as fp:
-                v = json.load(fp, parse_float=decimal.Decimal)
+            with open(os.path.join(git_out_dir, fname)) as filepath:
+                gitaggregate_json = json.load(filepath, parse_float=decimal.Decimal)
        else:
-            v = {}
+            gitaggregate_json = {}

        # If the commit that we are looping over is not already in the data for this file, then add it to the output
-        if not commit in v:
-            with open(commit_json_fname) as fp2:
-                v2 = json.load(fp2, parse_float=decimal.Decimal)
+        if commit not in gitaggregate_json:
+            with open(commit_json_fname) as commit_filepath:
+                commit_gitaggregate_json = json.load(commit_filepath, parse_float=decimal.Decimal)
            if dated:
                if commit in gitdates:
-                    v[gitdates[commit]] = v2
+                    gitaggregate_json[gitdates[commit]] = commit_gitaggregate_json
            else:
-                v[commit] = v2
+                gitaggregate_json[commit] = commit_gitaggregate_json

            # Write output to a temporary file, then rename
-            with open(os.path.join(git_out_dir, k+'.json.new'), 'w') as fp:
-                print 'Writing data to {}'.format(k)
-                json.dump(v, fp, sort_keys=True, indent=2, default=decimal_default)
-            print 'Renaming file {} to {}'.format(k+'.json.new', k+'.json')
-            os.rename(os.path.join(git_out_dir, k+'.json.new'), os.path.join(git_out_dir, k+'.json'))
+            with open(os.path.join(git_out_dir, trimmed_name + '.json.new'), 'w') as filepath:
+                print 'Writing data to {}'.format(trimmed_name)
+                json.dump(gitaggregate_json, filepath, sort_keys=True, indent=2, default=decimal_default)
+            print 'Renaming file {} to {}'.format(trimmed_name + '.json.new', trimmed_name + '.json')
+            os.rename(os.path.join(git_out_dir, trimmed_name + '.json.new'), os.path.join(git_out_dir, trimmed_name + '.json'))
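
Both files round-trip JSON numbers through decimal.Decimal: they parse with parse_float=decimal.Decimal and serialise with default=decimal_default, imported from common. common.py is not part of this diff, so the helper below is only a plausible sketch of what such a hook looks like, not the repository's actual code:

import decimal
import json


def decimal_default(obj):
    # Plausible sketch only; the real helper lives in statsrunner/common.py.
    # json.dump() calls this hook for objects it cannot serialise natively;
    # converting Decimal back to float keeps the output a plain JSON number.
    if isinstance(obj, decimal.Decimal):
        return float(obj)
    raise TypeError('{!r} is not JSON serializable'.format(obj))


stats = json.loads('{"coverage": 0.1}', parse_float=decimal.Decimal)
print(json.dumps(stats, default=decimal_default))  # {"coverage": 0.1}

Parsing floats as Decimal avoids accumulating binary floating-point error when the same values are re-read and re-written on every aggregation run.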

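In 'dated' mode both scripts key each aggregate by commit date rather than commit hash, using the hash-to-date mapping loaded from gitdate.json; a commit missing from that mapping is silently skipped. A small sketch of that branch, with made-up data:

import json

# Hypothetical contents of gitdate.json: commit hash -> ISO date.
gitdates = {'909f7c6': '2019-03-19'}

total = {'activities': {}}
commit, value = '909f7c6', 42

# Mirrors the scripts' dated branch: no date entry means the commit is skipped.
if commit in gitdates:
    total['activities'][gitdates[commit]] = value

print(json.dumps(total, sort_keys=True))  # {"activities": {"2019-03-19": 42}}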