Skip to content

Commit

Permalink
Merge pull request #154 from IATI/master
Browse files Browse the repository at this point in the history
Merge master into live: Comprehensiveness budget-not-provided validation
  • Loading branch information
samuele-mattiuzzo authored Mar 26, 2019
2 parents eee5442 + 9df9019 commit bf39310
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 89 deletions.
2 changes: 1 addition & 1 deletion get_stats.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ set -e

mkdir gitout
for f in ckan gitdate; do
curl --compressed "http://dashboard.iatistandard.org/stats/${f}.json" > gitout/${f}.json
curl --compress "http://dashboard.iatistandard.org/stats/${f}.json" > gitout/${f}.json
done

mkdir stats-blacklist
Expand Down
34 changes: 22 additions & 12 deletions stats/dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -895,9 +895,19 @@ def forwardlooking_activities_with_budgets(self, date_code_runs=None):

@returns_numberdict
def forwardlooking_activities_with_budget_not_provided(self, date_code_runs=None):
"""
Number of activities with the budget_not_provided attribute for this year and the following 2 years.
Note: activities are excluded if they meet the logic in _forwardlooking_exclude_in_calculations()
Input:
date_code_runs -- a date object for when this code is run
Returns:
dictionary containing years with binary value if this activity is current and has the budget_not_provided attribute
"""
date_code_runs = date_code_runs if date_code_runs else self.now.date()
this_year = int(date_code_runs.year)
bnp = self._budget_not_provided() is not None
bnp = self._budget_not_provided() is not None
return {year: int(self._forwardlooking_is_current(year) and bnp > 0 and not bool(self._forwardlooking_exclude_in_calculations(year=year, date_code_runs=date_code_runs)))
for year in range(this_year, this_year+3)}

Expand Down Expand Up @@ -1033,9 +1043,8 @@ def is_text_in_element(elementName):
'transaction_currency': all_true_and_not_empty(x.xpath('value/@value-date') and x.xpath('../@default-currency|./value/@currency') for x in self.element.findall('transaction')),
'transaction_traceability': all_true_and_not_empty(x.xpath('provider-org/@provider-activity-id') for x in self.element.xpath('transaction[transaction-type/@code="{}"]'.format(self._incoming_funds_code()))) or
self._is_donor_publisher(),
'budget': (
self.element.findall('budget') or
self._budget_not_provided() is not None),
'budget': self.element.findall('budget'),
'budget_not_provided': self._budget_not_provided() is not None,
'contact-info': self.element.findall('contact-info/email'),
'location': self.element.xpath('location/point/pos|location/name|location/description|location/location-administrative'),
'location_point_pos': self.element.xpath('location/point/pos'),
Expand Down Expand Up @@ -1139,14 +1148,15 @@ def empty_or_percentage_sum_is_100(path, by_vocab=False):
),
'budget': (
bools['budget'] and
(all(
valid_date(budget.find('period-start')) and
valid_date(budget.find('period-end')) and
valid_date(budget.find('value')) and
valid_value(budget.find('value'))
for budget in bools['budget']) or
((len(self.element.findall('budget')) == 0) and
self._budget_not_provided() is not None))),
all(
valid_date(budget.find('period-start')) and
valid_date(budget.find('period-end')) and
valid_date(budget.find('value')) and
valid_value(budget.find('value'))
for budget in bools['budget'])),
'budget_not_provided': (
bools['budget_not_provided'] and
str(self._budget_not_provided()) in CODELISTS[self._major_version()]['BudgetNotProvided']),
'location_point_pos': all_true_and_not_empty(
valid_coords(x.text) for x in bools['location_point_pos']),
'sector_dac': (
Expand Down
37 changes: 11 additions & 26 deletions stats/tests/test_budget_not_provided.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
from lxml import etree

from collections import defaultdict, OrderedDict
from stats.common.decorators import *
from stats.dashboard import ActivityStats

class MockActivityStats(ActivityStats):
def __init__(self, major_version='2'):
self.major_version = major_version
return super(MockActivityStats, self).__init__()

def _major_version(self):
return self.major_version

def test_budget_not_provided_works():
activity_stats = ActivityStats()
activity_stats = MockActivityStats()
activity_stats.element = etree.fromstring('''
<iati-activity budget-not-provided="1">
</iati-activity>
Expand All @@ -14,7 +20,7 @@ def test_budget_not_provided_works():


def test_budget_not_provided_fails():
activity_stats = ActivityStats()
activity_stats = MockActivityStats()
activity_stats.element = etree.fromstring('''
<iati-activity>
</iati-activity>
Expand All @@ -23,30 +29,9 @@ def test_budget_not_provided_fails():


def test_budget_validation_bools():
activity_stats = ActivityStats()
activity_stats = MockActivityStats()
activity_stats.element = etree.fromstring('''
<iati-activity budget-not-provided="3">
</iati-activity>
''')
assert (len(activity_stats.element.findall('budget')) == 0)



class CommonSharedElements(object):
blank = False


class ActivityStats(CommonSharedElements):
""" Stats calculated on a single iati-activity. """
element = None
blank = False
strict = False # (Setting this to true will ignore values that don't follow the schema)
context = ''
comprehensiveness_current_activity_status = None
now = datetime.datetime.now()

def _budget_not_provided(self):
if self.element.attrib.get('budget-not-provided'):
return int(self.element.attrib.get('budget-not-provided'))
else:
return None
6 changes: 6 additions & 0 deletions stats/tests/test_comprehensiveness.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ def test_comprehensiveness_empty(major_version):
'transaction_currency': 0,
'transaction_traceability': 0,
'budget': 0,
'budget_not_provided': 0,
'contact-info': 0,
'location': 0,
'location_point_pos': 0,
Expand Down Expand Up @@ -319,6 +320,7 @@ def test_comprehensiveness_full(major_version):
'transaction_currency': 1,
'transaction_traceability': 1,
'budget': 1,
'budget_not_provided': 0,
'contact-info': 1,
'location': 1,
'location_point_pos': 1,
Expand Down Expand Up @@ -393,6 +395,7 @@ def test_comprehensiveness_other_passes(major_version):
'transaction_currency': 1,
'transaction_traceability': 0,
'budget': 0,
'budget_not_provided': 0,
'contact-info': 0,
'location': 0,
'location_point_pos': 0,
Expand Down Expand Up @@ -1514,6 +1517,7 @@ def test_comprehensiveness_dac_sector_codes_v2(major_version):
'transaction_currency': 1,
'transaction_traceability': 1,
'budget': 1,
'budget_not_provided': 0,
'contact-info': 1,
'location': 1,
'location_point_pos': 1,
Expand Down Expand Up @@ -1605,6 +1609,7 @@ def test_comprehensiveness_dac_sector_codes_v2_incomplete(major_version):
'transaction_currency': 1,
'transaction_traceability': 1,
'budget': 1,
'budget_not_provided': 0,
'contact-info': 1,
'location': 1,
'location_point_pos': 1,
Expand Down Expand Up @@ -1689,6 +1694,7 @@ def test_comprehensiveness_v1_returns_false(major_version):
'transaction_currency': 1,
'transaction_traceability': 1,
'budget': 1,
'budget_not_provided': 0,
'contact-info': 1,
'location': 1,
'location_point_pos': 1,
Expand Down
43 changes: 21 additions & 22 deletions statsrunner/gitaggregate-publisher.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@

GITOUT_DIR = os.environ.get('GITOUT_DIR') or 'gitout'

# Only aggregate certain json stats files at publisher level
# These should be small stats files that will not consume large amounts of
# Only aggregate certain json stats files at publisher level
# These should be small stats files that will not consume large amounts of
# memory/disk space if aggregated over time
whitelisted_stats_files = [
whitelisted_stats_files = [
'activities',
'activity_files',
'bottom_hierarchy',
Expand All @@ -29,56 +29,55 @@
'latest_transaction_date',
'transaction_dates_hash',
'most_recent_transaction_date'
]
]

# Set bool if the 'dated' argument has been used in calling this script
dated = len(sys.argv) > 1 and sys.argv[1] == 'dated'

# Load the reference of commits to dates
# Load the reference of commits to dates
if dated:
gitdates = json.load(open('gitdate.json'))

# Loop over folders in the 'commits' directory
# Variable commit will be the commit hash
for commit in os.listdir(os.path.join(GITOUT_DIR, 'commits')):
print "gitaggregate-publisher for commit {}".format(commit)

for publisher in os.listdir(os.path.join(GITOUT_DIR, 'commits', commit, 'aggregated-publisher')):
print "{0} Currently looping over publisher {1}".format(str(datetime.datetime.now()), publisher)

# Set output directory for this publisher and attempt to make the directory. Pass if it already exists
git_out_dir = os.path.join(GITOUT_DIR,'gitaggregate-publisher-dated' if dated else 'gitaggregate-publisher', publisher)
git_out_dir = os.path.join(GITOUT_DIR, 'gitaggregate-publisher-dated' if dated else 'gitaggregate-publisher', publisher)
try:
os.makedirs(git_out_dir)
except OSError:
pass

# Set an output dictionary for this publisher
total = defaultdict(dict)

if os.path.isdir(git_out_dir):
# Loop over the existing files in the output directory for this publisher and load them into the 'total' dictionary
for fname in os.listdir(git_out_dir):
if fname.endswith('.json'):
with open(os.path.join(git_out_dir, fname)) as fp:
total[fname[:-5]] = json.load(fp, parse_float=decimal.Decimal)
with open(os.path.join(git_out_dir, fname)) as filepath:
total[fname[:-5]] = json.load(filepath, parse_float=decimal.Decimal)

# Loop over the whitelisted stats files and add current values to the 'total' dictionary
for statname in whitelisted_stats_files:
path = os.path.join(GITOUT_DIR, 'commits', commit, 'aggregated-publisher', publisher, statname+'.json')
if os.path.isfile(path):
with open(path) as fp:
k = statname
if not commit in total[k]:
v = json.load(fp, parse_float=decimal.Decimal)
with open(path) as filepath:
if commit not in total[statname]:
statfile = json.load(filepath, parse_float=decimal.Decimal)
if dated:
if commit in gitdates:
total[k][gitdates[commit]] = v
total[statname][gitdates[commit]] = statfile
else:
total[k][commit] = v
total[statname][commit] = statfile

# Write data from the 'total' dictionary to a temporary file, then rename
for k,v in total.items():
with open(os.path.join(git_out_dir, k+'.json.new'), 'w') as fp:
json.dump(v, fp, sort_keys=True, indent=2, default=decimal_default)
os.rename(os.path.join(git_out_dir, k+'.json.new'), os.path.join(git_out_dir, k+'.json'))
for statname, statfile in total.items():
with open(os.path.join(git_out_dir, statname + '.json.new'), 'w') as filepath:
json.dump(statfile, filepath, sort_keys=True, indent=2, default=decimal_default)
os.rename(os.path.join(git_out_dir, statname + '.json.new'), os.path.join(git_out_dir, statname+'.json'))
55 changes: 27 additions & 28 deletions statsrunner/gitaggregate.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from collections import defaultdict
from common import decimal_default
import decimal
import json
import os
import os
import sys
from common import decimal_default

# Set value for the gitout directory
GITOUT_DIR = os.environ.get('GITOUT_DIR') or 'gitout'
Expand All @@ -14,8 +13,8 @@
git_out_dir = os.path.join(GITOUT_DIR, 'gitaggregate-dated' if dated else 'gitaggregate')

# Exclude some json stats files from being aggregated
# These are typically the largest stats files that would consume large amounts of
# memory/disk space if aggregated over time
# These are typically the largest stats files that would consume large amounts
# of memory/disk space if aggregated over time
whitelisted_stats_files = [
'activities',
'activity_files',
Expand All @@ -31,10 +30,10 @@
'unique_identifiers',
'validation',
'versions',
'teststat' # Extra 'stat' added as the test_gitaggregate.py assumes a file with this name is present
]
'teststat' # Extra 'stat' added as the test_gitaggregate.py assumes a file with this name is present
]

# Load the reference of commits to dates
# Load the reference of commits to dates
if dated:
gitdates = json.load(open('gitdate.json'))

Expand All @@ -54,38 +53,38 @@
for fname in os.listdir(os.path.join(GITOUT_DIR, 'commits', commit, 'aggregated')):
if not fname.endswith('.json'):
continue
k = fname[:-5] # remove '.json' from the filename

trimmed_name = fname[:-5] # remove '.json' from the filename
# Ignore certain files
if k not in whitelisted_stats_files:
continue
if trimmed_name not in whitelisted_stats_files:
continue

print 'Adding to {} for file: {}'.format('gitaggregate-dated' if dated else 'gitaggregate', fname)

commit_json_fname = os.path.join(GITOUT_DIR, 'commits', commit, 'aggregated', fname)

# Load the current file contents into memory, or set as an empty dictionary
if fname in git_out_files:
# FIXME: This is a possible cause of a memory issue in future, as the size of the aggregate file
# increases each time there is a new commit
with open(os.path.join(git_out_dir, fname)) as fp:
v = json.load(fp, parse_float=decimal.Decimal)
with open(os.path.join(git_out_dir, fname)) as filepath:
gitaggregate_json = json.load(filepath, parse_float=decimal.Decimal)
else:
v = {}
gitaggregate_json = {}

# If the commit that we are looping over is not already in the data for this file, then add it to the output
if not commit in v:
with open(commit_json_fname) as fp2:
v2 = json.load(fp2, parse_float=decimal.Decimal)
if commit not in gitaggregate_json:
with open(commit_json_fname) as commit_filepath:
commit_gitaggregate_json = json.load(commit_filepath, parse_float=decimal.Decimal)
if dated:
if commit in gitdates:
v[gitdates[commit]] = v2
gitaggregate_json[gitdates[commit]] = commit_gitaggregate_json
else:
v[commit] = v2
gitaggregate_json[commit] = commit_gitaggregate_json

# Write output to a temporary file, then rename
with open(os.path.join(git_out_dir, k+'.json.new'), 'w') as fp:
print 'Writing data to {}'.format(k)
json.dump(v, fp, sort_keys=True, indent=2, default=decimal_default)
print 'Renaming file {} to {}'.format(k+'.json.new', k+'.json')
os.rename(os.path.join(git_out_dir, k+'.json.new'), os.path.join(git_out_dir, k+'.json'))
with open(os.path.join(git_out_dir, trimmed_name + '.json.new'), 'w') as filepath:
print 'Writing data to {}'.format(trimmed_name)
json.dump(gitaggregate_json, filepath, sort_keys=True, indent=2, default=decimal_default)
print 'Renaming file {} to {}'.format(trimmed_name + '.json.new', trimmed_name + '.json')
os.rename(os.path.join(git_out_dir, trimmed_name + '.json.new'), os.path.join(git_out_dir, trimmed_name + '.json'))

0 comments on commit bf39310

Please sign in to comment.