In [None]:
from src.utils import *

# --- Report settings ---------------------------------------------------
# Inclusive date range to report on.
start_date = '2017-10-01'
end_date = '2017-10-31'

# Restrict the printed table to a single HPO; leave as '' to print all.
hpo_of_interest = ''

# Bin size for the report.  One of: day, week, month
interval = 'month'

# --- Input data --------------------------------------------------------
# TODO: Document SQL query used to generate values in dates.txt
lists = sql_to_lists('../../data/20171106_dates_all.txt')

# The first item is treated as the header row; every remaining item
# is treated as a data row.
headers, rows = lists[0], lists[1:]

'''
This is the technical, precise definition of "full participant", per
https://github.com/all-of-us/raw-data-repository/blob/08ef0e9ea9ee0ad2c7e19d6748371c775bb1d63a/rest-api/dao/participant_summary_dao.py#L39

 CASE WHEN (consent_for_study_enrollment = :submitted
                   AND consent_for_electronic_health_records = :submitted
                   AND num_completed_baseline_ppi_modules = :num_baseline_ppi_modules
                   AND physical_measurements_status = :completed
                   AND samples_to_isolate_dna = :received)
             THEN :full_participant
'''

# Count of participants who reached "full participant"
# status on a given day, keyed by HPO and then by date.
#
# For example (not real data):
# {
#    'PITT': {
#       '2017-10-01': 50,
#       '2017-10-02': 53,
#       ...
#    },
#    ...
# }
counts_by_site_per_day = {}

for row_columns in rows:
    # Each item in "rows" represents a participant -- particularly,
    # the set of dates when that participant completed a specified event
    # in the enrollment lifecycle
    hpo = row_columns[0]

    # Column layout of the dates used below:
    #
    #   [2]  consent_for_study_enrollment_time
    #   [3]  consent_for_electronic_health_records_time
    #   [4]  physical_measurements_time
    #
    #   These correspond to "num_completed_baseline_ppi_modules":
    #   [5]  questionnaire_on_the_basics
    #   [6]  questionnaire_on_overall_health_time
    #   [7]  questionnaire_on_lifestyle_time
    #
    #   We assume these three samples comprise "samples_to_isolate_dna":
    #   [11] sample_status_1ed04_time  (EDTA DNA 4 mL)
    #   [12] sample_status_1ed10_time  (1st EDTA DNA 10 mL)
    #   [13] sample_status_2ed10_time  (2nd EDTA DNA 10 mL)

    # Gather all the dates of each lifecycle phase that needs to be
    # passed in order to become a full participant
    dates = row_columns[2:8] + row_columns[11:14]

    # The latest of those dates is the day the participant became "full".
    # Values are compared as-is, so this presumably relies on ISO-style
    # 'YYYY-MM-DD' strings sorting chronologically -- TODO confirm.
    # NOTE(review): if a phase date can be blank for participants who have
    # not completed that phase, such rows are still counted here -- confirm
    # that the input only contains full participants.
    most_recent_date = max(dates)

    # Increment by 1 the number of full participants enrolled
    # in this HPO (i.e. high-level recruitment origin, "site")
    # on this date, creating the HPO and date entries on first sight.
    counted_days = counts_by_site_per_day.setdefault(hpo, {})
    counted_days[most_recent_date] = counted_days.get(most_recent_date, 0) + 1

# print(counts_by_site_per_day)

# Dates in *dates.txt seem to be past-shifted by one day
# relative to those in Dashboard and HealthPro.
# The latter set seems to be correct -- e.g. weekends are 0-filled --
# so the requested end date is moved back by one day here.
# See also later uses of "dates[:-1]", dates[1:], etc.
shifted_end_date = previous_date(end_date)

# Keep only the dates inside the requested range.  Any date in the range
# that has no recorded count for an HPO is filled in with 0.
all_dates = date_range(start_date, shifted_end_date)
truncated_counts = {
    hpo: {date: daily_counts.get(date, 0) for date in all_dates}
    for hpo, daily_counts in counts_by_site_per_day.items()
}
counts_by_site_per_day = truncated_counts

# Roll up days into weekly or monthly bins, if requested.
#
# Result: counts_by_site_per_interval maps each HPO to a dict of
# {bin label: count}.  For 'day' the daily counts are used unchanged.

# Fail loudly on an unsupported interval; otherwise every HPO would
# silently end up with an empty set of bins.
if interval not in ('day', 'week', 'month'):
    raise ValueError(
        "interval must be one of 'day', 'week', 'month'; got " + repr(interval)
    )

if interval != 'day':
    counts_by_site_per_interval = {}
    for hpo in counts_by_site_per_day:
        daily_counts = counts_by_site_per_day[hpo]
        dates = list(daily_counts.keys())
        if interval == 'week':
            # Here, weeks are considered simply seven-day periods
            # beginning with the requested 'end_date'.
            # This interval aligns with Dashboard,
            # but would probably be better labeled '7-day'.
            #
            # TODO: Enable counting at the most recent Sunday.
            # This would align with the calendrical week.
            dates = list(reversed(dates))
            counts_by_week = {}
            for days_by_week in n_sized_chunks(dates, 7):
                # Each bin is labeled with the first date of its chunk
                week_date = days_by_week[0]
                counts_by_week[week_date] = sum(
                    daily_counts[date] for date in days_by_week
                )
            counts_by_site_per_interval[hpo] = counts_by_week
        else:
            # interval == 'month'
            # Note: these are calendrical months.
            # TODO: Add '30-day' interval, to align with Dashboard.
            # TODO: Reconcile 'month' counts with month-long 'day' range, and Dashboard.
            counts_by_month = {}
            for date in dates:
                year, bare_month, _ = date.split('-')
                month = year + '-' + bare_month # e.g. 2017-10
                counts_by_month[month] = (
                    counts_by_month.get(month, 0) + daily_counts[date]
                )
            counts_by_site_per_interval[hpo] = counts_by_month
else:
    counts_by_site_per_interval = counts_by_site_per_day

print('counts_by_site_per_interval')
print(counts_by_site_per_interval)

# Print a CSV table with first row listing dates,
# and each subsequent row listing counts per date by site
output = []

# Sum of all counts per HPO over every bin in the range
totals = {}

# The one-day past-shift noted above applies to daily dates only: the
# header drops the last date and the counts drop the first one.
# Applying that same slicing to week or month bins would move counts by
# a whole bin (and leaves nothing to print when the range is a single
# month), so for those intervals labels and counts are kept aligned.
shift_labels = interval == 'day'

# Header row, built once from the first HPO seen rather than from one
# particular, hard-coded HPO.
header = None

for hpo in counts_by_site_per_interval:
    counts_by_date = counts_by_site_per_interval[hpo]
    dates = sorted(counts_by_date.keys())

    if header is None:
        header_dates = dates[:-1] if shift_labels else dates
        header = 'Recruitment origin,' + ','.join(header_dates)

    totals[hpo] = sum(counts_by_date[date] for date in dates)

    if hpo_of_interest != '' and hpo != hpo_of_interest:
        # If we're analyzing only one HPO and this isn't it, skip it
        continue

    # Get counts in same order as sorted dates.
    counted_dates = dates[1:] if shift_labels else dates
    counts = [hpo] + [counts_by_date[date] for date in counted_dates]
    row = ','.join([str(value) for value in counts])
    output.append(row)

if header is not None:
    # The header always comes first, whatever order the HPOs were seen in
    output.insert(0, header)

print('Full participants by site per day')
print('Date range: ' + start_date + ' - ' + end_date)
print('')
if hpo_of_interest != '':
    print('Daily counts for ' + hpo_of_interest + ':')
print('\n'.join(output))
if hpo_of_interest != '':
    print('')
    print('Total for ' + hpo_of_interest + ':')
    # An HPO that does not appear in the data has no full participants
    print(totals.get(hpo_of_interest, 0))

    