# Fast generation of healthy and unhealthy assessment counts by period

In [None]:
%matplotlib inline
import math

import numpy as np
from numba import njit
import matplotlib.pyplot as plt

from exetera.core.session import Session
from exetera.core.utils import Timer
from exetera.processing.date_time_helpers import\
    get_periods, generate_period_offset_map, get_days, get_period_offsets

### Helper functions

In [None]:
def human_readable_date(date):
    '''
    Format a date as a "YYYY-MM-DD" string.

    :param date: either an object providing ``strftime`` (such as a
        ``datetime``) or a POSIX timestamp given as a real number (``float``,
        ``int`` or a numpy scalar). Timestamps are converted with
        ``datetime.fromtimestamp`` and are therefore interpreted in the
        local timezone.
    :return: the date formatted with "%Y-%m-%d".
    '''
    # Imported locally so this helper works even if the cell that imports
    # ``datetime`` at notebook level has not been run yet.
    from datetime import datetime
    from numbers import Real

    # Any real number is treated as a timestamp; checking for ``float`` only
    # let integer timestamps fall through and fail on ``strftime``.
    if isinstance(date, Real):
        date = datetime.fromtimestamp(float(date))
    return date.strftime("%Y-%m-%d")

## Fill in these parameters

In [None]:
from datetime import datetime, timedelta

# NOTE: the three assignments below are placeholders with no value; each one
# must be completed before this cell can run.
filename = # location of the dataset; passed to Session.open_dataset in a later cell
start_dt = # the starting datetime; later cells call .timestamp() on it and pass it to get_periods
end_dt = # the ending datetime; later cells call .timestamp() on it and pass it to get_periods

## Generate the summaries by seven day period

### Generate the seven day periods corresponding to the start and end dates

In [None]:
# Split the date range into seven-day periods
start_ts, end_ts = start_dt.timestamp(), end_dt.timestamp()

# get_periods is called from end_dt towards start_dt with a delta of -1 week;
# the result is then reversed in place (presumably so that the periods end up
# in ascending date order - confirm against the get_periods documentation).
periods = get_periods(end_dt, start_dt, 'week', -1)
periods.reverse()

first_period, last_period = periods[0], periods[-1]
print("Weekly periods from {} to {}".format(human_readable_date(first_period),
                                            human_readable_date(last_period)))

### Create the Session object
Note, you can also use `with Session() as s:` if you don't mind opening the session in each cell

In [None]:
# Open the ExeTera session; it stays open across cells until s.close() is called
s = Session()
# Open the dataset at `filename` in read-only ('r') mode, registered under the name 'src'
src = s.open_dataset(filename, 'r', 'src')
# Fetch the 'assessments' dataframe, which the following cells read their fields from
assessment_df = src['assessments']

### Get the creation timestamp for each assessment

In [None]:
with Timer("Fetching assessment 'created_at' values"):  # Timer wraps the block to record how long it takes
    # Read the whole 'created_at' field of the assessments dataframe via `.data[:]`
    # so that the following cells can work on it directly
    created_at_dates = assessment_df['created_at'].data[:]

### Calculate on what day (relative to the start of the first period) each assessment was created
`get_days` also returns a filter indicating whether a given record is within the date range of interest

In [None]:
with Timer("Calculating day offsets for assessments"):
    # The date range of interest runs from the first period to the last one
    range_start_ts = periods[0].timestamp()
    range_end_ts = periods[-1].timestamp()

    # Turn the timestamps into elapsed days relative to the range start;
    # `inrange` marks the records that fall inside the range
    created_at_days, inrange = get_days(created_at_dates,
                                        start_date=range_start_ts,
                                        end_date=range_end_ts)

### Clear the days that fall outside of the specified range

In [None]:
with Timer("Filter out days that fall outside of the specified range"):
    # Keep only the day offsets that get_days flagged as in range. `inrange`
    # itself is left untouched because it is reused later to filter the
    # 'health_status' field in the same way.
    created_at_days = created_at_days[inrange]

### Map the days to their corresponding periods
We generate the map using `generate_period_offset_map` and then pass it, together with the day offsets, to `get_period_offsets`

In [None]:
with Timer("Convert from days to periods"):
    # Build the map from the periods first, then use it to translate each
    # in-range day offset into a period offset
    period_offset_map = generate_period_offset_map(periods)
    created_at_periods = get_period_offsets(period_offset_map, created_at_days)

### Generate 'healthy' and 'unhealthy' assessment filters
Consider assessments with no health status to be 'healthy'

In [None]:
with Timer("Generate healthy and unhealthy status arrays"):
    # Apply the in-range filter to 'health_status' once and reuse the result
    # for both comparisons, rather than reading and filtering the field twice.
    inrange_health_status = assessment_df['health_status'].apply_filter(inrange)

    # A value of 2 marks an 'unhealthy' assessment (data value defined in the schema)
    unhealthy = inrange_health_status == 2
    # Every other value, including assessments with no health status, counts as 'healthy'
    healthy = inrange_health_status != 2

### Summarise unhealthy and healthy by period

In [None]:
with Timer("Summarise unhealthy and healthy by period"):
    period_count = len(periods)

    # Healthy assessments: np.unique yields (period offsets, counts) for the
    # periods that actually occur; the counts are written into a zero-filled
    # array so that periods without any healthy assessment stay at 0.
    healthy_counts = np.unique(created_at_periods[healthy.data[:]], return_counts=True)
    all_healthy_counts = np.zeros(period_count, dtype=np.int32)
    all_healthy_counts[healthy_counts[0]] = healthy_counts[1]

    # Unhealthy assessments: same procedure with the 'unhealthy' filter.
    unhealthy_counts = np.unique(created_at_periods[unhealthy.data[:]], return_counts=True)
    all_unhealthy_counts = np.zeros(period_count, dtype=np.int32)
    all_unhealthy_counts[unhealthy_counts[0]] = unhealthy_counts[1]

## Generate the charts for healthy / unhealthy assessments

In [None]:
# One bar position per entry of `periods`
width = 1
widths = [width * d for d in range(len(periods))]

# Both charts share the same x axis: a tick for every entry of `periods`
# except the last, labelled with the date of that entry.
tick_positions = np.arange(len(periods) - 1)
tick_labels = [human_readable_date(d) for d in periods[:-1]]

fig, ax = plt.subplots(2, 1, figsize=(10, 10))

# Top chart: healthy and unhealthy counts as stacked bars
# (the unhealthy bars sit on top of the healthy ones).
negtests = ax[0].bar(widths, all_healthy_counts)
postests = ax[0].bar(widths, all_unhealthy_counts, bottom=all_healthy_counts)

ax[0].set_title("Assessment counts by week")
ax[0].set_xticks(tick_positions)
ax[0].set_xticklabels(tick_labels, rotation=270)
# y ticks at every million from 0 to 9 million, labelled in units of millions.
# The labels are passed as a list of strings rather than a generator.
ax[0].set_yticks(np.arange(10) * 1000000)
ax[0].set_yticklabels([str(i) for i in range(10)])
ax[0].legend((negtests, postests), ("'Healthy'", "'Unhealthy'"))
ax[0].set_xlabel("Week starting")
ax[0].set_ylabel("Million assessments per week")

# Bottom chart: unhealthy assessments as a fraction of all assessments.
# Periods with no assessments get a denominator of 1 to avoid dividing by
# zero; their numerator is 0, so the fraction is 0.
all_counts = all_unhealthy_counts + all_healthy_counts
all_counts = np.where(all_counts == 0, 1, all_counts)

pos_fraction = all_unhealthy_counts / all_counts
pfbar = ax[1].bar(widths, pos_fraction, color="#ff7f0e")

ax[1].set_title("'Unhealthy' assessments as a fraction of assessments by week")
ax[1].set_xticks(tick_positions)
ax[1].set_xticklabels(tick_labels, rotation=270)
# The legend names what is plotted (the fraction of 'unhealthy' assessments);
# the earlier "Positive test fraction" label did not match this data.
ax[1].legend((pfbar,), ("'Unhealthy' assessment fraction",))
ax[1].set_xlabel("Week starting")
ax[1].set_ylabel("'Unhealthy' assessment fraction")

fig.tight_layout(h_pad=2.5)
plt.show()

In [None]:
# Close the session opened with `s = Session()` earlier in the notebook.
# This manual call is not needed when the session is opened using a 'with' statement.
s.close()