In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from src.aou_summary import *

# Date range and interval handed to AouSummary below.
start_date = '2017-05-21'
end_date = '2018-06-02'
interval = 'week'

raw_data_path = '../../data/IntervalTotals/20180608_dates_all.txt'

# AouSummary is provided by the star-import from src.aou_summary.
summary = AouSummary(raw_data_path, start_date, end_date, interval)
cumulative_totals = summary.cumulative_totals

# Walk the totals in sorted key order; a key's position in that ordering
# is used as its week number (0-based).
dates = sorted(cumulative_totals.keys())
weeks = list(range(len(dates)))
full_participants = [cumulative_totals[key] for key in dates]

# One row per date: week index, cumulative total, and the date key itself.
d = {'week': weeks, 'full_participants': full_participants, 'date': dates}
fp_counts_by_date = pd.DataFrame(d)

# Show only the 15 most recent rows.
fp_counts_by_date.tail(15)

In [None]:
# Scatter of the weekly totals with a second-order polynomial regression
# line (order=2) and no confidence band (ci=None).
sns.lmplot(
    data=fp_counts_by_date,
    x='week',
    y='full_participants',
    order=2,
    ci=None,
    size=8,
    scatter_kws={'s': 100},
)

# Pin the lower bound of both axes to zero.
plt.xlim(0, None)
plt.ylim(0, None)

plt.xlabel('Weeks since first full participant enrollment', fontsize=16)
plt.ylabel('Full participants', fontsize=16)
plt.title(
    'Actual full participant growth, ' + ' - '.join((start_date, end_date)),
    fontsize=18,
)
plt.show()

In [None]:
# Number of trailing weeks that are sliced OFF before fitting
# (the fit below uses everything except the last `weeks_in_model` points).
weeks_in_model = 7

x = np.array(weeks[:-weeks_in_model])
y = np.array(full_participants[:-weeks_in_model])

# Degree-2 least-squares fit; coefficients are ordered highest power first.
polynomial = np.polyfit(x, y, 2)
growth_model = np.poly1d(polynomial)

# Coefficients rounded to two decimals, as strings, for display.
poly = [str(round(coefficient, 2)) for coefficient in polynomial]

model_equation = 'y = {}x^2 + {}x + {}'.format(poly[0], poly[1], poly[2])
model_equation

In [None]:
# The projection starts at the week indexed `-weeks_in_model` in `weeks`,
# so its first points overlap the tail of the actual data.
last_week_for_regression = weeks[-weeks_in_model]
future_weeks = list(
    np.arange(last_week_for_regression, last_week_for_regression + 26)
)  # 26 week (6 month) forecast

# Evaluate the fitted polynomial at each projected week and round the
# result to a whole number of participants.
projected_fp_counts = [int(round(count)) for count in growth_model(future_weeks)]

# Actual and projected (ap) series laid end to end so they share one chart.
ap_weeks = weeks + future_weeks
ap_fp_counts = full_participants + projected_fp_counts

# One label per row, used as the hue (color) grouping in the plot.
hues = ['Actual'] * len(full_participants) + ['Predicted'] * len(projected_fp_counts)

ap_d = {'week': ap_weeks, 'full_participants': ap_fp_counts, 'Legend': hues}
ap_fp_counts_by_date = pd.DataFrame(ap_d)

sns.lmplot(
    data=ap_fp_counts_by_date,
    x='week',
    y='full_participants',
    hue='Legend',
    order=2,
    ci=None,
    size=8,
    truncate=True,
)

# Pin the lower bound of both axes to zero.
plt.xlim(0, None)
plt.ylim(0, None)

plt.xlabel('Weeks since first full participant enrollment', fontsize=16)
plt.ylabel('Full participants', fontsize=16)
plt.title('Actual and predicted full participant growth', fontsize=18)
plt.show()

In [None]:
# Solve growth_model(week) == 1,000,000 for `week`.
# A quadratic has two roots, and np.roots makes no promise about their order,
# so blindly taking element [0] can return the negative (past) or a complex
# root. Keep only real, positive roots and use the earliest one.
crossing_weeks = [
    float(root.real)
    for root in np.roots(growth_model - 1_000_000)
    if np.isreal(root) and root.real > 0
]
if not crossing_weeks:
    raise ValueError(
        'growth_model never reaches 1,000,000 full participants at a positive week'
    )
weeks_until = int(round(min(crossing_weeks)))

# NOTE: the month directive is lowercase %m. Uppercase %M means *minutes*,
# which would parse "05" as minutes and silently leave the month at January.
start_datetime = datetime.strptime(start_date, '%Y-%m-%d')

# Week numbers are offsets from week 0, which is taken here to be start_date.
time_needed = timedelta(weeks=weeks_until)
date_of_1_million_full_participants = (start_datetime + time_needed).strftime("%Y-%m-%d")

print(date_of_1_million_full_participants)