In [None]:
import dill

In [None]:
# Restore the interpreter session stored in 'user_and_course_dfs.db'.
# Names used further down but never defined in this notebook (user_urls_dfs,
# ai_edx_dfs, all_resource_dfs, ...) presumably come from this session or from
# `utilities` -- TODO confirm.
# NOTE: dill sessions are pickle-based; only load files from a trusted source.
dill.load_session('user_and_course_dfs.db')

In [None]:
from utilities import *

In [None]:
from datetime import timedelta

In [None]:
# Execute other_graphing_utilities.ipynb in this kernel so that its definitions
# become available here (presumably plot_broken_y_bar and similar -- verify).
%run other_graphing_utilities.ipynb

In [None]:
# Presumably tells save_or_display() to show figures instead of writing them to
# disk -- confirm against the definition of `c`. The flag is switched to True
# further down, right before the "Dropoff by section" figure is saved.
c.to_file = False

In [None]:
def get_first_and_last_activity(data):
    """Yield a ``(first, last)`` pair for every user frame in ``data``.

    ``first`` is the value in the first row of the frame's ``day`` column and
    ``last`` the value in its final row; no sorting is performed here.
    """
    for frame in data:
        active_days = frame.day
        yield active_days.iloc[0], active_days.iloc[-1]
        
def get_weekly_signups_and_dropouts_ratios(data):
    """Return, per week, the share of sign-ups vs. drop-outs among that week's events.

    A user's sign-up is the first entry of their ``day`` column and their
    drop-out is the last entry. Weeks are consecutive 7-day windows starting at
    the earliest sign-up; row ``i`` of the result describes week ``i``.

    The previous implementation divided both counts by a running student
    population before normalising them against each other. That population
    cancels out (``(s/p) / (s/p + d/p) == s / (s + d)``), so the shares are
    computed directly from the weekly counts here.

    Parameters
    ----------
    data : iterable of DataFrame-like
        One frame per user, each with a date-like ``day`` column (values must
        support subtraction yielding a timedelta). Rows are assumed to be in
        chronological order -- TODO confirm against the loader.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Signup Rate'`` and ``'Dropout Rate'``, one row per week.
        Within a row the two values sum to 1; a week without any sign-up or
        drop-out holds NaN in both columns. Empty input gives an empty frame.
    """
    columns = ['Signup Rate', 'Dropout Rate']

    signups = []
    dropouts = []
    for u in data:
        signups.append(u.day.iloc[0])
        dropouts.append(u.day.iloc[-1])

    if not signups:
        return pd.DataFrame(columns=columns)

    first = min(signups)
    last = max(dropouts)
    one_week = timedelta(7)

    # The week containing `last` has index (last - first) // 7 days, hence +1
    # for the count. (ceil(days / 7) + 1 produced an extra, always-empty
    # trailing week whenever the span was not a whole number of weeks.)
    n_weeks = max((last - first) // one_week, 0) + 1

    def _weekly_counts(days):
        # Bucket each date into its week; dates outside the covered range
        # (only possible when a user's rows are not chronological) are ignored.
        counts = [0] * n_weeks
        for day in days:
            week = (day - first) // one_week
            if 0 <= week < n_weeks:
                counts[week] += 1
        return counts

    sign_counts = _weekly_counts(signups)
    drop_counts = _weekly_counts(dropouts)

    nan = float('nan')
    sign_percs = []
    drop_percs = []
    for n_sign, n_drop in zip(sign_counts, drop_counts):
        total = n_sign + n_drop
        # No event at all this week: the ratio is undefined.
        sign_percs.append(n_sign / total if total else nan)
        drop_percs.append(n_drop / total if total else nan)

    return pd.DataFrame({'Signup Rate': sign_percs, 'Dropout Rate': drop_percs}, columns=columns)

def get_first_last_activity(data):
    """Count first- and last-activity events per day.

    For every ``(first, last)`` pair produced by
    ``get_first_and_last_activity(data)`` one event is tallied on ``first`` and
    one on ``last``. Pairs whose two days are equal are tallied separately from
    pairs whose days differ.

    Returns
    -------
    pandas.DataFrame
        Indexed by day (ascending) with four columns:
        0 = first activity, same day      (positive counts)
        1 = last activity, same day       (negative counts)
        2 = first activity, different day (positive counts)
        3 = last activity, different day  (negative counts)
        Days without an event in a column hold 0.
    """
    # Indices into `days`.
    # F = First, L = Last
    # S = Same, D = Different
    FS, LS, FD, LD = range(4)

    # One {day: count} dictionary per column described above.
    days = [{} for _ in range(4)]

    # Last active dates are denoted as negative values and first active dates are positive
    for first, last in get_first_and_last_activity(data):
        if first == last:
            first_days, last_days = days[FS], days[LS]
        else:
            first_days, last_days = days[FD], days[LD]

        first_days[first] = first_days.get(first, 0) + 1
        last_days[last] = last_days.get(last, 0) - 1

    logs_df = pd.DataFrame(days).fillna(0).transpose()

    # A frame built from a list of dicts keeps the keys in insertion order, so
    # without an explicit sort the days come out in the order they were first
    # encountered rather than chronologically.
    return logs_df.sort_index()
    
def plot_first_last_activity(logs_df):
    """Plot first-time vs. last-time activity as a stacked bar chart.

    Parameters
    ----------
    logs_df : pandas.DataFrame
        One row per date and four count columns. The legend and hatching below
        assume the column order first/same-day, last/same-day,
        first/different-day, last/different-day, with "last" counts stored as
        negative numbers (as returned by ``get_first_last_activity``).

    The figure is handed to ``save_or_display`` at the end; nothing is returned.
    """
    reset_style()

    # Increment value between yticks
    step = 5

    # The bars are stacked, so the plotted extent of a date is the sum of its
    # positive columns (upwards) and the sum of its negative columns
    # (downwards), not the largest single cell.
    max_y = int(logs_df.clip(lower=0).sum(axis=1).max())
    min_y = int(logs_df.clip(upper=0).sum(axis=1).min())

    # Snap both ends outwards to a multiple of `step` so that every tick is a
    # round number, 0 is always a tick and the extremes are covered.
    lowest_tick = (min_y // step) * step
    highest_tick = -((-max_y) // step) * step
    yticks = list(range(lowest_tick, highest_tick + 1, step))

    ax = logs_df.plot(kind='bar', stacked=True, figsize=(19, 9), width=0.8, yticks=yticks)

    # Change negative y ticks to positive because we are only concerned with magnitude
    ax.set(yticklabels=[abs(y) for y in yticks],
           xlabel="Date*", ylabel="Frequency", title="First Time Active vs Last Time Active")

    set_red_text_for_workshops(ax.get_xticklabels())

    # Set patterns for different day bars: patches are grouped column by
    # column, so each hatch is repeated once per row of logs_df.
    hatches = [p for p in ('', '', '////', '////') for i in range(len(logs_df))]
    for bar, hatch in zip(ax.patches, hatches):
        bar.set_hatch(hatch)

    # Add annotation to bottom left of graph
    # NOTE(review): (-0.6, -64) is a fixed position in data coordinates and was
    # presumably tuned for one particular dataset -- verify for other inputs.
    ax.text(-0.6, -64, "*Workshop dates are marked in red")

    ax.legend(['First time active (same day)', 'Last time active (same day)',
               'First time active (different day)', 'Last time active (different day)'])

    save_or_display("First Time Active vs Last Time Active (Same vs Different)")

In [None]:
# First/last-activity chart for the user frames in user_urls_dfs.
user_url_first_last = get_first_last_activity(user_urls_dfs)
plot_first_last_activity(user_url_first_last)

In [None]:
# First/last-activity chart for the user frames in ai_edx_dfs.
# NOTE(review): this rebinds `user_url_first_last`, so from here on the name no
# longer matches its contents; a dataset-specific name would be clearer.
user_url_first_last = get_first_last_activity(ai_edx_dfs)
plot_first_last_activity(user_url_first_last)

In [None]:
# Tally, for user_urls_dfs, the number of whole days between each user's first
# and last activity.
# Maps a day difference to the number of users with that difference.
time_delta_freqs = collections.defaultdict(int)

# Every user's day difference, in iteration order.
time_deltas = []

for first, last in get_first_and_last_activity(user_urls_dfs):
    delta = (last - first).days
    
    time_deltas.append(delta)
    time_delta_freqs[delta] += 1

# One row per distinct day difference (sorted ascending), one column of counts.
time_delta_df = pd.DataFrame.from_dict(time_delta_freqs, orient='index').sort_index()

In [None]:
# Bar chart of time_delta_df (as left by whichever tally cell ran last) using a
# broken y-axis. The two `lims` ranges are presumably the y-intervals kept on
# either side of the break, and ylabel_loc / breakline_len look hand-tuned for
# this dataset -- confirm against plot_broken_y_bar.
plot_broken_y_bar(
    time_delta_df, lims=[(0, 19), (140, 155)], 
    xlabel="Time Interval Between First and Last Activity", 
    ylabel="Number of students", 
    ylabel_loc=(-3.8, 24), figsize=(15, 5), breakline_len=.008
)

In [None]:
# Tally, for ai_edx_dfs, the number of whole days between each user's first and
# last activity. This rebinds time_delta_freqs, time_deltas and time_delta_df,
# so the cells below operate on the ai_edx_dfs figures.
# Maps a day difference to the number of users with that difference.
time_delta_freqs = collections.defaultdict(int)

# Every user's day difference, in iteration order.
time_deltas = []

for first, last in get_first_and_last_activity(ai_edx_dfs):
    delta = (last - first).days

    time_deltas.append(delta)
    time_delta_freqs[delta] += 1

# One row per distinct day difference (sorted ascending), one column of counts.
time_delta_df = pd.DataFrame.from_dict(time_delta_freqs, orient='index').sort_index()

In [None]:
# Bar chart: one bar per distinct day difference in time_delta_df.
time_delta_df.plot.bar()

In [None]:
reset_style()

# Cumulative, normalised histogram of the per-user day differences collected in
# `time_deltas`, followed by their summary statistics.
title = "Cumulative Distribution of Time Between First and Last Activity"

tdf_df = pd.DataFrame(time_deltas)

ax = tdf_df.plot.hist(density=True, cumulative=True, ec="white")
ax.set_xlabel("Time Interval Between First and Last Activity")
ax.set_ylabel("Percentage")
ax.set_title(title)

# A single unnamed column needs no legend.
ax.get_legend().remove()

save_or_display(title)

tdf_df.describe()

In [None]:
# Weekly sign-up vs. drop-out shares for user_urls_dfs, one pair of bars per week.
drops_signs_df = get_weekly_signups_and_dropouts_ratios(user_urls_dfs)

ax = drops_signs_df.plot.bar(width=0.8, align='edge')
ax.set_xlabel("Week #")
ax.set_ylabel("Ratio of Signup to Dropout Rates Based on Student Population")

save_or_display("Weekly Signup and Dropout Rate Ratios")

In [None]:
def dropoff_by_section_type(sections_in_order, normalized_dict=None, dict_func=lambda x: x,
                            resource_dfs=None, resource_ordering=None):
    """Count, per section, the users whose furthest resource falls into it.

    For every user frame the resource that sits furthest along
    ``resource_ordering`` among the user's ``display_name`` values is taken as
    the point where that user dropped off. It is mapped to a key through
    ``dict_func`` and that key's counter is incremented. Display names missing
    from ``resource_ordering`` are ignored, and users with no known resource
    are not counted.

    Parameters
    ----------
    sections_in_order : iterable
        Keys of the result, in output order; all counters start at 0.
    normalized_dict : mapping, optional
        ``{key: divisor}``; when given (and non-empty) each listed key's count
        is divided by its divisor.
    dict_func : callable, optional
        Maps a resource name to one of ``sections_in_order``. Identity by default.
    resource_dfs : iterable of DataFrame-like, optional
        One frame per user with a ``display_name`` column. Defaults to the
        notebook-level ``all_resource_dfs``.
    resource_ordering : sequence, optional
        Resource names in course order. Defaults to the notebook-level
        ``resource_order``.

    Returns
    -------
    pandas.Series
        Drop-off count (or normalised value) per key of ``sections_in_order``.

    Raises
    ------
    KeyError
        If ``dict_func`` yields, or ``normalized_dict`` contains, a key that is
        not in ``sections_in_order``.
    """
    if resource_dfs is None:
        resource_dfs = all_resource_dfs
    if resource_ordering is None:
        resource_ordering = resource_order

    # Position of the first occurrence of every resource (what list.index
    # returns), looked up in O(1) instead of scanning the list per name.
    position = {}
    for index, name in enumerate(resource_ordering):
        position.setdefault(name, index)

    dropoffs = collections.OrderedDict.fromkeys(sections_in_order, 0)

    for u in resource_dfs:
        reached = [name for name in u.display_name.unique() if name in position]
        if not reached:
            continue

        # Furthest resource along the course. The old loop never updated its
        # running maximum index, so it simply kept the last name it saw.
        last_resource = max(reached, key=position.__getitem__)
        dropoffs[dict_func(last_resource)] += 1

    if normalized_dict:
        for section, value in normalized_dict.items():
            dropoffs[section] /= value

    return pd.Series(dropoffs)

In [None]:
# Number of users whose frame mentions each resource at least once.
num_users_using_resources = collections.defaultdict(int)

for u in all_resource_dfs:
    # Build the user's set of display names once instead of scanning the whole
    # column again for every resource.
    used_by_user = set(u.display_name.values)

    for resource in resource_order:
        if resource in used_by_user:
            num_users_using_resources[resource] += 1

# Per section, the sum of the per-resource user counts. A user who touched
# several resources of one section is counted once per resource.
section_user_use = collections.defaultdict(int)

for resource, n_users in num_users_using_resources.items():
    section_user_use[section_of_resources[resource]] += n_users

# Order in which the sections are reported.
reordering = [
    'Getting Started',
    'Unit 1: Introduction to MIT App Inventor',
    'Unit 2: Application Coding',
    'Unit 3: Programming Basics & Dialog',
    'Unit 4: More Programming Basics',
    'Unit 5: Alarm Clock Application',
    'Unit 6: Audio & Video',
    'Unit 7: Drawing Application',
    'Unit 8: File',
    'Unit 9: Game',
    'Unit 10: Device Location',
    'Unit 11: Web Browsing',
    'Capstone Project'
]

# Sections listed above but absent from section_user_use become NaN; sections
# not listed are dropped. (The former argument-less .reindex() did nothing.)
section_series = pd.Series(section_user_use).reindex(index=reordering)

In [None]:
# Drop-offs aggregated per section: each user's final resource is translated to
# its section through section_of_resources before being counted.
dropoff_by_section = dropoff_by_section_type(section_order, dict_func = lambda x: section_of_resources[x])

In [None]:
# Drop-offs per individual resource, keeping only resources with a non-zero count.
dropoff_by_resource = dropoff_by_section_type(resource_order).loc[lambda counts: counts != 0]

In [None]:
# Bar chart of the per-section drop-off counts.
dropoff_section_ax = dropoff_by_section.plot.bar(legend=False, title="Dropoffs by section")
dropoff_section_ax.set_ylabel('# Dropoffs')
dropoff_section_ax.set_xlabel('Section')

# From here on the flag is True, which also affects every later
# save_or_display() call in the notebook.
c.to_file = True
save_or_display("Dropoff by section")

In [None]:
# Bar chart of the per-resource drop-off counts.
dropoff_resource_ax = dropoff_by_resource.plot.bar(legend=False, title="Dropoffs by resource")
dropoff_resource_ax.set_ylabel('# Dropoffs')
dropoff_resource_ax.set_xlabel('Resource')

save_or_display("Dropoff by resource")

In [None]:
# Inspect the per-section usage totals, then fit a linear trend of usage
# against the section's position in the course (0 = first section).
print(section_series.values)
print(section_series.index)

# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.regplot(x=list(range(len(section_series.index))), y=section_series.values)