In [None]:
import dill
# NOTE(review): dill.load_session unpickles arbitrary objects and restores them as
# globals of this kernel -- only load session files from a trusted source.
# Names used below but not defined in this notebook (e.g. user_urls_dfs, ai_edx_dfs,
# resource_order) presumably come from this session or from the imports that
# follow -- TODO confirm.
dill.load_session('user_and_course_dfs.db')

In [None]:
from utilities import *

In [None]:
%run other_graphing_utilities.ipynb

In [None]:
def get_durations(data, session_threshold=TWO_HOURS):
    """Build a table of consecutive-row transitions with the time spent on each.

    Parameters
    ----------
    data : iterable
        Objects exposing ``time`` and ``display_name`` columns (one per element).
    session_threshold : number
        Largest gap, in seconds, still treated as time spent on a resource.
        Larger gaps are recorded with a duration of 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``from``, ``to`` and ``duration`` (seconds), one row per pair
        of consecutive rows in each element of ``data``.
    """
    rows = []

    for user_frame in data:
        # count() ignores missing timestamps, so this is the number of
        # consecutive pairs among the non-null times.
        pair_count = user_frame.time.count() - 1

        for pos in range(pair_count):
            nxt = pos + 1
            # For some reason, despite the timezones already being in UTC, they have
            # to be converted again. Not sure if a bug or something is modifying them
            # between assignment and here.
            later = user_frame.time.iloc[nxt].tz_convert('UTC')
            earlier = user_frame.time.iloc[pos].tz_convert('UTC')
            gap_seconds = (later - earlier).total_seconds()

            if gap_seconds > session_threshold:
                gap_seconds = 0

            rows.append((user_frame.display_name.iloc[pos],
                         user_frame.display_name.iloc[nxt],
                         gap_seconds))

    return pd.DataFrame(rows, columns=['from', 'to', 'duration'])

def result_of_keep_reloads(keep_reloads, transitions):
    """Optionally drop reloads (rows whose ``from`` equals their ``to``).

    Returns ``transitions`` itself when ``keep_reloads`` is truthy or the table
    is empty; otherwise returns only the rows where ``from`` differs from ``to``.
    """
    if keep_reloads or not len(transitions):
        return transitions

    changes_resource = transitions['from'] != transitions['to']
    return transitions[changes_resource]
    
def get_resource_index(resource, row=None, from_to=None):
    """Look up a resource's position in the global ``resource_order`` list.

    Either pass ``resource`` directly, or pass ``row`` together with ``from_to``
    (the key of ``row`` that holds the resource).  ``row`` takes precedence
    when it is given.

    Returns the index, ``np.nan`` when the lookup raises ``ValueError``
    (the error is printed), or ``None`` after printing a usage message when
    the arguments are incomplete.
    """
    try:
        if row is None:
            if resource:
                return resource_order.index(resource)

            print("You must specify either a resource or a row")
            return

        if from_to:
            return resource_order.index(row[from_to])

        print("You must specify either the 'from' or 'to' column")
        return

    except ValueError as lookup_error:
        print(lookup_error)
        return np.nan
    
def get_resource_duration_sums(d_df):
    """Return the summed ``duration`` per ``from`` resource, largest total first."""
    totals = d_df.groupby(['from'])['duration'].sum()
    return totals.sort_values(ascending=False)

def get_resource_duration_avgs(d_df):
    """Return the mean ``duration`` per ``from`` resource, largest mean first."""
    means = d_df.groupby(['from'])['duration'].mean()
    return means.sort_values(ascending=False)

In [None]:
# Total duration per 'from' resource over every frame in user_urls_dfs,
# sorted from the largest total to the smallest.
resource_durations = get_resource_duration_sums(get_durations(user_urls_dfs))

In [None]:
# Number of rows per display_name across all frames in user_urls_dfs.
resource_counts = pd.concat(user_urls_dfs).display_name.value_counts()

In [None]:
# Mean duration per 'from' resource, sorted from the largest mean to the smallest.
# NOTE(review): get_durations(user_urls_dfs) is recomputed here although the cell
# that builds resource_durations already makes the same call.
resource_duration_avgs = get_resource_duration_avgs(get_durations(user_urls_dfs))

## First Attempt

In [None]:
# get_transition_counts is defined outside this notebook section; the next cell
# relies on its result having a 'count' column and the columns named by FROM and TO
# -- presumably one row per (from, to) pair; TODO confirm.
transition_counts = get_transition_counts(user_urls_dfs)

In [None]:
# Annotate every transition with the position of its two endpoints in resource_order.
transition_counts['from_index'] = transition_counts.apply(
    lambda transition: get_resource_index(None, row=transition, from_to=FROM),
    axis=1,
)
transition_counts['to_index'] = transition_counts.apply(
    lambda transition: get_resource_index(None, row=transition, from_to=TO),
    axis=1,
)

# Keep only fully populated rows whose destination sits at a lower position in
# resource_order than their origin and that occur more than once, ordered by count.
transition_counts = (
    transition_counts
    .dropna()
    .loc[lambda frame: frame['from_index'] > frame['to_index']]
    .sort_values('count')
    .loc[lambda frame: frame['count'] > 1]
)

transitions_median = transition_counts['count'].median()
transitions_mean = transition_counts['count'].mean()

print(transitions_median, transitions_mean, sep='\n')
display(transition_counts)

In [None]:
# Sum of all per-resource duration totals.
total_resource_duration = resource_durations.sum()

# Share of the total duration attributed to each resource.
# NOTE(review): yields nan/inf values if total_resource_duration is 0.
ratios = [d/total_resource_duration for d in resource_durations]
# Rescale so the resource with the largest share maps to exactly transitions_median.
# NOTE(review): max() raises ValueError when resource_durations is empty.
scale = 1/max(ratios)

scaled_medians = []

# Pair every resource name with transitions_median scaled by its relative share.
for display_name, ratio in zip(resource_durations.index, ratios):
    scaled_medians.append((display_name, float(transitions_median*ratio*scale)))
    
# Same data as a frame indexed by resource name.
scaled_medians_df = pd.DataFrame(scaled_medians, columns=['resource', 'scaled median']).set_index('resource')

In [None]:
# Show the scaled medians, then write them to tmp/scaled_medians.csv.
display(scaled_medians_df)
scaled_medians_df.to_csv('tmp/scaled_medians.csv')

In [None]:
# Lookup tables keyed by resource name: scaled median and mean duration.
# scaled_medians is the list of (resource, value) tuples built above;
# resource_duration_avgs is a Series, so dict() maps its index labels to its values.
medians = dict(scaled_medians)
duration_avgs = dict(resource_duration_avgs)

## Second Attempt

In [None]:
def _add_resource_indices(frame):
    """Return a copy of ``frame`` with ``from_index`` and ``to_index`` columns.

    Each value is what ``get_resource_index`` reports for the row's FROM / TO
    resource: its position in ``resource_order``, or NaN when it is not listed.
    Working on a copy avoids writing into a filtered slice of another frame
    (which triggers pandas' SettingWithCopyWarning).  ``frame`` must not be
    empty: a row-wise ``apply`` on an empty frame returns a frame, not a Series.
    """
    indexed = frame.copy()
    for column, endpoint in (('from_index', FROM), ('to_index', TO)):
        indexed[column] = indexed.apply(
            lambda row: get_resource_index(None, row=row, from_to=endpoint),
            axis=1)
    return indexed


def get_review_duration_avgs(data, min_review_seconds=3*60):
    """Average duration and occurrence count of "review" transitions.

    A review is a non-reload transition whose origin sits at a higher position
    in ``resource_order`` than its destination and whose duration exceeds
    ``min_review_seconds``.  Only (from, to) pairs that occur more than once and
    do not involve the 'Server' resource are reported.

    Parameters
    ----------
    data : iterable
        Passed straight to ``get_durations``.
    min_review_seconds : number, default 180
        Minimum duration, in seconds, for a transition to count as a review.
        (An earlier variant of this code used the 0.91 quantile of the
        durations instead of a fixed value.)

    Returns
    -------
    pandas.DataFrame or None
        Columns ``from``, ``to``, ``duration average``, ``count``,
        ``from_ktype``, ``to_ktype``, ``from_index`` and ``to_index``, sorted by
        ``count``.  ``None`` (after printing the reason) when no rows survive
        one of the filtering stages.
    """
    durations = get_durations(data, session_threshold=TWO_HOURS)
    durations = result_of_keep_reloads(False, durations)

    if not durations.size:
        print("No durations")
        return
    # Show the unfiltered transition table for inspection.
    display(durations)

    # Rows whose resources are missing from resource_order get NaN indices and
    # are dropped here, together with any other incomplete row.
    durations = _add_resource_indices(durations).dropna()
    durations = durations[durations['from_index'] > durations['to_index']]
    durations = durations[durations['duration'] > min_review_seconds].sort_values('duration')

    # Report the threshold in minutes.
    print(min_review_seconds/60)

    if not durations.size:
        print("No valid reviews")
        return

    # Both tables below come from grouping the same frame by the same keys and
    # resetting the index, so their integer labels refer to the same
    # (from, to) pair and the 'count' assignment aligns row for row.
    duration_avgs = get_transition_duration_avgs(durations, keep_reloads=False) \
                    .sort_values(['from', 'to'])

    occurrences = durations.groupby(['from', 'to']).size() \
                  .to_frame(name='count').reset_index()

    duration_avgs['count'] = occurrences['count']
    duration_avgs = duration_avgs.sort_values('count')
    duration_avgs = duration_avgs[duration_avgs['count'] > 1]

    if not duration_avgs.size:
        print("Transitions must occur more than once")
        return

    # .copy() so the column assignments below do not write into a slice.
    duration_avgs = duration_avgs[(duration_avgs['from'] != 'Server') &
                                  (duration_avgs['to'] != 'Server')].copy()

    # The row-wise apply calls below misbehave on an empty frame (they return a
    # frame instead of a Series, which cannot be assigned to a single column on
    # current pandas), so stop here when every remaining pair involved 'Server'.
    if not duration_avgs.size:
        print("No valid reviews")
        return

    for column, endpoint in (('from_ktype', 'from'), ('to_ktype', 'to')):
        duration_avgs[column] = duration_avgs.apply(
            lambda row: get_knowledge_types_used_single(row[endpoint], resource_categories),
            axis=1
        )

    # Grouping discarded the index columns computed earlier; add them again.
    return _add_resource_indices(duration_avgs)


def get_transition_duration_avgs(d_df, keep_reloads=False):
    """Mean ``duration`` for every (from, to) pair in ``d_df``.

    Returns a frame with columns ``from``, ``to`` and ``duration average``;
    pairs whose ``from`` equals their ``to`` are removed unless
    ``keep_reloads`` is truthy.
    """
    mean_by_pair = d_df.groupby(['from', 'to'])['duration'].mean()
    avgs = mean_by_pair.to_frame(name='duration average').reset_index()

    return result_of_keep_reloads(keep_reloads, avgs)


In [None]:
reviews = get_review_duration_avgs(user_urls_dfs)

# get_review_duration_avgs prints the reason and returns None when no transition
# qualifies; indexing None would raise TypeError, so only reshape a real result.
if reviews is not None:
    # Expose the endpoints under friendlier names and keep only those plus 'count'.
    reviews['origin'] = reviews['from']
    reviews['destination'] = reviews['to']
    reviews = reviews.drop(['duration average', 'from_ktype',
                            'to_ktype', 'from_index',
                            'to_index', 'from', 'to'], axis=1)

    display(reviews)

In [None]:
# Show complete cell contents instead of truncating long strings.
# None is the supported "no limit" value; -1 was deprecated in pandas 1.0 and is
# rejected by newer releases, while older releases only accept integers, hence
# the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
display(reviews)

In [None]:
# Same review analysis for ai_edx_dfs; this rebinds `reviews`, replacing the
# result computed from user_urls_dfs.
# NOTE(review): get_review_duration_avgs returns None when nothing qualifies,
# in which case display() shows None.
reviews = get_review_duration_avgs(ai_edx_dfs)
display(reviews)