In [None]:
import dill

In [None]:
# Restores the variables saved in the session file into this kernel's namespace.
# Later cells use names that are never assigned in this notebook (presumably
# including user_urls_dfs - TODO confirm they come from this session file).
# NOTE(review): dill sessions are pickle-based, so loading one can execute arbitrary
# code; only open session files from a trusted source.
dill.load_session('user_and_course_dfs.db')

In [None]:
from utilities import *

In [None]:
%run other_graphing_utilities.ipynb

In [None]:
def get_transition_counts(data, keep_reloads=False, session_threshold=TWO_HOURS):
    """Count how often each (from, to) resource pair occurs in ``data``.

    Args:
        data: forwarded unchanged to ``get_durations``.
        keep_reloads: when falsy, rows whose 'from' equals 'to' are removed by
            ``result_of_keep_reloads``.
        session_threshold: forwarded to ``get_durations``.

    Returns:
        DataFrame with columns 'from', 'to' and 'count' (one row per pair).
    """
    durations_df = get_durations(data, session_threshold=session_threshold)

    # size() yields one entry per (from, to) group; flatten it back into columns.
    pair_sizes = durations_df.groupby(['from', 'to']).size()
    counts = pair_sizes.to_frame(name='count').reset_index()

    return result_of_keep_reloads(keep_reloads, counts)

def get_durations(data, session_threshold=TWO_HOURS):
    """Build one (from, to, duration) row for every pair of consecutive visits.

    Args:
        data: iterable of per-user frames exposing ``time`` and ``display_name``.
        session_threshold: largest gap, in seconds, still treated as time spent
            on the 'from' resource; larger gaps are recorded as 0.

    Returns:
        DataFrame with columns 'from', 'to' and 'duration' (seconds).
    """
    rows = []

    for user_df in data:
        times = user_df.time
        names = user_df.display_name

        for pos in range(times.count() - 1):
            # For some reason, despite the timezones already being in UTC, they have to be
            # converted again. Not sure if a bug or something is modifying them between
            # assignment and here.
            later = times.iloc[pos + 1].tz_convert('UTC')
            earlier = times.iloc[pos].tz_convert('UTC')
            elapsed = (later - earlier).total_seconds()

            # A gap beyond the threshold contributes no time to the 'from' resource.
            if elapsed > session_threshold:
                elapsed = 0

            rows.append((names.iloc[pos], names.iloc[pos + 1], elapsed))

    return pd.DataFrame(rows, columns=['from', 'to', 'duration'])

def result_of_keep_reloads(keep_reloads, transitions):
    """Optionally drop reloads (rows whose 'from' equals 'to').

    Args:
        keep_reloads: when truthy, ``transitions`` is returned untouched.
        transitions: frame with 'from' and 'to' columns.

    Returns:
        ``transitions`` itself when reloads are kept or the frame is empty,
        otherwise a filtered frame without the self-transitions.
    """
    # Nothing to filter: either reloads are wanted or there are no rows at all.
    if keep_reloads or not len(transitions):
        return transitions

    changed_resource = transitions['from'] != transitions['to']
    return transitions[changed_resource]

def analyze_knowledge_types(data, avg_duration_threshold=DURATION_AVERAGE_THRESHOLD):
    """Summarise transitions between knowledge-type categories.

    Reads the notebook globals ``KTYPES``, ``resource_categories`` (passed to
    ``get_knowledge_types_used_single``) and ``duration_avgs`` (indexed by
    resource name), so those must be defined before calling.

    Args:
        data: forwarded to ``get_transition_counts``; its default
            ``keep_reloads=False`` is used, so self-transitions are excluded.
        avg_duration_threshold: a source resource counts as "over threshold"
            when ``int(duration_avgs[resource])`` is greater than this value.

    Returns:
        A 4-tuple:

        * ``transitions_from_to`` - nested dict ``{from_cat: {to_cat: n}}``. Each
          distinct (from, to) resource pair adds 1 per category pairing; the
          pair's 'count' value is not used as a weight.
        * ``percent_df`` - DataFrame of fractions formatted as two-decimal
          strings. Row 'All' is each category's share of all category
          occurrences (every pairing counts once for its source category and
          once for its destination category); every other row is the
          distribution of destination categories for that source category.
          A zero denominator yields "0.00" instead of raising.
        * ``categories_avg_duration_over_threshold`` - dict mapping category to
          the number of over-threshold source resources in it.
        * ``resources_avg_duration_over_threshold`` - list of those resources.
    """
    def share(part, whole):
        # A category without any transitions has a zero denominator; report it
        # as 0.00 rather than failing with ZeroDivisionError.
        return "{:0.2f}".format(part / whole if whole else 0)

    transitions_from_to = {ktype: {kt: 0 for kt in KTYPES} for ktype in KTYPES}

    categories_avg_duration_over_threshold = {kt: 0 for kt in KTYPES}
    resources_avg_duration_over_threshold = []

    total_of_each_category = {kt: 0 for kt in KTYPES}

    transition_counts = get_transition_counts(data)

    for resource, transitions in transition_counts.groupby("from"):
        types = get_knowledge_types_used_single(resource, resource_categories)
        categories = get_knowledge_type_categories(types)

        if int(duration_avgs[resource]) > avg_duration_threshold:
            resources_avg_duration_over_threshold.append(resource)
            for c in categories:
                categories_avg_duration_over_threshold[c] += 1

        # Only the destination name is needed from each transition row.
        for destination in transitions['to']:
            to_types = get_knowledge_types_used_single(destination, resource_categories)
            to_categories = get_knowledge_type_categories(to_types)

            for from_c in categories:
                for to_c in to_categories:
                    total_of_each_category[from_c] += 1
                    total_of_each_category[to_c] += 1
                    transitions_from_to[from_c][to_c] += 1

    # First row: overall share of each category; then one row per source category.
    all_total = sum(total_of_each_category.values())
    percents = [[share(v, all_total) for v in total_of_each_category.values()]]

    for destinations in transitions_from_to.values():
        row_total = sum(destinations.values())
        percents.append([share(v, row_total) for v in destinations.values()])

    columns = list(transitions_from_to.keys())
    percent_df = pd.DataFrame(data=percents,
                              index=['All'] + columns,
                              columns=columns)

    return transitions_from_to, percent_df, \
           categories_avg_duration_over_threshold, \
           resources_avg_duration_over_threshold

def get_resource_index(resource, row=None, from_to=None):
    """Look up a resource's position in the global ``resource_order``.

    Args:
        resource: name to look up; only used when ``row`` is None.
        row: optional mapping-like row; when given, ``row[from_to]`` is looked up.
        from_to: key of ``row`` that holds the resource name.

    Returns:
        The position in ``resource_order``; ``np.nan`` (after printing the
        error) when the name is not listed; ``None`` (after printing a hint)
        when the arguments do not identify a resource.
    """
    try:
        if row is None:
            if not resource:
                print("You must specify either a resource or a row")
                return None
            name = resource
        else:
            if not from_to:
                print("You must specify either the 'from' or 'to' column")
                return None
            name = row[from_to]

        return resource_order.index(name)

    except ValueError as err:
        # Unknown resources are reported and mapped to NaN so callers can drop them.
        print(err)
        return np.nan

def get_resource_duration_sums(d_df):
    """Total 'duration' per 'from' resource, sorted from largest to smallest.

    Args:
        d_df: frame with 'from' and 'duration' columns.

    Returns:
        Series indexed by 'from' holding the summed durations.
    """
    durations_by_source = d_df.groupby(['from'])['duration']
    totals = durations_by_source.sum()

    return totals.sort_values(ascending=False)

def get_resource_duration_avgs(d_df):
    """Mean 'duration' per 'from' resource, sorted from largest to smallest.

    Args:
        d_df: frame with 'from' and 'duration' columns.

    Returns:
        Series indexed by 'from' holding the average durations.
    """
    durations_by_source = d_df.groupby(['from'])['duration']
    averages = durations_by_source.mean()

    return averages.sort_values(ascending=False)

In [None]:
# get_durations loops over every row of every user frame in Python, so compute it
# once and reuse the result for both aggregations instead of running it twice.
all_durations = get_durations(user_urls_dfs)

resource_durations = get_resource_duration_sums(all_durations)
resource_counts = pd.concat(user_urls_dfs).display_name.value_counts()
resource_duration_avgs = get_resource_duration_avgs(all_durations)

In [None]:
# Count each (from, to) resource pair over every frame in user_urls_dfs; reloads
# (from == to) are dropped because keep_reloads defaults to False.
transition_counts = get_transition_counts(user_urls_dfs)
display(transition_counts)

In [None]:
# Annotate each transition with the position of both endpoints in resource_order,
# then keep only transitions that jump back to an earlier resource and occur more
# than once.
# .assign builds a new frame, so no column is written into the boolean-filtered
# frame returned by get_transition_counts (which triggers SettingWithCopyWarning).
transition_counts = (
    transition_counts
    .assign(
        from_index=lambda df: df.apply(
            lambda row: get_resource_index(None, row=row, from_to=FROM),
            axis=1),
        to_index=lambda df: df.apply(
            lambda row: get_resource_index(None, row=row, from_to=TO),
            axis=1),
    )
    # Resources missing from resource_order come back as NaN and are discarded here.
    .dropna()
    .loc[lambda df: df['from_index'] > df['to_index']]
    .sort_values('count')
    .loc[lambda df: df['count'] > 1]
)

transitions_median = transition_counts['count'].median()
transitions_mean = transition_counts['count'].mean()

print(transitions_median)
print(transitions_mean)
display(transition_counts)

In [None]:
# Share of the total recorded duration that belongs to each resource.
total_resource_duration = resource_durations.sum()
ratios = [d / total_resource_duration for d in resource_durations]

# Rescale so the resource with the largest share gets a factor of 1, i.e. it maps
# onto the transitions median itself and every other resource onto a fraction of it.
scale = 1 / max(ratios)

scaled_medians = []
for display_name, ratio in zip(resource_durations.index, ratios):
    scaled_value = float(transitions_median * ratio * scale)
    scaled_medians.append((display_name, scaled_value))

scaled_medians_df = pd.DataFrame(
    scaled_medians,
    columns=['resource', 'scaled median'],
).set_index('resource')

In [None]:
# Show the scaled medians, then write them to a CSV under the relative tmp/ folder.
display(scaled_medians_df)
scaled_medians_df.to_csv('tmp/scaled_medians.csv')

In [None]:
# Plain dict (resource name -> average duration); analyze_knowledge_types reads it
# as a global, so this cell must run before that function is called.
duration_avgs = dict(resource_duration_avgs)

AI only

In [None]:
# Unpack the four results: category-to-category transition tallies, the formatted
# share table, per-category counts of over-threshold resources, and the list of
# over-threshold resources. Only the share table is displayed here.
transitions, percents, categories_over, resources_over = analyze_knowledge_types(user_urls_dfs)
display(percents)

In [None]:
# For every pair of neighbouring resources in resource_order, tally how often a
# knowledge-type category of the first resource also appears on the next one.
repeats = collections.defaultdict(int)
num_per_type = collections.defaultdict(int)

for i in range(len(resource_order) - 1):
    r1, r2 = resource_order[i], resource_order[i+1]

    r1_ktypes = get_knowledge_type_categories(
        get_knowledge_types_used_single(r1, resource_categories))
    r2_ktypes = get_knowledge_type_categories(
        get_knowledge_types_used_single(r2, resource_categories))

    for t in r1_ktypes:
        num_per_type[t] += 1
        # Only touch `repeats` on a real match so unmatched categories get no key.
        if t in r2_ktypes:
            repeats[t] += 1

display(repeats)
display(num_per_type)

# Fraction of each category's occurrences that are followed by the same category.
ratios = {key: repeats[key] / num_per_type[key] for key in repeats}

for key in repeats:
    print(repeats[key], num_per_type[key])

pd.DataFrame(ratios, index=[0])

AI + edX

In [None]:
# get_durations loops over every row of every user frame in Python, so compute it
# once and reuse the result for both aggregations instead of running it twice.
all_durations = get_durations(user_urls_dfs)

resource_durations = get_resource_duration_sums(all_durations)
resource_counts = pd.concat(user_urls_dfs).display_name.value_counts()
resource_duration_avgs = get_resource_duration_avgs(all_durations)

# analyze_knowledge_types reads duration_avgs as a global, so refresh it first.
duration_avgs = dict(resource_duration_avgs)
transitions, percents, categories_over, resources_over = analyze_knowledge_types(user_urls_dfs)

# Present rows and columns in a fixed order regardless of the dict order used internally.
percents = percents.reindex(index=['All', 'factual', 'conceptual', 'procedural'],
                            columns=['factual', 'conceptual', 'procedural'])
percents