In [None]:
import dill

In [None]:
# Restore a previously saved interpreter session from 'user_and_course_dfs.db'.
# NOTE(review): dill deserialisation can execute arbitrary code (like pickle) —
# only load session files that come from a trusted source.
# Later cells read names that are not defined in this notebook (e.g.
# user_urls_dfs, ai_edx_dfs); presumably they are restored here — confirm.
dill.load_session('user_and_course_dfs.db')

In [None]:
from utilities import *
from itertools import combinations 
from Levenshtein import distance

In [None]:
%run heatmap_utilities.ipynb
%run other_graphing_utilities.ipynb

In [None]:
# NOTE(review): `c` is not defined in this notebook; presumably it is a shared
# configuration object provided by the imports / %run cells above, and
# to_file=False keeps figures from being written to disk — confirm.
c.to_file = False

In [None]:
def get_levenshtein_dist_resources(u1, u2):
    """Return the Levenshtein distance between two users' resource sequences.

    A user's sequence is the list of distinct ``display_name`` values in the
    order they first appear in that user's frame (``Series.unique`` order).

    Each resource is encoded as exactly ONE character, so one edit operation
    corresponds to one resource. Joining decimal ids instead is ambiguous:
    ids ``1, 2`` and the single id ``12`` would both become ``"12"``.

    Side effect: when the distance is 0, ``(user, n_resources)`` of ``u2`` is
    appended (once) to the list stored under ``(user, n_resources)`` of ``u1``
    in ``get_levenshtein_dist_resources.similarities``. That dict is created
    on first call and is never cleared by this function.

    Parameters
    ----------
    u1, u2 : pandas.DataFrame
        Frames with ``user`` and ``display_name`` columns. The user id is
        read from the first row of ``user``.

    Returns
    -------
    int
        Number of resource insertions/deletions/substitutions separating the
        two sequences.

    Raises
    ------
    AssertionError
        If both frames start with the same user id.
    """
    if not hasattr(get_levenshtein_dist_resources, 'similarities'):
        get_levenshtein_dist_resources.similarities = collections.defaultdict(list)

    id_u1 = u1.user.iloc[0]
    id_u2 = u2.user.iloc[0]
    assert (id_u1 != id_u2)

    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    resources_used = pd.concat([u1.display_name, u2.display_name]).unique()

    # One code point per resource. The UTF-16 surrogate range is skipped
    # because those code points are not valid standalone characters.
    symbol_for = {
        name: chr(idx if idx < 0xD800 else idx + 0x800)
        for idx, name in enumerate(resources_used)
    }

    resources_u1 = "".join(symbol_for[r] for r in u1.display_name.unique())
    resources_u2 = "".join(symbol_for[r] for r in u2.display_name.unique())

    dist = distance(resources_u1, resources_u2)

    if dist == 0:
        # Store which users are similar. With one character per resource,
        # len(...) is the number of distinct resources the user touched.
        key1 = (id_u1, len(resources_u1))
        key2 = (id_u2, len(resources_u2))

        if key2 not in get_levenshtein_dist_resources.similarities[key1]:
            get_levenshtein_dist_resources.similarities[key1].append(key2)

    return dist


def get_levenshtein_distances(data, lev_func=get_levenshtein_dist_resources):
    """Apply ``lev_func`` to every unordered pair of frames in ``data``.

    Parameters
    ----------
    data : sequence of pandas.DataFrame
        Frames with a ``user`` column, indexable by position.
        NOTE(review): positions ``0 .. n_users-1`` of ``data`` are paired with
        the unique user ids in order of first appearance, which assumes one
        frame per user in matching order — confirm against the caller.
    lev_func : callable, optional
        ``lev_func(frame_i, frame_j)`` returning the distance for one pair.

    Returns
    -------
    Whatever ``get_2d_vals`` returns for the user labels and the ``i < j``
    pairs (callers in this notebook unpack it as ``(labels, matrix)``).
    """
    users = list(pd.concat(data).user.unique())

    # combinations() already yields each pair once with i < j, so neither a
    # set() nor an ``i != j`` filter is needed; keeping it a list also makes
    # the evaluation order (and thus lev_func's side effects) deterministic.
    pairs = list(combinations(range(len(users)), 2))

    def pair_distance(frames, i, j):
        return lev_func(frames[i], frames[j])

    return get_2d_vals(data, users, pair_distance, pairs)


def get_levenshtein_dist_types(user1, user2):
    """Return the Levenshtein distance between two users' knowledge-type sequences.

    The distinct ``display_name`` values of each user are mapped to knowledge
    types with ``get_knowledge_types_used(..., resource_categories)``; every
    distinct type (compared as a tuple) is encoded as exactly ONE character so
    that one edit operation corresponds to one type. Joining decimal ids
    instead is ambiguous (ids ``1, 2`` vs. id ``12`` both give ``"12"``).

    Parameters
    ----------
    user1, user2 : pandas.DataFrame
        Frames with a ``display_name`` column.

    Returns
    -------
    int
        Edit distance between the two encoded type sequences.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    resources_used = pd.concat([user1.display_name, user2.display_name]).unique()

    # NOTE(review): presumably every type a single user can produce is also
    # produced for the combined resource list; otherwise the lookup below
    # raises KeyError (same as before this change) — confirm.
    types_used = get_knowledge_types_used(resources_used, resource_categories)

    # One code point per type; skip the UTF-16 surrogate range.
    symbol_for = {
        tuple(t): chr(idx if idx < 0xD800 else idx + 0x800)
        for idx, t in enumerate(types_used)
    }

    type_strs = []

    for u in (user1, user2):
        u_types = get_knowledge_types_used(u.display_name.unique(), resource_categories)
        type_strs.append("".join(symbol_for[tuple(t)] for t in u_types))

    return distance(type_strs[0], type_strs[1])


def get_levenshtein_distances_types(data):
    """Pairwise distances over ``data`` using the knowledge-type comparison.

    Thin wrapper: forwards ``data`` to ``get_levenshtein_distances`` with
    ``get_levenshtein_dist_types`` as the pair function and returns that
    result unchanged.
    """
    type_distances = get_levenshtein_distances(
        data,
        lev_func=get_levenshtein_dist_types,
    )
    return type_distances

def get_2d_vals(data, display_names, func, loop_pairs):
    """Fill a square matrix with ``func(data, i, j)`` for the given index pairs.

    Parameters
    ----------
    data : object
        Passed through untouched as the first argument of ``func``.
    display_names : sized
        Labels; only its length is used here (it sets the matrix size).
    func : callable
        Called as ``func(data, i, j)``; its result is added to cell ``[i, j]``.
    loop_pairs : iterable of (int, int)
        Cells to evaluate. A pair that occurs more than once accumulates.

    Returns
    -------
    tuple
        ``(display_names, matrix)`` where ``matrix`` is a float array that is
        zero everywhere except the evaluated cells.
    """
    size = len(display_names)
    matrix = np.zeros((size, size))

    for row, col in loop_pairs:
        matrix[row, col] += func(data, row, col)

    return display_names, matrix
def broken_y_bar_histogram_df(flattened, bins):
    """Histogram ``flattened`` on integer bin edges and return it as a DataFrame.

    Parameters
    ----------
    flattened : array-like
        1-D values to bin.
    bins : int or sequence
        Forwarded to ``np.histogram`` to obtain the initial bin edges.

    Returns
    -------
    pandas.DataFrame
        Indexed by the integer left edge of each non-empty bin (index name
        ``0``), with the count in column ``1``. The last row is the right-most
        edge with a count of 0, which closes the final bin.
    """
    # First pass: only numpy's bin edges for `bins` are needed.
    raw_edges = np.histogram(flattened, bins=bins)[1]

    # Snap edges to integers without shrinking the covered range: interior and
    # left edges round down, the right-most edge rounds up. Truncating the last
    # edge with int() would silently drop samples above floor(max).
    edges = [int(np.floor(edge)) for edge in raw_edges[:-1]]
    edges.append(int(np.ceil(raw_edges[-1])))

    # Second pass: counts on the integer edges.
    counts = np.histogram(flattened, bins=edges)[0]

    # Snapping can repeat an edge, which creates zero-width bins whose count
    # is 0. Drop every empty bin together with its left edge.
    rows = [(left, count) for left, count in zip(edges, counts) if count]
    rows.append((edges[-1], 0))

    return pd.DataFrame(rows).set_index(0)

In [None]:
# Pairwise Levenshtein distances over user_urls_dfs, computed twice: once on
# resources (the default pair function) and once on knowledge types.
# Each result is unpacked into (labels, distance matrix).
resource_levenshtein_distances_names, resource_levenshtein_distances = get_levenshtein_distances(user_urls_dfs)
type_levenshtein_distances_names, type_levenshtein_distances = get_levenshtein_distances_types(user_urls_dfs)

In [None]:
## Massive Heatmap. Do not accidentally run.

# plot_2d_values_heatmap(user_urls_dfs, func=get_levenshtein_distances, 
#                        xlabel="User A", ylabel="User B", title_name="All Users",
#                        figsize=(400, 400), font_scale=2.5,
#                        unit="Levenshtein Distance", quantile=False,
#                        fig_size_inches=(140, 180), dpi=300, transpose=True)

In [None]:
# Heatmap of knowledge-type Levenshtein distances, restricted to the first
# n_users frames of user_urls_dfs to keep the figure readable.
n_users = 50
plot_2d_values_heatmap(user_urls_dfs[0:n_users], func=get_levenshtein_distances_types, 
                       xlabel="User A", ylabel="User B", title_name="Sample {} Users".format(n_users),
                       figsize=(10, 10), font_scale=1,
                       unit="Levenshtein Distance (Knowledge Type)", quantile=False)

In [None]:
# Heatmap of resource Levenshtein distances, restricted to the first n_users
# frames of user_urls_dfs to keep the figure readable.
n_users = 50
plot_2d_values_heatmap(user_urls_dfs[0:n_users], func=get_levenshtein_distances, 
                       xlabel="User A", ylabel="User B", title_name="Sample {} Users".format(n_users),
                       figsize=(10, 10), font_scale=1,
                       unit="Levenshtein Distance", quantile=False)

In [None]:
# Take the strict upper triangle (k=1 skips the diagonal) of the type-distance
# matrix so every user pair contributes exactly one value, then histogram it.
lev_dist_by_type_flat = type_levenshtein_distances[np.triu_indices(len(type_levenshtein_distances), k=1)]
dists_type_df = broken_y_bar_histogram_df(lev_dist_by_type_flat, 70)

# Bar chart with a broken y-axis; `lims` gives the y-range of each axis segment.
# The y-label spelling is corrected ("Occurrences").
plot_broken_y_bar(
    dists_type_df, lims=[(0, 500), (700, 9000)],
    xlabel="Levenshtein Distance (Type)", ylabel="Occurrences",
    ylabel_loc=(-6, 570), figsize=(10, 10), breakline_len=.01,
    width=1, align='edge'
)

In [None]:
# Take the strict upper triangle (k=1 skips the diagonal) of the
# resource-distance matrix so every user pair contributes exactly one value,
# then histogram it.
lev_dist_by_resource_flat = resource_levenshtein_distances[np.triu_indices(len(resource_levenshtein_distances), k=1)]
dists_df = broken_y_bar_histogram_df(lev_dist_by_resource_flat, 70)

# Bar chart with a broken y-axis; `lims` gives the y-range of each axis segment.
# The y-label spelling is corrected ("Occurrences").
plot_broken_y_bar(
    dists_df, lims=[(0, 1100), (2000, 7100)],
    xlabel="Levenshtein Distance (Resource)", ylabel="Occurrences",
    ylabel_loc=(-6, 570), figsize=(10, 5), breakline_len=.01,
    width=1, align='edge'
)

In [None]:
# Run the resource comparison over user_urls_dfs so that
# get_levenshtein_dist_resources.similarities is populated for the cells below.
# The trailing semicolon stops Jupyter from rendering the returned
# labels/matrix tuple, which is very large for the full user set.
get_levenshtein_distances(user_urls_dfs);

In [None]:
# Show the zero-distance pairs recorded by get_levenshtein_dist_resources:
# a mapping from (user, length) keys to the list of (user, length) keys whose
# resource sequence compared equal. The attribute only exists once that
# function has been called at least once.
display(get_levenshtein_dist_resources.similarities)

In [None]:
# Report, for every user that has at least one zero-distance match, the
# resources they used (with each resource's position in resource_order) and
# the users they match.

# Minimum value of the second key element (the sequence length stored with
# the user) for a user to be included in the report.
threshold_resources = 0

grouped_user = pd.concat(user_urls_dfs, sort=False).groupby(by='user')

# Group-key lookup table: lets us test membership and fetch one user's rows
# directly instead of rescanning every group for each lookup.
known_users = grouped_user.groups

# Largest resource_order position seen across all reported users.
max_all_indices = 0

for user, similarities in get_levenshtein_dist_resources.similarities.items():
    if user[1] < threshold_resources:
        continue

    print("user: " + user[0], "\nresources:")

    if user[0] in known_users:
        max_index = 0

        for r in grouped_user.get_group(user[0]).display_name.unique():
            try:
                index = resource_order.index(r)
            except ValueError:
                # Resource is not part of resource_order; it does not
                # contribute to the maximum.
                print(r + ", index not found")
            else:
                print(r + ", index: " + str(index))
                max_index = max(max_index, index)

        print("Max Index: " + str(max_index))

        max_all_indices = max(max_all_indices, max_index)

    print("\nsimilarities: ")

    for s in similarities:
        # Only list matches that are present in the grouped data.
        if s[0] in known_users:
            print(s[0])

    print('\n--------------------------------------------------\n')

print("Max of all indices = " + str(max_all_indices))
    

In [None]:
# Recompute both distance matrices, this time for ai_edx_dfs.
# NOTE: this rebinds the same names that held the user_urls_dfs results, so
# any cell reading them gives different results depending on execution order.
# NOTE: get_levenshtein_dist_resources.similarities is never reset, so
# zero-distance pairs found here are added to those recorded earlier.
resource_levenshtein_distances_names, resource_levenshtein_distances = get_levenshtein_distances(ai_edx_dfs)
type_levenshtein_distances_names, type_levenshtein_distances = get_levenshtein_distances_types(ai_edx_dfs)

In [None]:
# Take the strict upper triangle (k=1 skips the diagonal) of the current
# resource-distance matrix so every user pair contributes exactly one value,
# then histogram it.
lev_dist_by_resource_flat = resource_levenshtein_distances[np.triu_indices(len(resource_levenshtein_distances), k=1)]
dists_df = broken_y_bar_histogram_df(lev_dist_by_resource_flat, 70)

# Bar chart with a broken y-axis; `lims` gives the y-range of each axis segment.
# NOTE(review): the limits are identical to the earlier resource histogram —
# confirm they suit this dataset.
# The y-label spelling is corrected ("Occurrences").
plot_broken_y_bar(
    dists_df, lims=[(0, 1100), (2000, 7100)],
    xlabel="Levenshtein Distance (Resource)", ylabel="Occurrences",
    ylabel_loc=(-6, 570), figsize=(10, 5), breakline_len=.01,
    width=1, align='edge'
)