In [None]:
import dill
from utilities import *
import textstat

In [None]:
# Restore a previously saved interpreter session from 'user_and_course_dfs.db'.
# Presumably this is what defines user_urls_dfs / ai_edx_dfs used further down -- TODO confirm.
# NOTE(review): dill.load_session unpickles arbitrary objects and can run code
# while doing so; only load session files from a trusted source.
dill.load_session('user_and_course_dfs.db')

In [None]:
# Execute the other_graphing_utilities notebook inside this kernel.
# Presumably it supplies the plotting helpers/constants used below
# (save_or_display, save_plot, CMAP_2, N_TICKS, c, ...) -- TODO confirm.
%run other_graphing_utilities.ipynb

In [None]:
def filter_video_df(df):
    """Clean video metadata in place and score each transcript line.

    The frame is modified in place; nothing is returned.

    * Every ``'video_'`` substring is removed from ``video_id`` and every
      ``<digits>-`` occurrence is removed from ``section``.
    * For rows whose ``speech_times`` is a list, bracketed annotations
      (``[...]``) and ``>>`` markers are stripped from every line of
      ``transcript_en``. Lines that end up empty are dropped together with
      the speech-time entry at the same position, and the surviving lines
      are scored with ``textstat.dale_chall_readability_score``.
    * ``transcript_en`` and ``speech_times`` are overwritten with the
      filtered versions, and two columns are added: ``missed_data`` (the
      stripped fragments) and ``readability_scores`` (one score per
      surviving line). Rows without a ``speech_times`` list keep their
      original values and get empty lists in the two new columns.

    Args:
        df: DataFrame with ``video_id``, ``section``, ``transcript_en`` and
            ``speech_times`` columns.

    Raises:
        IndexError: if a row has more transcript lines than speech times.
    """
    # Be explicit about regex vs. literal: the default of ``regex`` differs
    # between pandas versions, and the section pattern must be a regex.
    df['video_id'] = df.video_id.str.replace('video_', '', regex=False)
    df['section'] = df.section.str.replace(r'\d+-', '', regex=True)

    bracket_regex = re.compile(r'\[.*\],*\s*')
    output_regex = re.compile(r'>>\s*')

    all_transcripts = []
    all_speech_times = []
    all_missed_data = []
    all_scores = []

    for _, row in df.iterrows():
        if not isinstance(row.speech_times, list):
            # No usable timing information: leave the row's data untouched.
            all_transcripts.append(row.transcript_en)
            all_speech_times.append(row.speech_times)
            all_missed_data.append([])
            all_scores.append([])
            continue

        missed_data = []
        filtered_lines = []
        filtered_speech_times = []

        for i, line in enumerate(row.transcript_en):
            brackets = bracket_regex.findall(line)
            outputs = output_regex.findall(line)
            missed_data.extend(brackets)
            missed_data.extend(outputs)

            # Remove each matched fragment once, in the order it was found.
            for exclude in brackets + outputs:
                line = line.replace(exclude, '', 1)

            if line:
                filtered_lines.append(line)
                filtered_speech_times.append(row.speech_times[i])

        scores = [textstat.dale_chall_readability_score(l) for l in filtered_lines]

        assert len(filtered_lines) == len(filtered_speech_times) == len(scores)

        all_transcripts.append(filtered_lines)
        all_speech_times.append(filtered_speech_times)
        all_scores.append(scores)
        all_missed_data.append(missed_data)

    assert len(all_missed_data) == len(all_scores) == df.shape[0]

    # Write the filtered data back as whole columns. Assigning to the row
    # yielded by iterrows() is not guaranteed to reach the frame (the row can
    # be a copy), which would leave speech_times out of step with the scores.
    df['transcript_en'] = all_transcripts
    df['speech_times'] = all_speech_times
    df['missed_data'] = all_missed_data
    df['readability_scores'] = all_scores
    
def get_video_info(video, sections_viewed):
    """Summarise how one video was watched, keyed by time edges.

    Looks ``video`` up in the module-level ``videos_with_transcript`` frame.
    The time axis is cut at every speech-time boundary, every start/end of a
    viewed section, and at 0; the sorted unique cut points are the "graph
    sections" (edges).

    Args:
        video: video id compared against ``videos_with_transcript.video_id``.
        sections_viewed: iterable of ``[start, end, stop_type]`` lists, with
            a fourth direction element when ``stop_type`` is ``'seek'``.

    Returns:
        ``None`` when the video has no row in ``videos_with_transcript``.
        Otherwise a 10-tuple ``(graph_sections, section_scores, views,
        start, stops, seek_forwards, seek_backwards, pauses,
        video_duration, unit)`` where the per-edge entries all have
        ``len(graph_sections)`` elements and ``section_scores`` holds ``-1``
        for edges not covered by any speech interval.
    """
    matching_rows = videos_with_transcript[videos_with_transcript.video_id == video]
    if not matching_rows.size:
        return

    speech_times = matching_rows.iloc[0].speech_times

    # Collect every candidate edge, then de-duplicate and order them.
    edge_candidates = list(np.array(speech_times).flatten())
    for viewed in sections_viewed:
        edge_candidates.extend(viewed[0:2])
    edge_candidates.append(0)
    graph_sections = sorted(set(edge_candidates))

    n_edges = len(graph_sections)
    views = np.zeros(n_edges)
    stops = np.zeros(n_edges)
    seek_forwards = np.zeros(n_edges)
    seek_backwards = np.zeros(n_edges)
    pauses = np.zeros(n_edges)

    for viewed in sections_viewed:
        view_start, view_end = viewed[0], viewed[1]

        # A viewed interval counts for every edge it covers; the last edge
        # never starts a bar, so it is skipped.
        for edge_idx, edge in enumerate(graph_sections[:-1]):
            if view_start <= edge < view_end:
                views[edge_idx] += 1

        # The event that ended the interval is tallied at the interval's end edge.
        end_idx = graph_sections.index(view_end)

        assert(end_idx != 0)

        stop_kind = viewed[2]
        if stop_kind == 'stop':
            stops[end_idx] += 1
        elif stop_kind == 'seek':
            direction = viewed[3]
            if direction == FORWARD:
                seek_forwards[end_idx] += 1
            elif direction == BACKWARD:
                seek_backwards[end_idx] += 1
            else:
                print("Invalid seek event!")
        elif stop_kind == 'pause':
            pauses[end_idx] += 1
        else:
            print("Invalid stop event!")

    # -1 marks "no transcript here". Real scores are asserted to be positive
    # below, so the sentinel cannot collide with one.
    section_scores = [-1] * n_edges

    complexity_scores = matching_rows['readability_scores'].iloc[0]

    assert all(score > 0 for score in complexity_scores)

    assert len(complexity_scores) <= len(graph_sections)

    # Each edge takes the score of the first speech interval that contains it.
    for edge_idx, edge in enumerate(graph_sections):
        for speech_idx, span in enumerate(speech_times):
            if span[0] <= edge <= span[1]:
                section_scores[edge_idx] = complexity_scores[speech_idx]
                break

    return (
        graph_sections, section_scores, views,
        matching_rows.start.iloc[0], stops, seek_forwards,
        seek_backwards, pauses, matching_rows.video_duration.iloc[0],
        matching_rows.unit.iloc[0],
    )

def get_spikes(spike_threshold):
    """Collect heavily viewed, scored sections across all aggregated videos.

    Iterates the module-level ``sections_aggregated`` mapping, builds one row
    per graph section (score, views, section bounds, video id, start and,
    when truthy, the video duration as ``end``) and keeps the rows whose view
    count exceeds ``spike_threshold`` and whose score is positive (sections
    without a transcript carry a non-positive sentinel score).

    Args:
        spike_threshold: a section is a spike when ``views > spike_threshold``.

    Returns:
        ``(elementary_spikes, harder_spikes)``: the spikes whose score is
        below ``ELEMENTARY`` and those at or above it. Both are empty
        DataFrames when no video produced any information.
    """
    columns = ['score', 'views', 'section_start', 'section_end', 'video', 'start']
    all_spikes = []

    for video, sections_viewed in sections_aggregated.items():
        video_info = get_video_info(video, sections_viewed)
        if video_info is None:
            # get_video_info found no row for this video.
            continue

        graph_sections, section_scores, views, start, _, _, _, _, end, _ = video_info

        df_len = len(section_scores)

        assert df_len == len(views) == len(graph_sections)

        # Pairing consecutive edges yields df_len - 1 sections; zip truncates
        # the longer per-edge lists accordingly (the last edge starts no section).
        score_views_df = pd.DataFrame(
            list(zip(section_scores, views,
                     graph_sections[:-1], graph_sections[1:],
                     [video] * df_len, [start] * df_len)),
            columns=columns)

        if end:
            score_views_df['end'] = end

        spikes = score_views_df[score_views_df['views'] > spike_threshold]

        # remove sections with no complexity score
        spikes = spikes[spikes['score'] > 0]

        all_spikes.append(spikes)

    if not all_spikes:
        # pd.concat raises ValueError on an empty list; report an empty result instead.
        print("elementary: " + str(0), "harder: " + str(0))
        empty = pd.DataFrame(columns=columns)
        return empty, empty.copy()

    all_spikes = pd.concat(all_spikes, sort=False)

    elementary_spikes = all_spikes[all_spikes['score'] < ELEMENTARY]
    harder_spikes = all_spikes[all_spikes['score'] >= ELEMENTARY]

    print("elementary: " + str(elementary_spikes.shape[0]), "harder: " + str(harder_spikes.shape[0]))
    return elementary_spikes, harder_spikes

def get_sections_viewed(data):
    """Turn each user's event log into the list of video intervals watched.

    For every pair of consecutive interesting events where a ``play_video``
    is immediately followed by a stop-type event on the same video id, a
    section ``[start, end, stop_type]`` is recorded. ``start`` is the play
    event's ``currentTime``; ``end`` is the stop event's ``currentTime`` or,
    when that key is absent, its ``old_time``. ``stop_type`` is the part of
    the event type before the first underscore. For seeks a fourth element
    (``FORWARD`` / ``BACKWARD``) gives the jump direction.

    Args:
        data: iterable of per-user DataFrames with ``event_type``, ``event``
            (JSON string), ``time`` and ``user`` columns.

    Returns:
        ``(sections_per_user, sections_aggregated, invalid_data)``:
        ``sections_per_user`` maps user -> {video id -> sections},
        ``sections_aggregated`` maps video id -> sections over all users,
        and ``invalid_data`` lists ``(play_row, stop_row, user)`` for pairs
        whose start is not before their end.

    Raises:
        AssertionError: if a user's filtered events are not sorted by time.
    """
    sections_per_user = collections.defaultdict(list)
    sections_aggregated = collections.defaultdict(list)

    invalid_data = []

    for u in data:
        u = u[u.event_type.isin(INTERESTING_EVENTS)]

        if not u.size:
            continue

        assert u.time.is_monotonic_increasing

        sections = collections.defaultdict(list)
        user = u.iloc[0].user

        for i in range(u.shape[0] - 1):
            r1 = u.iloc[i]
            r2 = u.iloc[i + 1]

            # Cheap type checks first so JSON is only parsed for candidate pairs.
            if r1.event_type != 'play_video' or r2.event_type not in STOP_EVENTS:
                continue

            r1_id = get_event_id(r1)
            if r1_id != get_event_id(r2):
                continue

            start = get_event_dict(r1)['currentTime']

            next_event_dict = get_event_dict(r2)
            try:
                end = next_event_dict['currentTime']
            except KeyError:
                # Stop events without currentTime report where playback was as old_time.
                end = next_event_dict['old_time']

            # Some seek_events are invalid
            if start >= end:
                invalid_data.append((r1, r2, user))
                continue

            stop_type = r2.event_type.split('_')[0]
            section = [start, end, stop_type]
            if stop_type == 'seek':
                # Direction is relative to where playback was when the seek
                # happened (``end``), not to where the section began: a rewind
                # that lands inside the watched interval is still backward.
                if next_event_dict['new_time'] > end:
                    section.append(FORWARD)
                else:
                    section.append(BACKWARD)

            sections[r1_id].append(section)
            sections_aggregated[r1_id].append(section)

        sections_per_user[user] = sections

    return sections_per_user, sections_aggregated, invalid_data
def graph_complexity(video, sections_viewed):
    """Draw a bar chart of views per graph section, coloured by readability.

    Each bar is one graph edge returned by ``get_video_info``; its colour is
    taken from ``CMAP_2`` normalised between the smallest positive score and
    the largest score, with under-range values (the "no score" sentinel)
    drawn black. Dashed red lines mark the edges closest to the video's
    ``start`` and, when truthy, to its duration. The figure is handed to
    ``save_or_display``. Returns ``None``; does nothing when
    ``get_video_info`` has no data for ``video``.
    """
    video_info = get_video_info(video, sections_viewed)
    if video_info is None:
        return

    graph_sections, section_scores, views, start, _, _, _, _, end, unit = video_info

    def nearest_edge_index(moment):
        # Position of the edge closest to ``moment`` (first one on ties).
        return min(range(len(graph_sections)),
                   key=lambda pos: abs(graph_sections[pos] - moment))

    lowest = min(score for score in section_scores if score > 0)
    highest = max(section_scores)
    norm = plt.Normalize(lowest, highest)
    CMAP_2.set_under('black')
    bar_colors = CMAP_2(norm(section_scores))

    mappable = plt.cm.ScalarMappable(cmap=CMAP_2, norm=norm)
    mappable.set_array([])

    ticks = np.linspace(lowest, highest, num=N_TICKS)

    cbar = plt.colorbar(mappable, ticks=ticks)
    cbar.set_ticklabels(list(map(str, ticks)))
    cbar.ax.set_ylabel('Readability Score')

    views_by_time = pd.DataFrame(zip(graph_sections, views), columns=['time', 'views'])
    views_by_time = views_by_time.set_index('time')

    ax = views_by_time['views'].plot.bar(
        color=bar_colors, align='edge', width=1, ec='white', figsize=(40, 5))

    # Bars sit at integer positions, so markers are placed by edge index.
    ax.axvline(x=nearest_edge_index(start), color='r', linestyle='--')

    if end:
        ax.axvline(x=nearest_edge_index(end), color='r', linestyle='--')

    ax.set(xlabel='time', ylabel='# views', title=video + " unit " + unit)

    save_or_display('video views for ' + video + ' unit ' + unit)
    
def graph_stop_events(video, sections_viewed):
    """Plot stacked counts of stop / seek / pause events in 5-unit time bins.

    Per-edge event counts from ``get_video_info`` are grouped into bins of
    width 5 along the time axis and drawn as a stacked bar chart. Each bin's
    x tick label is coloured by the highest readability score among the
    edges inside it (black when nothing in the bin has a score), and a colour
    bar explains the scale. The figure is handed to ``save_or_display``.
    Returns ``None``; does nothing when ``get_video_info`` has no data.
    """
    video_info = get_video_info(video, sections_viewed)
    if video_info is None:
        return

    graph_sections, section_scores, views, start, stops, \
    seek_forwards, seek_backwards, pauses, end, unit = video_info

    df = pd.DataFrame(zip(graph_sections, stops, seek_forwards, seek_backwards, pauses),
                      columns=['time', 'stops', 'seek_forwards',
                               'seek_backwards', 'pauses']).set_index('time')

    interval = 5
    max_edge = max(graph_sections)

    # np.arange excludes its stop value, so always extend by one interval.
    # Extending only when max_edge is not a multiple of the interval loses
    # the final bin (and its events) whenever max_edge is an exact multiple.
    bins = np.arange(0, max_edge + interval, interval)

    # observed=False keeps empty bins so bars, tick labels and the per-bin
    # colours computed below stay aligned one-to-one.
    df = df.groupby(pd.cut(df.index, bins), observed=False).sum()
    ax = df.plot(kind='bar', stacked=True, title=video + ' unit ' + unit, ec='white', figsize=(20, 5))

    binned_complexities = []
    for bin_idx in range(len(bins) - 1):
        bin_start, bin_end = bins[bin_idx], bins[bin_idx + 1]
        section_complexities = [
            section_scores[edge_idx]
            for edge_idx in range(len(graph_sections))
            if bin_start <= graph_sections[edge_idx] < bin_end
        ]
        if section_complexities:
            binned_complexities.append(max(section_complexities))
        else:
            # Same sentinel get_video_info uses for "no score".
            binned_complexities.append(-1)

    min_score = min(score for score in binned_complexities if score > 0)
    max_score = max(binned_complexities)
    norm = plt.Normalize(min_score, max_score)
    CMAP_2.set_under('black')
    colors = CMAP_2(norm(binned_complexities))
    sm = plt.cm.ScalarMappable(cmap=CMAP_2, norm=norm)
    sm.set_array([])

    ticks = np.linspace(min_score, max_score, num=N_TICKS)
    tick_labels = [str(t) for t in ticks]

    cbar = plt.colorbar(sm, ticks=ticks)
    cbar.set_ticklabels(tick_labels)
    cbar.ax.set_ylabel('Max Readability Score')

    for label, color in zip(ax.get_xticklabels(which='both'), colors):
        label.set_color(color)

    save_or_display('stop events for ' + video + ' unit ' + unit)

def get_event_id(row):
    """Return the ``'id'`` entry of the event payload decoded from ``row``."""
    payload = get_event_dict(row)
    return payload['id']

def get_event_dict(row):
    """Decode the JSON text stored in ``row.event`` and return the result."""
    payload = json.loads(row.event)
    return payload

def get_all_starts(data):
    """Collect the playback position at which each video was first played.

    Only ``play_video`` and ``load_video`` events are considered. Whenever a
    ``load_video`` is directly followed by a ``play_video`` carrying the same
    event id, the play event's ``currentTime`` is recorded for that id.

    Args:
        data: iterable of per-user DataFrames with ``event_type`` and
            ``event`` (JSON string) columns.

    Returns:
        ``defaultdict(list)`` mapping event id -> list of start positions.
    """
    all_starts = collections.defaultdict(list)

    for user_events in data:
        relevant = user_events[user_events.event_type.isin(['play_video', 'load_video'])]

        for pos in range(1, relevant.shape[0]):
            loaded = relevant.iloc[pos - 1]
            played = relevant.iloc[pos]
            video_id = get_event_id(loaded)

            if loaded.event_type != 'load_video' or played.event_type != 'play_video':
                continue
            if not video_id == get_event_id(played):
                continue

            all_starts[video_id].append(get_event_dict(played)['currentTime'])

    return all_starts

def get_seeks_info(sections_aggregated):
    """Summarise forward vs. backward seeks over all videos and print stats.

    For every video with information available from ``get_video_info`` the
    per-edge seek counts are summed, giving per-video totals and per-video
    percentages of forward / backward seeks (0 for a video with no seeks).

    Printed, in order: overall forward/backward ratio; medians of the
    per-video totals; medians of the per-video percentages; and the share of
    invalid play/stop pairs, ``len(invalid_data) / (len(invalid_data) +
    number of sections)``. ``invalid_data`` is read from module scope. Any
    statistic whose denominator is zero, or whose input list is empty, is
    printed as ``nan`` instead of raising.

    Args:
        sections_aggregated: mapping video id -> list of viewed sections.

    Returns:
        ``(all_seek_forwards, all_seek_backwards, all_seek_forwards_ls,
        all_seek_backwards_ls, all_seek_forwards_ratios,
        all_seek_backwards_ratios)``.
    """
    nan = float('nan')

    all_seek_forwards = 0
    all_seek_backwards = 0
    all_seek_forwards_ls = []
    all_seek_backwards_ls = []
    all_seek_forwards_ratios = []
    all_seek_backwards_ratios = []

    for video, sections_viewed in sections_aggregated.items():
        video_info = get_video_info(video, sections_viewed)
        if video_info is None:
            # No row for this video in the transcript table.
            continue

        # Positions 5 and 6 of the info tuple are the per-edge seek counts.
        seek_forwards, seek_backwards = video_info[5], video_info[6]

        summed_forwards = sum(seek_forwards)
        summed_backwards = sum(seek_backwards)
        summed_total = summed_forwards + summed_backwards

        all_seek_forwards += summed_forwards
        all_seek_backwards += summed_backwards

        all_seek_forwards_ls.append(summed_forwards)
        all_seek_backwards_ls.append(summed_backwards)

        all_seek_forwards_ratios.append(100*(summed_forwards / summed_total) if summed_total else 0)
        all_seek_backwards_ratios.append(100*(summed_backwards / summed_total) if summed_total else 0)

    print(all_seek_forwards / all_seek_backwards if all_seek_backwards else nan)
    print(np.median(all_seek_forwards_ls) if all_seek_forwards_ls else nan,
          np.median(all_seek_backwards_ls) if all_seek_backwards_ls else nan)
    print(np.median(all_seek_forwards_ratios) if all_seek_forwards_ratios else nan,
          np.median(all_seek_backwards_ratios) if all_seek_backwards_ratios else nan)

    sa_len = sum(len(value) for value in sections_aggregated.values())

    invalid_len = len(invalid_data)
    pair_total = invalid_len + sa_len
    print(invalid_len / pair_total if pair_total else nan)

    return all_seek_forwards, all_seek_backwards, all_seek_forwards_ls, \
           all_seek_backwards_ls, all_seek_forwards_ratios, all_seek_backwards_ratios
    
def get_difficult_sections(sections_aggregated, score_threshold=9):
    """Print, per video, the graph sections whose readability score is high.

    For each video with information available from ``get_video_info`` a
    table with one row per graph section is built (score, views, section
    bounds, video id, start); the rows with ``score >= score_threshold`` are
    printed together with the video id when there are any.

    Args:
        sections_aggregated: mapping video id -> list of viewed sections.
        score_threshold: minimum score for a section to be reported
            (defaults to 9, the value previously hard-coded).

    Returns:
        None.
    """
    for video, sections_viewed in sections_aggregated.items():
        video_info = get_video_info(video, sections_viewed)
        if video_info is None:
            continue

        graph_sections, section_scores, views, start = video_info[:4]
        n_edges = len(views)

        # Scores and views are per graph edge, so the section bounds must
        # come from consecutive graph edges too. Taking them from the raw
        # viewed intervals pairs each score with an unrelated interval.
        score_views_df = pd.DataFrame(
            list(zip(section_scores, views,
                     graph_sections[:-1], graph_sections[1:],
                     [video] * n_edges, [start] * n_edges)),
            columns=['score', 'views', 'section_start',
                     'section_end', 'video', 'start'])

        shouldnt_be_this_hard = score_views_df[score_views_df['score'] >= score_threshold]

        if shouldnt_be_this_hard.size:
            print(video, shouldnt_be_this_hard)
            
def graph_dist_seeks(forwards_ratios, backwards_ratios, no_title=False):
    """Plot a stacked histogram of per-video forward / backward seek shares.

    Args:
        forwards_ratios: one forward-seek percentage per video.
        backwards_ratios: one backward-seek percentage per video.
        no_title: when true the plot is drawn with an empty title.

    The figure is written via ``save_plot`` under the name
    "Distribution of Seeks". Returns ``None``.
    """
    title = "" if no_title else "Distribution of forward seeks and backward seeks per video"

    by_direction = pd.DataFrame([forwards_ratios, backwards_ratios],
                                index=['forwards', 'backwards'])
    seeks_df = by_direction.T

    ax = seeks_df.plot(kind='hist', stacked=True, ec='white')
    ax.set(title=title, xlabel='Percentage of Total Seeks For Each Video')
    save_plot("Distribution of Seeks", graph_type="")
    
def plot_spikes(sections_aggregated):
    """Plot a histogram of per-section view counts greater than 2.

    View counts are gathered from ``get_video_info`` for every video that
    has information available; counts of 2 or fewer are left out.

    Args:
        sections_aggregated: mapping video id -> list of viewed sections.
    """
    spike_view_counts = []

    for video, sections_viewed in sections_aggregated.items():
        video_info = get_video_info(video, sections_viewed)
        if video_info is None:
            continue

        # Only the per-edge view counts (third entry) are needed here.
        _, _, views, _, _, _, _, _, _, _ = video_info
        spike_view_counts.extend(views[views > 2])

    views_hist_df = pd.DataFrame(spike_view_counts)
    ax = views_hist_df.plot(kind='hist', bins=80, ec='white', figsize=(20, 5))
    ax.set_xlabel('Views')
    
def plot_start_times(all_starts):
    """Plot, per video, a histogram of where students started playback.

    Start positions are truncated to whole numbers and binned with one bin
    per unit of the observed range (at least one bin). When the video has a
    row in the module-level ``videos_with_transcript`` frame, a dashed red
    line marks its ``start`` value on the time axis. Each figure is handed
    to ``save_or_display``.

    Args:
        all_starts: mapping video id -> list of start positions.
    """
    for video, starts in all_starts.items():
        video_row = videos_with_transcript[videos_with_transcript.video_id == video]

        starts = [int(i) for i in starts]
        s = pd.Series(starts).sort_values()

        bins = max(max(starts) - min(starts), 1)

        ax = s.plot(kind='hist', ec='white', title=video, bins=bins)
        ax.set_xlabel('start time')

        start_row = video_row.start
        if start_row.size:
            # The histogram's x axis is the start time itself, so the marker
            # goes at the time value. Looking up its position inside
            # ``starts`` puts the line in the wrong place and raises
            # ValueError when nobody started at exactly that time.
            actual_start = int(start_row.iloc[0])
            ax.axvline(x=actual_start, color='r', linestyle='--')

        save_or_display('start times for students for video ' + video)
        
def plot_dist_of_starts(all_starts):
    """Plot how far from the official start students began each video.

    For every video that has a row in the module-level
    ``videos_with_transcript`` frame, each recorded start position
    (truncated to an int) has the video's ``start`` value subtracted from
    it. The pooled offsets are drawn as a 23-bin histogram, the smallest
    offset is printed, and the figure is handed to ``save_or_display``.

    Args:
        all_starts: mapping video id -> list of start positions.
    """
    offsets = []

    for video, starts in all_starts.items():
        matching = videos_with_transcript[videos_with_transcript.video_id == video]
        official_starts = matching.start

        if not official_starts.size:
            continue

        official = int(official_starts.iloc[0])
        offsets.extend(int(position) - official for position in starts)

    offsets_df = pd.DataFrame(offsets)
    print(min(offsets))
    offsets_df.plot(kind='hist', ec='white', bins=23)

    save_or_display("Distribution of when users start video relative to the actual start of the video")

In [None]:
# Read the video metadata from VIDEO_JSON, transpose it (presumably so that each
# row is one video -- TODO confirm) and drop the 'speech_period' column.
videos_df = pd.read_json(VIDEO_JSON, orient='records').T.drop('speech_period', axis=1)
# Cleans ids/sections and adds the readability columns; mutates videos_df in place.
filter_video_df(videos_df)
all_video_ids = list(videos_df.video_id)

In [None]:
# Keep only the videos whose speech_times is not missing. The functions
# above read this frame as a global.
videos_with_transcript = videos_df[~videos_df.speech_times.isna()]

In [None]:
# Build the viewed sections from user_urls_dfs. sections_aggregated and
# invalid_data are also read as globals by get_spikes / get_seeks_info.
sections_per_user, sections_aggregated, invalid_data = get_sections_viewed(user_urls_dfs)

In [None]:
# Start positions and seek statistics for the user_urls_dfs data
# (get_seeks_info also prints its summary numbers).
all_starts = get_all_starts(user_urls_dfs)
all_seek_forwards, all_seek_backwards, all_seek_forwards_ls, \
all_seek_backwards_ls, all_seek_forwards_ratios, \
all_seek_backwards_ratios = get_seeks_info(sections_aggregated)

In [None]:
# Print the sections whose readability score is 9 or higher, per video.
get_difficult_sections(sections_aggregated)

In [None]:
# NOTE(review): `c` is not defined in this notebook; presumably a config object
# from the graphing utilities whose to_file flag makes save_or_display show
# figures instead of writing them -- TODO confirm.
c.to_file = False
# Draw the views chart and the stop-events chart for every aggregated video.
for video, sections_viewed in sections_aggregated.items():
    graph_complexity(video, sections_viewed)
    graph_stop_events(video, sections_viewed)

In [None]:

# Histogram of per-video seek percentages, drawn without a title.
graph_dist_seeks(all_seek_forwards_ratios, all_seek_backwards_ratios, no_title=True)

In [None]:
# Histogram of per-section view counts above 2.
plot_spikes(sections_aggregated)

In [None]:
# One start-time histogram per video.
plot_start_times(all_starts)

In [None]:
# Pooled histogram of start offsets relative to each video's official start.
plot_dist_of_starts(all_starts)

In [None]:
# Repeat the analysis for ai_edx_dfs. This rebinds sections_per_user,
# sections_aggregated and invalid_data, so the results from user_urls_dfs
# above are overwritten from here on.
sections_per_user, sections_aggregated, invalid_data = get_sections_viewed(ai_edx_dfs)

In [None]:
# Start positions and seek statistics for the ai_edx_dfs data. The starts must
# come from the same data set as the sections computed in the previous cell;
# using user_urls_dfs here would make the later start-offset plot repeat the
# first cohort's results.
all_starts = get_all_starts(ai_edx_dfs)
all_seek_forwards, all_seek_backwards, all_seek_forwards_ls, \
all_seek_backwards_ls, all_seek_forwards_ratios, \
all_seek_backwards_ratios = get_seeks_info(sections_aggregated)

In [None]:
# Same per-video charts as the earlier graphing cell, now for the sections
# most recently assigned to sections_aggregated.
# NOTE(review): `c` is presumably the graphing config object -- TODO confirm.
c.to_file = False
for video, sections_viewed in sections_aggregated.items():
    graph_complexity(video, sections_viewed)
    graph_stop_events(video, sections_viewed)

In [None]:
# Histogram of per-video seek percentages, this time with the default title.
graph_dist_seeks(all_seek_forwards_ratios, all_seek_backwards_ratios)

In [None]:
# Histogram of per-section view counts above 2 for the current sections_aggregated.
plot_spikes(sections_aggregated)

In [None]:
# Pooled histogram of start offsets for the most recently computed all_starts.
plot_dist_of_starts(all_starts)