<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Choose-a-Topic" data-toc-modified-id="Choose-a-Topic-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Choose a Topic</a></span></li><li><span><a href="#Build-a-Lexicon" data-toc-modified-id="Build-a-Lexicon-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Build a Lexicon</a></span></li><li><span><a href="#Find-A-Set-of-Videos-To-Analyze" data-toc-modified-id="Find-A-Set-of-Videos-To-Analyze-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Find A Set of Videos To Analyze</a></span></li><li><span><a href="#Plot-Timelines" data-toc-modified-id="Plot-Timelines-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Plot Timelines</a></span></li></ul></div>

In [None]:
from esper.prelude import *
from esper.widget import *
from esper.topics import *
from esper.spark_util import *

from esper.plot_timeline import VideoRow, VideoSegment, plot_video_timelines
from datetime import timedelta
from collections import defaultdict, Counter, OrderedDict
import _pickle as pickle

# Choose a Topic

In [None]:
# Topic phrase for this analysis. Later cells use it to build the lexicon,
# to name the pickle cache file under /tmp, and (upper-cased) to search
# captions for exact mentions.
topic = 'abortion'

# Build a Lexicon

In [None]:
# Build the lexicon for the chosen topic with mutual_info (brought into scope
# by one of the star imports at the top of the notebook).
# NOTE(review): later cells read entry[0] as the word and slice the lexicon
# with [:20], so it is presumably an ordered sequence of (word, ...) entries
# -- confirm against mutual_info.
lexicon = mutual_info(topic)
lexicon  # last expression of the cell: display the lexicon

# Find A Set of Videos To Analyze

In [None]:
# Find segments matching the lexicon (window_size=500, threshold=100, with
# overlapping segments merged) and pickle the result to /tmp, keyed by topic.
# The following cell reloads the same file, presumably so this search can be
# skipped on later runs.
merged_segments = find_segments(lexicon, window_size=500, threshold=100, merge_overlaps=True)
with open('/tmp/topic-{}.pkl'.format(topic), 'wb') as f:
    pickle.dump(merged_segments, f)

In [None]:
# Reload the segments that the previous cell pickled for this topic.
# NOTE(review): pickle.load can execute arbitrary code from the file, and
# /tmp is usually writable by other users -- only load a cache you wrote.
with open('/tmp/topic-{}.pkl'.format(topic), 'rb') as f:
    merged_segments = pickle.load(f)

Examine the 10 videos with the most total time spent on the topic.

In [None]:
# Rank videos by total time spent on the topic (longest first) and keep ten.
topic_time_by_video = get_topic_time_by_video(merged_segments)
top_10_videos = sorted(
    topic_time_by_video.items(),
    key=lambda entry: entry[1].total_seconds(),
    reverse=True
)[:10]

# Each key is indexable: position 0 feeds the video-id set and position 1 the
# sub-path set (meanings taken from the variable names).
top_10_video_ids = set(key[0] for key, _ in top_10_videos)
top_10_sub_paths = set(key[1] for key, _ in top_10_videos)

In [None]:
# Display only the segments whose first field (the video id) belongs to one
# of the top-10 videos.
show_segments(
    segment for segment in merged_segments
    if segment[0] in top_10_video_ids
)

# Plot Timelines

In [None]:
# Topics to compare against the main topic on the timelines.
related_topics = ['supreme court', 'gay marriage', 'obamacare']

# One lexicon per related topic, in list order; the main topic is added last
# and reuses the lexicon that was already computed for it.
topic_to_lexicon = dict(
    (related, mutual_info(related)) for related in related_topics
)
topic_to_lexicon[topic] = lexicon

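Gather the per-video data needed for the timelines (commercials, on-screen face genders, exact caption mentions, and topic segments), then build the plots below.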

In [None]:
# Materialize the top-10 video ids as a list; the cells below pass it to the
# dataframe `isin` filters and to the Video `id__in` query.
video_ids = list(top_10_video_ids)

In [None]:
# Load the commercials and face-gender dataframes.
commercials = get_commercials()
face_genders = get_face_genders()

# Gender id -> gender name, read from the Gender table.
gender_map = dict((gender.id, gender.name) for gender in Gender.objects.all())

# Exact mentions: caption search for the upper-cased topic; only the first
# element of the result is kept.
video_id_to_mentions = caption_search([topic.upper()])[0]

# Face intervals per (video id, gender name), restricted to the selected
# videos, to faces with host_probability < 0.8 and gender probability > 0.95.
video_id_to_face_genders = defaultdict(list)
in_selected_videos = face_genders.video_id.isin(video_ids)
unlikely_host = face_genders.host_probability < 0.8
confident_gender = face_genders.probability > 0.95
face_gender_rows = (
    face_genders
    .where(in_selected_videos & unlikely_host & confident_gender)
    .select('video_id', 'gender_id', 'min_frame', 'max_frame')
    .collect()
)
for face_row in face_gender_rows:
    face_key = (face_row['video_id'], gender_map[face_row['gender_id']])
    video_id_to_face_genders[face_key].append(
        (face_row['min_frame'], face_row['max_frame'])
    )

# Commercial frame ranges per video id, for the selected videos only.
video_id_to_commercials = defaultdict(list)
commercial_rows = (
    commercials
    .where(commercials.video_id.isin(video_ids))
    .select('video_id', 'min_frame', 'max_frame')
    .collect()
)
for commercial_row in commercial_rows:
    video_id_to_commercials[commercial_row['video_id']].append(
        (commercial_row['min_frame'], commercial_row['max_frame'])
    )

In [None]:
# Group unmerged, low-threshold segments by (video id, topic) for every topic
# lexicon, searching only the documents of the top-10 videos.
video_id_and_topics_to_segments = defaultdict(list)
for topic_name, topic_lexicon in topic_to_lexicon.items():
    topic_segments = find_segments(
        topic_lexicon,
        window_size=500,
        threshold=10,
        merge_overlaps=False,
        docs=list(top_10_sub_paths)
    )
    for segment in topic_segments:
        # Segments are 5-field records; only the leading video id is needed
        # to pick the bucket.
        video_id, _, _, _, _ = segment
        video_id_and_topics_to_segments[(video_id, topic_name)].append(segment)

In [None]:
def plot_helper(videos, main_topic, threshold=50, show_legend=True):
    """Plot one timeline row per video with topic, face and commercial data.

    Reads the notebook-level lookups built in earlier cells:
    video_id_and_topics_to_segments, video_id_to_commercials,
    video_id_to_face_genders, video_id_to_mentions, topic_to_lexicon and
    topic.

    Args:
        videos: objects exposing `.id` and `.fps`; each becomes one VideoRow.
        main_topic: topic whose segments are drawn as the 'non-commercial'
            segments of every row.
        threshold: minimum segment score for a segment to be drawn as a
            '<topic> score >= <threshold>' interval.
        show_legend: forwarded to plot_video_timelines.
    """
    women_label = 'woman on screen (excl. hosts)'
    men_label = 'man on screen (excl. hosts)'
    # The mentions lookup is built from the notebook-level `topic`, so the
    # label names that topic rather than `main_topic`.
    mention_label = '{} mentioned'.format(topic)
    # (topic, interval label) pairs in topic_to_lexicon order; the same
    # labels key both the per-row intervals and the color map.
    score_labels = [
        (t, '{} score >= {}'.format(t, threshold)) for t in topic_to_lexicon
    ]

    def frames_to_intervals(frame_ranges, fps):
        """Convert (min_frame, max_frame) pairs to (start, end) timedeltas."""
        return [
            (timedelta(seconds=a / fps), timedelta(seconds=b / fps))
            for a, b in frame_ranges
        ]

    rows = []
    for video in videos:
        # Topic segments: interval endpoints are passed to timedelta as
        # seconds; the score is divided by 250 and capped at 1.
        vid_segments = [
            VideoSegment(
                start_time=timedelta(seconds=a),
                end_time=timedelta(seconds=b),
                display_label='non-commercial',
                display_value=min(1., val / 250.)
            )
            for _, _, (a, b), val, _ in
            video_id_and_topics_to_segments[(video.id, main_topic)]
        ]

        # Commercial segments: frame numbers converted to time via the fps.
        vid_segments.extend(
            VideoSegment(
                start_time=start,
                end_time=end,
                display_label='commercial',
                display_value=1.
            )
            for start, end in frames_to_intervals(
                video_id_to_commercials[video.id], video.fps
            )
        )

        interval_labels = OrderedDict([
            (women_label, frames_to_intervals(
                video_id_to_face_genders[(video.id, 'F')], video.fps)),
            (men_label, frames_to_intervals(
                video_id_to_face_genders[(video.id, 'M')], video.fps)),
        ])
        for t, label in score_labels:
            interval_labels[label] = [
                (timedelta(seconds=a), timedelta(seconds=b))
                for _, _, (a, b), val, _ in
                video_id_and_topics_to_segments[(video.id, t)]
                if val >= threshold
            ]

        rows.append(VideoRow(
            video,
            segments=vid_segments,
            # Draw some intervals on all of the videos
            interval_labels=interval_labels,
            discrete_labels={
                # Each mention is marked at the midpoint of its (a, b) span.
                mention_label: [
                    timedelta(seconds=(a + b) / 2)
                    for a, b in video_id_to_mentions.get(video.id, [])
                ]
            }
        ))

    interval_color_map = {
        women_label: 'Orange',
        men_label: 'Blue'
    }
    interval_colors = ['Red', 'Violet', 'Green', 'Pink', 'Cyan']
    for i, (_, label) in enumerate(score_labels):
        # Wrap around the palette so having more topics than colors reuses
        # colors instead of raising IndexError.
        interval_color_map[label] = interval_colors[i % len(interval_colors)]

    plot_video_timelines(
        rows,
        interval_label_color_map=interval_color_map,
        discrete_label_shape_map={
            mention_label: 'o'
        },
        show_legend=show_legend,
        max_length=timedelta(seconds=3600 * 2),
        min_y_margin=1500
    )

# Fetch the selected videos and plot them in batches of `increment` rows;
# only the first batch shows the legend.
videos = list(Video.objects.filter(id__in=video_ids))
increment = 5
for batch_start in range(0, len(videos), increment):
    batch = videos[batch_start:batch_start + increment]
    plot_helper(batch, topic, show_legend=(batch_start == 0))

In [None]:
def lexicon_jaccard_sim(l1, l2):
    """Return the Jaccard similarity of the word sets of two lexicons.

    Args:
        l1, l2: iterables of entries whose first element is the word.

    Returns:
        len(intersection) / len(union) of the two word sets, or 1.0 when
        both lexicons are empty (two empty sets are identical), which
        previously raised ZeroDivisionError.
    """
    l1_words = {entry[0] for entry in l1}
    l2_words = {entry[0] for entry in l2}
    union = l1_words | l2_words
    if not union:
        return 1.0
    return len(l1_words & l2_words) / len(union)

# Print the lexicon similarity for every unordered pair of distinct topics;
# a pair is reported only when the first name sorts after the second, so
# each pair appears once and self-pairs are skipped.
for name_a, lexicon_a in topic_to_lexicon.items():
    for name_b, lexicon_b in topic_to_lexicon.items():
        if name_a > name_b:
            similarity = lexicon_jaccard_sim(lexicon_a, lexicon_b)
            print(name_a, ',', name_b, ',', similarity)

In [None]:
# Preview each topic: its name, then the first 20 entries of its lexicon.
for topic_name, topic_lexicon in topic_to_lexicon.items():
    print(topic_name, topic_lexicon[:20], sep='\n')