<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Enter-a-Topic" data-toc-modified-id="Enter-a-Topic-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Enter a Topic</a></span></li><li><span><a href="#Build-a-Lexicon" data-toc-modified-id="Build-a-Lexicon-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Build a Lexicon</a></span></li><li><span><a href="#Search-for-Segments" data-toc-modified-id="Search-for-Segments-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Search for Segments</a></span></li><li><span><a href="#Visualize-Video-Timelines" data-toc-modified-id="Visualize-Video-Timelines-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Visualize Video Timelines</a></span></li><li><span><a href="#Validation" data-toc-modified-id="Validation-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Validation</a></span><ul class="toc-item"><li><span><a href="#Assert-No-Double-Counting" data-toc-modified-id="Assert-No-Double-Counting-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Assert No Double Counting</a></span></li><li><span><a href="#Sensitivity-of-Total-Segment-Length-to-Window-Size" data-toc-modified-id="Sensitivity-of-Total-Segment-Length-to-Window-Size-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Sensitivity of Total Segment Length to Window Size</a></span></li><li><span><a href="#Sensitivity-of-Total-Segment-Length-to-Threshold" data-toc-modified-id="Sensitivity-of-Total-Segment-Length-to-Threshold-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Sensitivity of Total Segment Length to Threshold</a></span></li><li><span><a href="#Overlap-Between-Topics" data-toc-modified-id="Overlap-Between-Topics-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>Overlap Between Topics</a></span></li></ul></li><li><span><a href="#Analysis" data-toc-modified-id="Analysis-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Analysis</a></span><ul class="toc-item"><li><span><a href="#Topic-by-Show" 
data-toc-modified-id="Topic-by-Show-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Topic by Show</a></span><ul class="toc-item"><li><span><a href="#Topic-by-Show-By-Year" data-toc-modified-id="Topic-by-Show-By-Year-6.1.1"><span class="toc-item-num">6.1.1&nbsp;&nbsp;</span>Topic by Show By Year</a></span></li><li><span><a href="#Topic-by-Show-By-Quarter" data-toc-modified-id="Topic-by-Show-By-Quarter-6.1.2"><span class="toc-item-num">6.1.2&nbsp;&nbsp;</span>Topic by Show By Quarter</a></span></li></ul></li><li><span><a href="#Multitopic-Comparison" data-toc-modified-id="Multitopic-Comparison-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Multitopic Comparison</a></span></li></ul></li></ul></div>

In [None]:
from esper.prelude import *
from esper.widget import *
from esper.topics import *
from esper.spark_util import *

from esper.timeline_plot import VideoRow, VideoSegment, plot_video_timelines
from datetime import timedelta
from collections import defaultdict, Counter, OrderedDict
import _pickle as pickle

# Enter a Topic

In [None]:
# Topic keyword for the whole notebook: it seeds the lexicon, names the
# pickle cache file and appears in the plot labels below.
topic = 'syria'

# Build a Lexicon

In [None]:
# Build the lexicon for the topic with mutual_info; the bare name on the last
# line displays it as the cell output.
lexicon = mutual_info(topic)
lexicon

# Search for Segments

In [None]:
# Search for segments with the lexicon, then cache the result under /tmp in a
# pickle file named after the topic so that a later cell can reload it.
# NOTE(review): merge_overlaps=False presumably leaves overlapping windows as
# separate segments -- confirm against find_segments.
segments = find_segments(lexicon, window_size=500, threshold=10., merge_overlaps=False)
with open('/tmp/topic-{}.pkl'.format(topic), 'wb') as f:
    pickle.dump(segments, f)

In [None]:
# Show only the high-scoring segments. Index 3 of a segment tuple is its score
# (the timeline cell below unpacks the same position as `score`). Note the
# strict `> 50` here, whereas the timeline cell filters with `>= threshold`.
show_segments([x for x in segments if x[3] > 50])

In [None]:
# Reload the cached segments for this topic from the pickle file written by
# the search cell above. pickle.load can execute arbitrary code, so only load
# cache files that this notebook produced itself.
with open('/tmp/topic-{}.pkl'.format(topic), 'rb') as f:
    segments = pickle.load(f)

# Visualize Video Timelines

In [None]:
# Minimum segment score for a segment to count toward a video's topic time
# and to be drawn as a labelled interval (plot_helper also draws a
# 2 * threshold tier).
threshold = 50

# Exact mentions: search the captions for the upper-cased topic and keep
# element 0 of the result. plot_helper reads it with .get(video_id, []) and
# unpacks each entry as a pair.
video_id_to_mentions = caption_search([topic.upper()])[0]

# Get videos with most topic time.
# Every segment is kept per video, but only segments scoring >= threshold add
# their length (interval[1] - interval[0]) to the total used for ranking.
# NOTE(review): the segments above were searched with merge_overlaps=False, so
# overlapping segments may each add their full length here -- confirm this is
# acceptable for ranking purposes.
video_id_to_segments = defaultdict(list)
video_id_to_total_segment_time = Counter()
for segment in segments:
    video_id, _, interval, score, _ = segment
    video_id_to_segments[video_id].append(segment)
    if score >= threshold:
        video_id_to_total_segment_time[video_id] += interval[1] - interval[0]

# Get the face genders and commercials dataframes
# (plot_helper filters both with .where(...).select(...).collect()).
commercials = get_commercials()
face_genders = get_face_genders()

# Gender id -> gender name; plot_helper uses it to key faces by gender name.
gender_map = { x.id : x.name for x in Gender.objects.all() }

        
def plot_helper(video_ids):
    """Draw one annotated timeline per video in ``video_ids``.

    Each row contains:
      * the video's topic segments, with a display value of ``score / 250``
        capped at 1, followed by its commercial segments;
      * interval tracks for topic scores >= ``threshold`` and
        >= ``2 * threshold``;
      * interval tracks for women and men on screen, restricted to faces with
        gender probability > 0.95 and host probability < 0.8;
      * a discrete marker at the midpoint of every exact topic mention.

    Reads the notebook globals ``topic``, ``threshold``, ``face_genders``,
    ``commercials``, ``gender_map``, ``video_id_to_segments`` and
    ``video_id_to_mentions``.

    Args:
        video_ids: ids of the videos to plot.
    """
    # Build every label once so the rows and the style maps share their keys.
    score_label = '{} score >= {}'.format(topic, threshold)
    double_score_label = '{} score >= {}'.format(topic, 2 * threshold)
    women_label = 'woman on screen (excl. hosts)'
    men_label = 'man on screen (excl. hosts)'
    mention_label = '{} mentioned'.format(topic)

    def to_time_intervals(frame_intervals, video):
        """Convert (min_frame, max_frame) pairs to timedelta pairs via video.fps."""
        return [
            (timedelta(seconds=lo / video.fps), timedelta(seconds=hi / video.fps))
            for lo, hi in frame_intervals
        ]

    # (video id, gender name) -> frame ranges of the faces passing the filters.
    face_rows = face_genders.where(
        (face_genders.video_id.isin(video_ids)) &
        (face_genders.host_probability < 0.8) &
        (face_genders.probability > 0.95)
    ).select('video_id', 'gender_id', 'min_frame', 'max_frame').collect()
    frames_by_video_gender = defaultdict(list)
    for face_row in face_rows:
        key = (face_row['video_id'], gender_map[face_row['gender_id']])
        frames_by_video_gender[key].append(
            (face_row['min_frame'], face_row['max_frame'])
        )

    # video id -> frame ranges of its commercials.
    commercial_rows = commercials.where(
        commercials.video_id.isin(video_ids)
    ).select('video_id', 'min_frame', 'max_frame').collect()
    commercial_frames_by_video = defaultdict(list)
    for commercial_row in commercial_rows:
        commercial_frames_by_video[commercial_row['video_id']].append(
            (commercial_row['min_frame'], commercial_row['max_frame'])
        )

    rows = []
    for video in Video.objects.filter(id__in=video_ids):
        # (start, end, score) of every topic segment found in this video.
        topic_intervals = [
            (start, end, score)
            for _, _, (start, end), score, _ in video_id_to_segments[video.id]
        ]

        # Topic segments come first ...
        vid_segments = [
            VideoSegment(
                start_time=timedelta(seconds=start),
                end_time=timedelta(seconds=end),
                display_label='non-commercial',
                display_value=min(1., score / 250.)
            ) for start, end, score in topic_intervals
        ]
        # ... followed by the commercial segments, converted from frames to time.
        vid_segments += [
            VideoSegment(
                start_time=begin,
                end_time=finish,
                display_label='commercial',
                display_value=1.
            ) for begin, finish in to_time_intervals(
                commercial_frames_by_video[video.id], video)
        ]

        rows.append(VideoRow(
            video,
            segments=vid_segments,
            # Interval tracks drawn on every video, in this order.
            interval_labels=OrderedDict([
                (score_label, [
                    (timedelta(seconds=start), timedelta(seconds=end))
                    for start, end, score in topic_intervals
                    if score >= threshold
                ]),
                (double_score_label, [
                    (timedelta(seconds=start), timedelta(seconds=end))
                    for start, end, score in topic_intervals
                    if score >= 2 * threshold
                ]),
                (women_label, to_time_intervals(
                    frames_by_video_gender[(video.id, 'F')], video)),
                (men_label, to_time_intervals(
                    frames_by_video_gender[(video.id, 'M')], video))
            ]),
            discrete_labels={
                # One marker at the midpoint of each exact mention.
                mention_label: [
                    timedelta(seconds=(a + b) / 2)
                    for a, b in video_id_to_mentions.get(video.id, [])
                ]
            }
        ))

    plot_video_timelines(
        rows,
        interval_label_color_map={
            score_label: 'Red',
            double_score_label: 'DarkRed',
            women_label: 'Orange',
            men_label: 'Blue'
        },
        discrete_label_shape_map={
            mention_label: 'o'
        },
        max_length=timedelta(seconds=3600 * 3)
    )

# Rank videos by their total time in segments scoring >= threshold, most first.
sorted_ids = sorted(video_id_to_segments.keys(),
                    key=lambda x: -video_id_to_total_segment_time[x])
num_buckets = 20
num_videos = 5

# Number of rank positions between the starts of consecutive buckets.
bucket_stride = len(sorted_ids) // num_buckets

# Plot `num_videos` videos from the start of each of `num_buckets` evenly
# spaced positions in the ranking.
for i in range(num_buckets):
    start_idx = i * bucket_stride
    video_ids = sorted_ids[start_idx:start_idx + num_videos]
    if not video_ids:
        # No videos at all: nothing to plot.
        break
    print('{}th percentile of {} time [{}, {}]'.format(
        int(100 - (i * 100 / num_buckets)),
        # Report the real end of the slice, which can be shorter than
        # num_videos near the end of the ranking.
        topic, start_idx, start_idx + len(video_ids))
    )
    plot_helper(video_ids)
    if bucket_stride == 0:
        # Fewer videos than buckets: every bucket would start at index 0 and
        # re-plot the same videos, so plot them once and stop.
        break

# Validation

In [None]:
# Report the total topic coverage: the value returned by
# get_total_segment_length is converted from seconds to hours.
print('Coverage of "{}": {:0.2f} hrs'.format(topic, get_total_segment_length(segments).total_seconds() / 60 / 60))

## Assert No Double Counting
Double counting might happen if more than one transcript file is loaded for the same video.

In [None]:
# Run the double-counting check over the topic segments loaded above.
check_for_double_counting(segments)

## Sensitivity of Total Segment Length to Window Size

We are interested in the stability of the total segment runtime when window size is varied. A low variation indicates that the algorithm is not sensitive to the choice of the window size parameter.

In [None]:
# Sweep the window size with the same lexicon; the list includes 500, the
# window size used for the main segment search above.
plot_total_segment_length_vs_window_size(
    lexicon,
    window_sizes=[10, 50, 100, 250, 500, 1000]
)

## Sensitivity of Total Segment Length to Threshold

We are interested in the stability of the total segment runtime when the threshold is varied. A low variation indicates that the algorithm is not sensitive to the choice of the threshold parameter.

In [None]:
# Sweep the score threshold with the same lexicon; the list includes 10 (the
# threshold passed to the main segment search) and 50 (the `threshold` used
# for the video timelines).
plot_total_segment_length_vs_threshold(
    lexicon, 
    thresholds=[5, 10, 25, 50, 75, 100, 200]
)

## Overlap Between Topics

Some topics are subtopics of another topic. For instance, we expect "affordable care act" to be a subtopic of "healthcare". This section prints out the segment overlap between topics.

In [None]:
# Topics to compare against `topic` in the overlap computation of the next
# cell, split into ones listed as related and ones listed as unrelated.
related_topics = ['isis', 'terrorism', 'middle east', 'islam']
unrelated_topics = ['baseball', 'healthcare', 'taxes']

In [None]:
# Compare the main topic against both the related and the unrelated topics.
topics = [topic] + related_topics + unrelated_topics
assert len(topics) > 1
# Pass the list that was just checked instead of rebuilding it, so the
# assertion and the call cannot drift apart.
topic_overlap = get_overlap_between_topics(
    topics,
    window_size=250
)
topic_overlap

# Analysis

## Topic by Show

In [None]:
# Aggregate the topic segments per show, then plot the result for this topic.
topic_time_by_show = get_topic_time_by_show(segments)
plot_topic_time_by_show(topic, topic_time_by_show)

### Topic by Show By Year

In [None]:
# Called without `quarters`, i.e. with the function's default time grouping
# (the section heading describes this as the by-year view).
plot_topic_by_show_over_time(topic, segments)

### Topic by Show By Quarter

In [None]:
# Same plot as the previous cell, but with quarters=True.
plot_topic_by_show_over_time(topic, segments, quarters=True)

## Multitopic Comparison

In [None]:
# Additional topics plotted alongside `topic` in the comparison cells below.
topics_to_compare = ['healthcare', 'election', 'email', 'immigration']

In [None]:
# The main topic first, followed by the topics it is compared against.
topics = [topic] + topics_to_compare
assert len(topics) > 1

def plot_topic_comparison_by_show(topics, window_size=250, threshold=50):
    """Plot the per-show topic time of several topics together.

    For each topic, in order, a lexicon is built with mutual_info, segments are
    searched with the given window size and threshold, and the segments are
    aggregated per show. All topics are then drawn in a single plot.

    NOTE: the next cell defines another function with this same name
    (different defaults, plus per-year plots). Once that cell has run, it
    replaces this definition in the kernel.

    Args:
        topics: topic names to compare.
        window_size: window size passed to find_segments.
        threshold: score threshold passed to find_segments.
    """
    per_topic_times = [
        get_topic_time_by_show(
            find_segments(
                mutual_info(name),
                window_size=window_size,
                threshold=threshold
            )
        )
        for name in topics
    ]
    plot_topic_time_by_show(topics, per_topic_times)

plot_topic_comparison_by_show(topics)

In [None]:
def plot_topic_comparison_by_show(topics, years=range(2015, 2018), 
                                  window_size=100, threshold=33):
    """Plot per-show topic time for several topics, overall and per year.

    Segments are searched once per topic (lexicon from mutual_info, then
    find_segments). The first plot covers all segments and is normalized by
    total runtime. After that, one plot is drawn per entry of ``years``,
    restricted to the date range from January 1 of that year to January 1 of
    the following year; these plots are not normalized by total runtime.

    NOTE: this reuses the name of the simpler function defined in the previous
    cell and replaces it in the kernel once this cell has run.

    Args:
        topics: topic names to compare.
        years: iterable of years to plot individually, or None to skip the
            per-year plots.
        window_size: window size passed to find_segments.
        threshold: score threshold passed to find_segments.
    """
    segments_per_topic = [
        find_segments(
            mutual_info(name),
            window_size=window_size,
            threshold=threshold
        )
        for name in topics
    ]

    print('All coverage')
    overall_times = [get_topic_time_by_show(segs) for segs in segments_per_topic]
    plot_topic_time_by_show(
        topics,
        overall_times,
        normalize_by_total_runtime=True
    )

    if years is None:
        return

    for year in years:
        print('Coverage in {}'.format(year))
        year_start = '{}-01-01'.format(year)
        next_year_start = '{}-01-01'.format(year + 1)
        yearly_times = [
            get_topic_time_by_show(segs, date_range=[year_start, next_year_start])
            for segs in segments_per_topic
        ]
        plot_topic_time_by_show(
            topics,
            yearly_times,
            normalize_by_total_runtime=False
        )

# Rebuilt here so this cell does not rely on the previous comparison cell.
topics = [topic] + topics_to_compare
assert len(topics) > 1
plot_topic_comparison_by_show(topics)

In [None]:
# Collect face-gender and commercial frame ranges for the top videos through
# the Django ORM.
#
# `top_ids` is not assigned in any visible cell of this notebook, so this cell
# raised NameError on a fresh kernel (Restart & Run All).
# NOTE(review): it is derived here from the ranking built for the timelines,
# i.e. the `num_videos` videos with the most topic time. This is presumably
# the intended set -- adjust the slice if a different one was meant.
top_ids = sorted_ids[:num_videos]

# (video id, gender name) -> list of (min_frame, max_frame) of the face's shot,
# restricted to 'M'/'F' labels with probability > 0.95.
video_id_to_face_genders = defaultdict(list)
for face_gender in FaceGender.objects.filter(
             gender__name__in=['M', 'F'],
             face__shot__video__id__in=top_ids,
             probability__gt=0.95
         ).values('face__shot__video__id', 'gender__name', 'face__shot__min_frame', 'face__shot__max_frame'):
    video_id_to_face_genders[(face_gender['face__shot__video__id'], face_gender['gender__name'])].append(
        (face_gender['face__shot__min_frame'], face_gender['face__shot__max_frame'])
    )

# video id -> list of (min_frame, max_frame) of its commercials.
video_id_to_commercials = defaultdict(list)
for commercial in Commercial.objects.filter(video__id__in=top_ids).values('video__id', 'min_frame', 'max_frame'):
    video_id_to_commercials[commercial['video__id']].append((commercial['min_frame'], commercial['max_frame']))