<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Videos" data-toc-modified-id="Videos-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Videos</a></span><ul class="toc-item"><li><span><a href="#All-videos" data-toc-modified-id="All-videos-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>All videos</a></span></li><li><span><a href="#Videos-by-channel" data-toc-modified-id="Videos-by-channel-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Videos by channel</a></span></li><li><span><a href="#Videos-by-show" data-toc-modified-id="Videos-by-show-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Videos by show</a></span></li><li><span><a href="#Videos-by-canonical-show" data-toc-modified-id="Videos-by-canonical-show-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Videos by canonical show</a></span></li><li><span><a href="#Videos-by-time-of-day" data-toc-modified-id="Videos-by-time-of-day-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Videos by time of day</a></span></li></ul></li><li><span><a href="#Faces" data-toc-modified-id="Faces-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Faces</a></span><ul class="toc-item"><li><span><a href="#Face-validation" data-toc-modified-id="Face-validation-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Face validation</a></span></li><li><span><a href="#All-faces" data-toc-modified-id="All-faces-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>All faces</a></span></li></ul></li><li><span><a href="#Genders" data-toc-modified-id="Genders-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Genders</a></span><ul class="toc-item"><li><span><a href="#All-genders" data-toc-modified-id="All-genders-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>All genders</a></span></li><li><span><a href="#Gender-by-channel" data-toc-modified-id="Gender-by-channel-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Gender by channel</a></span></li><li><span><a 
href="#Gender-by-show" data-toc-modified-id="Gender-by-show-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Gender by show</a></span></li><li><span><a href="#Gender-by-canonical-show" data-toc-modified-id="Gender-by-canonical-show-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Gender by canonical show</a></span></li><li><span><a href="#Gender-by-time-of-day" data-toc-modified-id="Gender-by-time-of-day-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>Gender by time of day</a></span></li><li><span><a href="#Gender-by-day-of-week" data-toc-modified-id="Gender-by-day-of-week-3.6"><span class="toc-item-num">3.6&nbsp;&nbsp;</span>Gender by day of week</a></span></li><li><span><a href="#Gender-by-topic" data-toc-modified-id="Gender-by-topic-3.7"><span class="toc-item-num">3.7&nbsp;&nbsp;</span>Gender by topic</a></span></li></ul></li></ul></div>

In [None]:
import shutil
import IPython
from esper.validation import *
from esper.spark_util import *
from esper.prelude import *
from esper.widget import *
%matplotlib inline

In [None]:
# Load the shows table, then print the returned object (labelled 'Schema:') and its count().
shows = get_shows()
print('Schema:', shows)
print('Count:', shows.count())

In [None]:
# Load the videos table; `videos` is read later by video_stats() and the face-time computation.
videos = get_videos()
print('Schema:', videos)
print('Count:', videos.count())

In [None]:
# Load the faces table; `faces` is used below for the total face count and on-screen time.
faces = get_faces()
print('Schema:', faces)
print('Count:', faces.count())

In [None]:
# Load the face/gender table; this is the input that gender_stats() filters and aggregates.
face_genders = get_face_genders() 
print('Schema:', face_genders)
print('Count:', face_genders.count())

In [None]:
# Load the face-identity table and report its count().
# NOTE(review): `face_identities` is not referenced again in the visible part of the notebook.
face_identities = get_face_identities()
print('Schema:', face_identities)
print('Count:', face_identities.count())

In [None]:
# Load the commercials table; its summed duration is subtracted from total video time below.
commercials = get_commercials()
print('Schema:', commercials)
print('Count:', commercials.count())

In [None]:
# Load the segments table and report its count().
# NOTE(review): `segments` is not referenced again in the visible part of the notebook.
segments = get_segments()
print('Schema:', segments)
print('Count:', segments.count())

In [None]:
# Load the segment/topic table; used in the "Gender by topic" section.
segment_topics = get_segment_topics()
print('Schema:', segment_topics)
print('Count:', segment_topics.count())

In [None]:
# interval_overlap_join(
#     face_genders.where(face_genders.video_id == 5), 
#     segment_topics.where(segment_topics.video_id == 5))

# Videos

In [None]:
def format_time(seconds, padding=4):
    """Render a duration given in seconds as an ``H:MM:SS`` string.

    The hours field is zero-padded to ``padding`` digits; minutes and seconds
    are always two digits. Each component is truncated with ``int``.
    """
    hours = int(seconds/3600)
    minutes = int(seconds/60 % 60)
    secs = int(seconds % 60)
    # First pass fills in the hours width (e.g. '{:04d}:{:02d}:{:02d}'),
    # second pass fills in the actual values.
    template = '{{:0{}d}}:{{:02d}}:{{:02d}}'.format(padding)
    return template.format(hours, minutes, secs)

def format_number(n):
    """Format a number, or a list/tuple of numbers, with a magnitude word.

    Values below 10**6 are expressed in thousands, below 10**9 in millions,
    below 10**12 in billions, and everything else in trillions, always with
    two decimals (e.g. 1500 -> '1.50 thousand').

    Args:
        n: a single number, or a list/tuple of numbers.

    Returns:
        A formatted string, or a list of formatted strings when ``n`` is a
        list or tuple.
    """
    # Local import so the helper does not depend on `math` leaking in through
    # one of the notebook's star-imports.
    import math

    # (exclusive upper bound as a power of ten, suffix). A value in a bucket
    # is divided by 10**(bound - 3).
    scales = [(6, 'thousand'), (9, 'million'), (12, 'billion'), (15, 'trillion')]

    def fmt(value):
        magnitude = abs(value)
        # log10 is undefined for 0 and negatives; use the absolute value and
        # let zero fall into the smallest bucket.
        log = math.log10(magnitude) if magnitude > 0 else 0.0
        # Default to the largest bucket so values >= 10**15 still format
        # instead of failing on a missing bucket.
        exponent, suffix = scales[-1]
        for bound, name in scales:
            if log < bound:
                exponent, suffix = bound, name
                break
        return '{:.2f} {}'.format(value / float(10**(exponent-3)), suffix)

    if isinstance(n, (list, tuple)):
        return list(map(fmt, n))
    return fmt(n)

def show_df(table, ordering, clear=True):
    """Show ``table`` as a qgrid widget restricted to the ``ordering`` columns.

    Args:
        table: data accepted by ``pd.DataFrame`` (the notebook passes a list
            of dicts).
        ordering: list of column names to display, in display order.
        clear: when true, clear the cell's existing output first.

    Returns:
        The widget returned by ``qgrid.show_grid``.
    """
    if clear:
        IPython.display.clear_output()
    # qgrid is imported lazily, only when a table is actually displayed.
    import qgrid
    frame = pd.DataFrame(table)
    selected = frame[ordering]
    return qgrid.show_grid(selected)
        
def format_hour(h):
    """Convert a 24-hour clock hour to a 12-hour label such as '3 PM'.

    Midnight (0) is rendered as '12 AM' and noon (12) as '12 PM'; the
    previous implementation labelled noon as '12 AM' and midnight as '0 AM'.

    Args:
        h: integer hour of the day, expected in the range 0-23.

    Returns:
        A string of the form '<hour> AM' or '<hour> PM'.
    """
    suffix = 'AM' if h < 12 else 'PM'
    hour12 = h % 12
    if hour12 == 0:
        # 0 and 12 both display as "12" on a 12-hour clock.
        hour12 = 12
    return '{} {}'.format(hour12, suffix)

def video_stats(key, labels):
    """Aggregate video durations, optionally grouped by a column of ``videos``.

    Args:
        key: name of the ``videos`` column to group by, or None to aggregate
            the whole table into a single row (looked up under id 0).
        labels: list of dicts with 'id' and 'name'; 'id' is matched against
            the grouping value and 'name' becomes the row's label.

    Returns:
        A list of dicts with keys 'label', 'count', 'duration' and
        'avg_duration', one per label that has a matching aggregate row.
    """
    aggregates = [
        func.count('duration'),
        func.avg('duration'),
        func.sum('duration'),
        func.stddev_pop('duration'),
    ]
    if key is None:
        rows = videos.agg(*aggregates).collect()
    else:
        rows = videos.groupBy(key).agg(videos[key], *aggregates).collect()

    # Index the aggregate rows by grouping value; the ungrouped case has a
    # single row stored under the sentinel id 0.
    rmap = {}
    for r in rows:
        rmap[0 if key is None else r[key]] = r

    def summarize(label):
        row = rmap[label['id']]
        mean = format_time(int(row['avg(duration)']))
        spread = format_time(int(row['stddev_pop(duration)']), padding=0)
        return {
            'label': label['name'],
            'count': row['count(duration)'],
            'duration': format_time(int(row['sum(duration)'])),
            'avg_duration': '{} (σ = {})'.format(mean, spread),
        }

    return [summarize(label) for label in labels if not key or label['id'] in rmap]

# Column order used when displaying the rows produced by video_stats().
video_ordering = ['label', 'count', 'duration', 'avg_duration']

# Distinct hour values extracted from Video.time, in ascending order.
hours = list(
    Video.objects
    .annotate(hour=Extract('time', 'hour'))
    .distinct('hour')
    .order_by('hour')
    .values_list('hour', flat=True)
)

## All videos

In [None]:
# Whole-dataset statistics: key=None yields a single aggregate row, which
# video_stats() stores under id 0, hence the single label with id 0.
show_df(
    video_stats(None, [{'id': 0, 'name': 'whole dataset'}]),
    video_ordering)

## Videos by channel

In [None]:
# Group by channel_id; Channel rows supply the id -> name labels.
show_df(
    video_stats('channel_id', list(Channel.objects.all().values('id', 'name'))),
    video_ordering)

## Videos by show

In [None]:
# Group by show_id; Show rows supply the id -> name labels.
show_df(
    video_stats('show_id', list(Show.objects.all().values('id', 'name'))),
    video_ordering)

## Videos by canonical show

In [None]:
# Group by canonical_show_id; CanonicalShow rows supply the id -> name labels.
show_df(
    video_stats('canonical_show_id', list(CanonicalShow.objects.all().values('id', 'name'))),
    video_ordering)

## Videos by time of day

In [None]:
# Group by the `hour` column; labels are the distinct hours found above, rendered by format_hour().
show_df(
    video_stats('hour', [{'id': hour, 'name': format_hour(hour)} for hour in hours]),
    video_ordering)


# Faces

## Face validation

In [None]:
# Run face validation twice: once with an identity filter (all faces) and once
# restricted to faces whose bounding-box height (bbox_y2 - bbox_y1) is >= 0.2.
# NOTE(review): the second label says "> 0.2" but the filter is `height__gte`, i.e. >= 0.2.
base_face_stats = face_validation('All faces', lambda x: x)
big_face_stats = face_validation(
    'Faces height > 0.2', lambda qs: qs.annotate(height=F('bbox_y2') - F('bbox_y1')).filter(height__gte=0.2))

In [None]:
# Shot-level precision/recall factors applied to every count interval below.
# NOTE(review): presumably measured for the shot detector — confirm the source of 0.97.
shot_precision = 0.97
shot_recall = 0.97

def face_error_interval(n, face_stats):
    """Return a ``[lower, upper]`` interval around the raw count ``n``.

    Args:
        n: raw count to adjust.
        face_stats: 3-item sequence whose first two items are the face
            precision and face recall; the third item is ignored.

    Returns:
        A two-element list: ``n`` scaled down by both precisions, and ``n``
        scaled up by ``(2 - recall)`` for both recalls.
    """
    face_precision, face_recall, _ignored = face_stats
    lower = n * shot_precision * face_precision
    upper = n * (2 - shot_recall) * (2 - face_recall)
    return [lower, upper]

## All faces

In [None]:
# Report the face count as an error interval, formatted with magnitude words.
# NOTE(review): face_error_interval() unpacks its second argument into three
# items, so base_face_stats[2] must itself be a 3-item sequence — confirm
# against face_validation()'s return value.
print('Total faces: {}'.format(
     format_number(face_error_interval(faces.count(), base_face_stats[2]))))

# Non-commercial airtime: summed video duration minus summed commercial duration.
total_duration = videos.agg(func.sum('duration')).collect()[0]['sum(duration)'] - \
    commercials.agg(func.sum('duration')).collect()[0]['sum(duration)']
# Take one duration per frame_id so frames with several faces are counted once.
face_duration = faces.groupBy('frame_id') \
    .agg(
        func.first('duration').alias('duration')
    ).agg(func.sum('duration')).collect()[0]['sum(duration)']
print('% of time a face is on screen: {:0.2f}'.format(100.0 * face_duration / total_duration))

# Genders

In [None]:
# Keep only the second value returned by gender validation.
# NOTE(review): Cm is presumably a 3x3 confusion matrix ordered M, F, U — confirm.
_, Cm = gender_validation('Gender w/ face height > 0.2', big_face_stats)

def P(y, yhat):
    """Return the share of column ``yhat`` of ``Cm`` that lies in row ``y``.

    ``y`` and ``yhat`` are gender codes 'M', 'F' or 'U', mapped to matrix
    indices 0, 1 and 2 respectively.
    """
    index = {'M': 0, 'F': 1, 'U': 2}
    numerator = float(Cm[index[y]][index[yhat]])
    column_total = sum(Cm[row][index[yhat]] for row in index.values())
    return numerator / column_total

In [None]:
# Gender rows looked up by their one-letter names; gender_stats() uses their ids.
MALE, FEMALE, UNKNOWN = [Gender.objects.get(name=code) for code in ('M', 'F', 'U')]
# id -> name lookup for every Gender row.
gender_names = {gender.id: gender.name for gender in Gender.objects.all()}

def gender_stats(key, labels, min_dur=None, no_host=False, just_host=False):
    """Summarize summed face duration by gender, optionally grouped by ``key``.

    Args:
        key: column of ``face_genders`` to group by, or None for a single
            dataset-wide result. 'topic' is not implemented yet.
        labels: list of dicts with 'id' and 'name'; 'id' is matched against
            the grouping value and 'name' is stored in the output row under
            ``key``. When ``key`` is None every label sees all rows.
        min_dur: if not None, labels whose male + female duration is below
            this threshold are dropped.
        no_host: keep only rows where ``is_host`` is False.
        just_host: keep only rows where ``is_host`` is True.

    Returns:
        A list of dicts with the label name under ``key``, formatted
        durations under 'M', 'F', 'U' and 'base' (male + female), integer
        percentages under 'M%', 'F%' and 'U%', and 'Overlap' fixed at 0.
        Labels lacking a male or a female row, or whose male + female
        duration is zero, are skipped because percentages are undefined.

    Raises:
        NotImplementedError: if ``key == 'topic'``.
    """
    if key == 'topic':
        # TODO: the intended implementation joins face_genders to topic labels
        # (segment links -> things of type 'topic') and exposes the topic id
        # as a 'topic' column before grouping. The tables it needs are not
        # loaded in this notebook yet.
        raise NotImplementedError("TODO: gender_stats by topic is not implemented")

    df = face_genders
    if no_host:
        # Column comparison builds a filter expression; `not`/`is` would not.
        df = df.where(df.is_host == False)
    if just_host:
        df = df.where(df.is_host == True)

    groups = ([key] if key is not None else []) + ['gender_id']
    rows = df.groupBy(*groups).agg(func.sum('duration')).collect()

    def _duration(label_rows, gender):
        # A missing row, or a null sum, counts as zero duration.
        row = label_rows.get(gender.id)
        if row is None or row['sum(duration)'] is None:
            return 0
        return int(row['sum(duration)'])

    out_rows = []
    for label in labels:
        label_rows = {
            row.gender_id: row for row in rows
            if key is None or row[key] == label['id']
        }
        # Both a male and a female total are required. Checking the ids
        # explicitly (rather than just the row count) avoids a KeyError when
        # a label only has e.g. male + unknown rows.
        if MALE.id not in label_rows or FEMALE.id not in label_rows:
            continue
        male_dur = _duration(label_rows, MALE)
        female_dur = _duration(label_rows, FEMALE)
        unknown_dur = _duration(label_rows, UNKNOWN)
        base_dur = male_dur + female_dur
        if base_dur == 0:
            # Nothing to normalise by; skip instead of dividing by zero.
            continue
        if min_dur is not None and base_dur < min_dur:
            continue
        out_rows.append({
            key: label['name'],
            'M': format_time(male_dur),
            'F': format_time(female_dur),
            'U': format_time(unknown_dur),
            'base': format_time(base_dur),
            'M%': int(100.0 * male_dur / base_dur),
            'F%': int(100.0 * female_dur / base_dur),
            'U%': int(100.0 * unknown_dur / (base_dur + unknown_dur)),
            'Overlap': 0,
        })
    return out_rows

# Columns shown in every gender table; each section prepends its own grouping
# column. The 'U', 'U%', 'base' and 'Overlap' fields of gender_stats() rows are not displayed.
gender_ordering = ['M', 'M%', 'F', 'F%']

## All genders

In [None]:
# Dataset-wide gender split, with and without hosts.
# Only the all-faces result is displayed; the no-host result is kept in a variable.
gender_screen_all = gender_stats(None, [{'id': 0, 'name': 'whole dataset'}])
gender_screen_all_nh = gender_stats(None, [{'id': 0, 'name': 'whole dataset'}], 
                                               no_host=True)
show_df(gender_screen_all, gender_ordering)

## Gender by channel

In [None]:
# Gender split per channel; gender_stats() stores the channel name under the 'channel_id' key.
show_df(
    gender_stats('channel_id', list(Channel.objects.values('id', 'name'))),
    ['channel_id'] + gender_ordering)

## Gender by show

In [None]:
# The same id/name labels feed all three variants, so build them once.
show_labels = list(Show.objects.values('id', 'name'))

# All faces and non-host faces need 250 hours of male + female time per show;
# the host-only variant uses a lower 50-hour threshold.
gender_screen_show = gender_stats('show_id', show_labels, min_dur=3600*250)
gender_screen_show_nh = gender_stats(
    'show_id', show_labels, min_dur=3600*250, no_host=True)
gender_screen_show_jh = gender_stats(
    'show_id', show_labels, min_dur=3600*50, just_host=True)
show_df(gender_screen_show, ['show_id'] + gender_ordering)

## Gender by canonical show

In [None]:
# The same id/name labels feed all three variants, so build them once.
canonical_show_labels = list(CanonicalShow.objects.values('id', 'name'))

# Thresholds mirror the per-show cell: 250 hours for all / no-host, 50 hours for host-only.
gender_screen_canonical_show = gender_stats(
    'canonical_show_id', canonical_show_labels, min_dur=3600*250)
gender_screen_canonical_show_nh = gender_stats(
    'canonical_show_id', canonical_show_labels, min_dur=3600*250, no_host=True)
gender_screen_canonical_show_jh = gender_stats(
    'canonical_show_id', canonical_show_labels, min_dur=3600*50, just_host=True)
show_df(gender_screen_canonical_show, ['canonical_show_id'] + gender_ordering)

## Gender by time of day

In [None]:
# Gender split per hour of day, labelled with format_hour(); no minimum-duration filter.
gender_screen_tod = gender_stats('hour', [{'id': hour, 'name': format_hour(hour)} for hour in hours])
show_df(gender_screen_tod, ['hour'] + gender_ordering)   

## Gender by day of week

In [None]:
# Labels map week_day values 1..7 to Monday..Sunday.
# NOTE(review): this assumes the `week_day` column is ISO-numbered (1 = Monday).
# Spark's dayofweek() and Django's week_day lookup both use 1 = Sunday, which
# would shift every label by one day — confirm how the column is derived.
dotw = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
show_df(
    gender_stats('week_day', [{'id': i + 1, 'name': d} for i, d in enumerate(dotw)]),
    ['week_day'] + gender_ordering)

## Gender by topic

In [None]:
# Count the segment_ids that appear exactly once in segment_topics.
segment_topics.groupBy('segment_id').agg(func.count('segment_id')).where('count(segment_id)== 1').count()

In [None]:
# NOTE: gender_stats() currently raises for key='topic' (marked TODO in its
# body), so this cell cannot produce results until that branch is implemented.
# Both calls use the same topic labels, so build them once.
topic_labels = [{'id': t.id, 'name': t.name} for t in Topic.objects.all()]
gender_screen_topic = gender_stats(
        'topic', topic_labels,
        min_dur=3600*300)
# The no-host variant previously called gender_multicount_stats, which is not
# defined anywhere in this notebook; every other *_nh result is produced by
# gender_stats(..., no_host=True), so do the same here.
gender_screen_topic_nh = gender_stats(
        'topic', topic_labels,
        min_dur=3600*300, no_host=True)
show_df(gender_screen_topic, ['topic'] + gender_ordering)