<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Are-host-faces-shown-on-screen-larger-than-non-hostfaces?" data-toc-modified-id="Are-host-faces-shown-on-screen-larger-than-non-hostfaces?-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Are host faces shown on screen larger than non-hostfaces?</a></span><ul class="toc-item"><li><span><a href="#There-are-more-male-faces-in-general-so-counts-for-male-faces-are-higher." data-toc-modified-id="There-are-more-male-faces-in-general-so-counts-for-male-faces-are-higher.-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>There are more male faces in general so counts for male faces are higher.</a></span></li><li><span><a href="#What-if-we-normalized-and-compared-the-proportion-of-faces-in-each-height-bucket?" data-toc-modified-id="What-if-we-normalized-and-compared-the-proportion-of-faces-in-each-height-bucket?-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>What if we normalized and compared the proportion of faces in each height bucket?</a></span></li></ul></li><li><span><a href="#Does-this-pattern-hold-for-all-shows?" data-toc-modified-id="Does-this-pattern-hold-for-all-shows?-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Does this pattern hold for all shows?</a></span><ul class="toc-item"><li><span><a href="#Plot-the-distributions-of-male-and-female-faces-(descending-disparity)" data-toc-modified-id="Plot-the-distributions-of-male-and-female-faces-(descending-disparity)-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Plot the distributions of male and female faces (descending disparity)</a></span></li><li><span><a href="#Compare-the-mean-face-size-on-different-shows" data-toc-modified-id="Compare-the-mean-face-size-on-different-shows-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Compare the mean face size on different shows</a></span></li></ul></li></ul></div>

In [None]:
from esper.prelude import *
from esper.widget import *
from esper.spark_util import *
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS

import math
from datetime import timedelta
from collections import defaultdict, OrderedDict

In [None]:
def plot_curves(curves, title, x_label, y_label, y_lim=None, x_lim=None):
    """Draw one marked line per series on a single set of axes and show it.

    Args:
        curves: mapping of series key -> {x value: y value}. Each series is
            plotted with its x values in ascending order and is labelled in
            the legend with ``str(series key)``.
        title: figure title.
        x_label: label for the x axis.
        y_label: label for the y axis.
        y_lim: optional y-axis limits, passed straight to ``set_ylim``.
        x_lim: optional x-axis limits, passed straight to ``set_xlim``.
    """
    _, axes = plt.subplots()

    for series_key, points in curves.items():
        # Sorting the keys keeps the line from zig-zagging when the mapping
        # was filled in arbitrary order.
        xs = sorted(points)
        ys = [points[x_value] for x_value in xs]
        axes.plot(xs, ys, 'o-', label=str(series_key))

    axes.legend()
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    if x_lim is not None:
        axes.set_xlim(x_lim)
    if y_lim is not None:
        axes.set_ylim(y_lim)
    plt.show()

In [None]:
# get_face_genders() is provided by the star imports above; presumably it
# returns a Spark DataFrame of face/gender labels -- TODO confirm.
face_genders = get_face_genders()
# Add a boolean column flagging rows whose host_probability is at least 0.8;
# the cells below group on this 'is_host_identity' column.
face_genders = face_genders.withColumn('is_host_identity', face_genders.host_probability >= 0.8)

# Are host faces shown on screen larger than non-hostfaces?

In [None]:
def empty_hists():
    """Return a fresh, ordered set of empty histograms.

    Keys are ``(gender, is_host)`` pairs in the fixed order
    ('M', True), ('F', True), ('M', False), ('F', False); every value is its
    own new empty dict, so filling one histogram never affects another.
    """
    return OrderedDict(
        ((gender, is_host), {})
        for is_host in (True, False)
        for gender in ('M', 'F')
    )

# Gender id -> gender name, so histogram keys use the name rather than the id.
gender_map = {gender.id: gender.name for gender in Gender.objects.all()}


def _gender_host_height_bucket(group):
    """Replace the raw height in a (gender_id, is_host, height) group with its
    bucket: the height multiplied by 100 and rounded up to an integer."""
    return (group[0], group[1], math.ceil(group[2] * 100))


# Only rows whose `probability` exceeds 0.9 are counted.
confident_face_genders = face_genders.where(face_genders.probability > 0.9)

# NOTE(review): no distinct columns are passed here, whereas the per-show
# computation further down passes ['face_id'] -- confirm this is intentional.
gender_host_height_counts = count_distinct_over_column(
    confident_face_genders,
    distinct_columns=[],
    group_by_columns=['gender_id', 'is_host_identity', 'height'],
    group_by_key_fn=_gender_host_height_bucket,
)

# One {height bucket: count} histogram per (gender name, is_host) pair.
gender_host_hists = empty_hists()
for (gender_id, is_host, height), v in gender_host_height_counts.items():
    gender_host_hists[(gender_map[gender_id], is_host)][height] = v

## There are more male faces in general so counts for male faces are higher.

In [None]:
# Raw counts per height bucket, one curve per (gender, is_host) pair.
# The histograms are keyed by face *height* bucket (see the x-axis label), so
# the title says "height" to match the other plots in this notebook.
plot_curves(
    gender_host_hists,
    'Distribution of face height by gender and host',
    'Height (% of frame)',
    'Count',
)

## What if we normalized and compared the proportion of faces in each height bucket?

In [None]:
# Rescale every histogram so its buckets sum to 1, which makes groups of very
# different sizes comparable on one plot.
gender_hosts_hists_normalized = empty_hists()
for gender_host, hist in gender_host_hists.items():
    total = sum(hist.values())
    # An empty histogram yields nothing here, so `total == 0` is never divided by.
    gender_hosts_hists_normalized[gender_host].update(
        (bucket, count / total) for bucket, count in hist.items()
    )

plot_curves(
    gender_hosts_hists_normalized,
    'Distribution of face height by gender and host',
    'Height (% of frame)',
    'Proportion (normalized)',
)

# Does this pattern hold for all shows?

In [None]:
# Canonical show id -> name, restricted to the shows in MAJOR_CANONICAL_SHOWS.
canonical_show_map = {
    show.id: show.name
    for show in CanonicalShow.objects.all()
    if show.name in MAJOR_CANONICAL_SHOWS
}


def _show_gender_host_height_bucket(group):
    """Replace the raw height in a (show_id, gender_id, is_host, height) group
    with its bucket: the height multiplied by 100 and rounded up."""
    return (group[0], group[1], group[2], math.ceil(group[3] * 100))


show_gender_host_height_counts = count_distinct_over_column(
    face_genders.where(face_genders.probability > 0.9),
    distinct_columns=['face_id'],
    group_by_columns=['canonical_show_id', 'gender_id', 'is_host_identity', 'height'],
    group_by_key_fn=_show_gender_host_height_bucket,
)

# show name -> {(gender name, is_host): {height bucket: count}}
gender_host_hists_by_show = defaultdict(empty_hists)
for (canonical_show_id, gender_id, is_host, height), v in (
        show_gender_host_height_counts.items()):
    # Groups belonging to shows outside the major-show map are dropped.
    if canonical_show_id in canonical_show_map:
        show_name = canonical_show_map[canonical_show_id]
        gender_host_hists_by_show[show_name][(gender_map[gender_id], is_host)][height] = v

## Plot the distributions of male and female faces (descending disparity)

In [None]:
# Per show, rescale each (gender, is_host) histogram so its buckets sum to 1.
gender_host_hists_by_show_normalized = defaultdict(empty_hists)
for canonical_show, show_gender_host_hists in gender_host_hists_by_show.items():
    normalized_hists = gender_host_hists_by_show_normalized[canonical_show]
    for gh, hist in show_gender_host_hists.items():
        total = sum(hist.values())
        # Empty histograms contribute nothing, so `total == 0` is never divided by.
        normalized_hists[gh].update(
            (bucket, count / total) for bucket, count in hist.items()
        )

# One figure per show, in ascending order of show name.
min_host_buckets = 30
for canonical_show in sorted(gender_host_hists_by_show_normalized):
    show_gender_host_hists = gender_host_hists_by_show_normalized[canonical_show]
    # Non-host curves are always drawn; host curves are drawn only when they
    # have at least `min_host_buckets` populated height buckets.
    curves_to_plot = {
        gh: hist
        for gh, hist in show_gender_host_hists.items()
        if not gh[1] or len(hist) >= min_host_buckets
    }
    plot_curves(
        curves_to_plot,
        'Distribution of face height by gender and host on "{}"'.format(canonical_show),
        'Height (% of frame)',
        'Proportion (normalized)',
    )

## Compare the mean face size on different shows

In [None]:
def plot_difference_in_mean_face_size(normalized_show_hists):
    """Bar-plot a per-show face-size difference, smallest difference first.

    Args:
        normalized_show_hists: mapping of show name -> histograms for that
            show. Each value is handed unchanged to ``diff_in_avg_face_size``,
            which must return a sortable number.

    NOTE(review): ``diff_in_avg_face_size`` is not defined in the cells visible
    here; presumably it comes from one of the star imports -- TODO confirm, and
    confirm it accepts the (gender, is_host)-keyed histograms built in this
    notebook. The legend text assumes it returns male minus female mean height.
    """
    # Use the argument rather than a notebook global, so the function plots
    # whatever the caller passes in.
    diff_by_show = {
        show: diff_in_avg_face_size(hists)
        for show, hists in normalized_show_hists.items()
    }
    ranked = sorted(diff_by_show.items(), key=lambda item: item[1])
    show_names = [show for show, _ in ranked]
    diffs = [diff for _, diff in ranked]

    fig, ax1 = plt.subplots()
    ind = np.arange(len(show_names))
    width = 0.8
    # NOTE(review): bars are positioned half a bar-width left of the tick
    # positions; whether that centers them on the ticks depends on matplotlib's
    # default `align` for bar() -- confirm against the installed version.
    ax1.bar(ind - width / 2, diffs, width,
            label='MeanHeight(Male) - MeanHeight(Female)',
            color='LightBlue')
    ax1.set_ylabel('Mean face height (% of screen)')
    ax1.set_title('Difference in mean face height by gender')
    ax1.legend()
    # Fixed y-range; differences outside it will not be fully visible.
    ax1.set_ylim((-0.25, 4.25))
    ax1.set_xticks(ind)
    ax1.set_xlabel('Show name')
    ax1.set_xticklabels(show_names, rotation=45, ha='right')
    # Dashed zero line separates positive from negative differences.
    ax1.axhline(0., color='Black', linestyle='--')
    plt.show()

# Pass the per-show normalized histograms computed earlier in this notebook.
# The previously used name `gender_hists_by_show_normalized` is never defined
# in these cells and would raise NameError on a fresh kernel.
plot_difference_in_mean_face_size(gender_host_hists_by_show_normalized)