In [None]:
from esper.prelude import *

# Guest names, lower-cased because they are used verbatim below to build
# labeler names and to look up Identity rows.
GUEST_LIST = list(map(str.lower, [
    'Barack Obama', 'Donald Trump', 'Ted Cruz', 'John Kasich', 'Marco Rubio', 'Ben Carson', 'Jeb Bush',
    'Jim Gilmore', 'Chris Christie', 'Carly Fiorina', 'Rick Santorum', 'Rand Paul', 'Mike Huckabee',
    'Hillary Clinton', 'Bernie Sanders', 'Lincoln Chafee', 'Martin O’Malley', 'Jim Webb',
    'Sarah Palin', 'John Boehner', 'Paul Ryan', 'Newt Gingrich', 'Nancy Pelosi', 'Elizabeth Warren', 'Mitch McConnell',
    'Chuck Schumer', 'Harry Reid', 'Joe Biden', 'Kevin McCarthy', 'Steve Scalise', 'Bobby Jindal', 'John Cornyn',
    'Dick Durbin', 'Orrin Hatch', 'Lindsey Graham', 'Mitt Romney', 'Michelle Obama', 'Bill Clinton',
    'George W Bush', 'Tim Kaine',
]))

# Distinct host names across every CanonicalShow that has at least one host.
# The order is whatever the set yields; it is not sorted.
HOST_LIST = list({
    host.name
    for show in CanonicalShow.objects.exclude(hosts=None)
    for host in show.hosts.all()
})

# Ascending ids of the videos whose show has at least one host.
VIDEOS = sorted(video.id for video in Video.objects.exclude(show__hosts=None))

def get_name_to_labeler_id():
    """Map every guest and host name to the ids of its face-identity labelers.

    For each name in GUEST_LIST + HOST_LIST, three labeler names are tried
    ('face-identity:', 'face-identity-converted:' and 'face-identity-uncommon:'
    followed by the person name). The value is the list of ids of whichever
    of those Labeler rows exist, which may be empty.
    """
    from tqdm import tqdm
    from query.models import Labeler

    prefixes = ('face-identity:', 'face-identity-converted:', 'face-identity-uncommon:')

    labeler_ids_by_name = {}
    # One Labeler query per person; tqdm only reports progress.
    for person in tqdm(GUEST_LIST + HOST_LIST):
        candidate_names = [prefix + person for prefix in prefixes]
        matching_labelers = Labeler.objects.filter(name__in=candidate_names)
        labeler_ids_by_name[person] = [labeler.id for labeler in matching_labelers]
    return labeler_ids_by_name

# Built once when this cell runs (one Labeler query per guest/host name);
# get_person_intervals_in_vids reads it to turn person names into labeler ids.
NAME_TO_LABELER_ID = get_name_to_labeler_id()

def name_to_id(name):
    """Return the id of the Identity row whose `name` equals the argument.

    The lookup uses `.get`, so it raises if no row or more than one row matches.
    """
    from query.models import Identity
    identity = Identity.objects.get(name=name)
    return identity.id

# Identity ids in the same order as GUEST_LIST / HOST_LIST (one lookup per name).
GUEST_IDS = list(map(name_to_id, GUEST_LIST))
HOST_IDS = list(map(name_to_id, HOST_LIST))

def get_fps_map(vids):
    """Return a dict from video id to that video's `fps`, for the ids in `vids`.

    Ids with no matching Video row are absent from the result.
    """
    from query.models import Video
    fps_by_video = {}
    for video in Video.objects.filter(id__in=vids):
        fps_by_video[video.id] = video.fps
    return fps_by_video

def frame_second_conversion(c, mode='f2s'):
    """Convert the time bounds of every interval in `c` between frames and seconds.

    Args:
        c: collection exposing `get_allintervals()`, a dict from video id to
            an interval set that supports `.map`.
        mode: 'f2s' divides each time bound by the video's fps (frames to
            seconds); 's2f' multiplies by it (seconds to frames).

    Returns:
        A new VideoIntervalCollection3D with converted copies of the intervals.
        Both bounds are truncated with `int()`, so sub-second / sub-frame
        precision is dropped.

    Raises:
        ValueError: if `mode` is neither 'f2s' nor 's2f'.
    """
    # Reject bad modes before any query is issued; previously an unknown mode
    # only failed later with an unbound local `fn`.
    if mode not in ('f2s', 's2f'):
        raise ValueError("mode must be 'f2s' or 's2f', got {!r}".format(mode))

    from rekall.video_interval_collection_3d import VideoIntervalCollection3D

    all_intervals = c.get_allintervals()
    fps_map = get_fps_map(set(all_intervals.keys()))

    def second_to_frame(fps):
        def map_fn(intrvl):
            i2 = intrvl.copy()
            t1, t2 = intrvl.t
            i2.t = (int(t1 * fps), int(t2 * fps))
            return i2
        return map_fn

    def frame_to_second(fps):
        def map_fn(intrvl):
            i2 = intrvl.copy()
            t1, t2 = intrvl.t
            i2.t = (int(t1 / fps), int(t2 / fps))
            return i2
        return map_fn

    fn = frame_to_second if mode == 'f2s' else second_to_frame

    output = {}
    for vid, intervals in all_intervals.items():
        # Each video is converted with its own frame rate.
        output[vid] = intervals.map(fn(fps_map[vid]))
    return VideoIntervalCollection3D(output)

def frame_to_second_collection(c):
    """Return a copy of `c` with its frame-based time bounds converted to seconds."""
    return frame_second_conversion(c, mode='f2s')

def second_to_frame_collection(c):
    """Return a copy of `c` with its second-based time bounds converted to frames."""
    return frame_second_conversion(c, mode='s2f')

def convert_to_1d_collection(collection):
    """Keep only the time bounds of each interval in `collection`.

    Every interval becomes a rekall `Interval(t_start, t_end, None)`; the
    payload is replaced by None. The result is a VideoIntervalCollection
    with the same video keys.
    """
    from rekall.interval_list import Interval
    from rekall.video_interval_collection import VideoIntervalCollection

    intervals_1d = {}
    for vid, interval_set in collection.get_allintervals().items():
        intervals_1d[vid] = [
            Interval(interval.t[0], interval.t[1], None)
            for interval in interval_set.get_intervals()
        ]
    return VideoIntervalCollection(intervals_1d)

def display_result(collection_1d):
    """Return an esper widget for the intervals held in `collection_1d`."""
    from esper.rekall import intrvllists_to_result

    widget_options = dict(
        crop_bboxes=False,
        show_middle_frame=False,
        disable_captions=True,
        results_per_page=25,
        jupyter_keybindings=True,
    )
    results = intrvllists_to_result(collection_1d.get_allintervals())
    return esper_widget(results, **widget_options)

# time dimension in seconds
def get_commercial_intervals_in_vids(vids):
    """Return the Commercial rows of the videos in `vids` as a 3D interval collection.

    The rows are loaded with the default `from_django_qs` schema and then
    passed through the frame-to-second conversion.
    """
    from query.models import Commercial
    from rekall.video_interval_collection_3d import VideoIntervalCollection3D

    commercials_in_frames = VideoIntervalCollection3D.from_django_qs(
        Commercial.objects.filter(video_id__in=vids))
    return frame_to_second_collection(commercials_in_frames)

# time dimension in seconds
# Outputs a dictionary from name to video interval collection
def get_person_intervals_in_vids(person_names, vids, probability=0.7, min_height=None):
    """Collect, per person, the intervals in which their face was identified.

    Args:
        person_names: names that are keys of NAME_TO_LABELER_ID.
        vids: video ids to restrict the search to.
        probability: minimum FaceIdentity probability to keep.
        min_height: optional minimum bounding-box height (y2 - y1); None
            disables the height filter.

    Returns:
        Dict from each name in `person_names` to a VideoIntervalCollection3D
        whose time bounds have been converted from frames to seconds.
    """
    from query.models import FaceIdentity
    from django.db.models import F,Q
    from rekall.video_interval_collection_3d import VideoIntervalCollection3D
    from rekall.interval_set_3d import Interval3D
    from rekall.interval_set_3d_utils import P

    SAMPLE_RATE = 3 # Every 3s

    labeler_ids = [
        labeler_id
        for person in person_names
        for labeler_id in NAME_TO_LABELER_ID[person]
    ]

    identity_filters = dict(
        probability__gte=probability,
        face__frame__video_id__in=vids,
        face__frame__shot_boundary=False,
        labeler_id__in=labeler_ids,
    )
    # Flatten the related-model columns needed by the interval schema below.
    derived_columns = dict(
        height=F('face__bbox_y2')-F('face__bbox_y1'),
        labeler_name=F('labeler__name'),
        video_id=F('face__frame__video_id'),
        frame_number=F('face__frame__number'),
        x1=F('face__bbox_x1'),
        x2=F('face__bbox_x2'),
        y1=F('face__bbox_y1'),
        y2=F('face__bbox_y2'),
    )
    face_id_qs = FaceIdentity.objects.filter(**identity_filters).annotate(**derived_columns)
    if min_height is not None:
        face_id_qs = face_id_qs.filter(height__gte=min_height)

    def person_from_row(row):
        # Labeler names look like '<prefix>:<person name>'; keep the person part.
        return row.labeler_name.split(':')[1]

    # Each detection is a single-frame interval: t1 and t2 are the same frame number.
    schema = {
        't1':'frame_number',
        't2':'frame_number',
        'x1':'x1','x2':'x2','y1':'y1','y2':'y2',
    }
    total = face_id_qs.count()
    faces = VideoIntervalCollection3D.from_django_qs(
        face_id_qs, schema,
        with_payload=person_from_row, progress=True, total=total)

    fps_by_video = get_fps_map(set(faces.get_allintervals().keys()))
    collections_by_name = {}
    for person in person_names:
        one_person = faces.filter(P(lambda payload: payload==person))
        # Coalesce with a gap of SAMPLE_RATE seconds, expressed in frames for each video.
        coalesced = {
            vid: intervals.temporal_coalesce(epsilon=fps_by_video[vid] * SAMPLE_RATE)
            for vid, intervals in one_person.get_allintervals().items()
        }
        collections_by_name[person] = frame_to_second_collection(
            VideoIntervalCollection3D(coalesced))
    return collections_by_name

# Returns interval_IS<person_only_IS<>, host_only_IS<>, person_with_host_IS<>>
def interview_query(guest, hosts, commercials):
    """Find interview segments between one guest and any host.

    Args:
        guest: interval collection for a single guest.
        hosts: interval collection for all hosts combined.
        commercials: interval collection of commercials, subtracted from the
            candidate segments.

    As called from get_interviews_for_vids, all three inputs have their time
    axis in seconds, so the time thresholds below are in seconds.

    Returns:
        The surviving interview intervals, each with a payload tuple
        (guest_only, hosts_only, guest_with_host) built by the final
        `map_payload`. NOTE(review): this relies on `collect_by_interval`
        pairing the existing payload with the collected intervals, as the
        `p[1]` / `p[0][0]` indexing implies -- confirm against rekall.
    """
    from rekall.interval_set_3d import Interval3D
    from rekall.interval_set_3d_utils import T, P, or_preds, overlap_bound
    from rekall.temporal_predicates import overlaps, before, after
    
    # NOTE(review): SEGMENT_LENGTH is not referenced anywhere below.
    SEGMENT_LENGTH=30
    # Max distance for the before/after predicates and the merge time window.
    OVERLAP_LAX=60
    # Epsilon used when coalescing candidate segments.
    HOST_GUEST_GAP=120
    # Minimum segment size, applied both before and after removing commercials.
    MIN_LENGTH=240
    # Max Y size passed to filter_size to select "small" guest faces.
    SMALL_FACE_THRESHOLD=0.3
    # Guest time / segment length must exceed this ratio.
    MIN_GUEST_TIME_RATIO=0.35
    # Small-face guest time / guest time must stay below this ratio.
    MAX_SMALL_GUEST_RATIO=0.7
    
    # Host and guest intervals match if they overlap or lie within OVERLAP_LAX of each other.
    fuzzy_overlap = or_preds(overlaps(), before(max_dist=OVERLAP_LAX), after(max_dist=OVERLAP_LAX))
    
    interview_candidates = hosts.merge(guest, T(fuzzy_overlap), time_window=OVERLAP_LAX).temporal_coalesce()
    
    # Coalesce nearby candidates, drop short ones, cut out commercials, then
    # drop whatever became too short after the cut.
    interviews = interview_candidates.temporal_coalesce(
        epsilon=HOST_GUEST_GAP
    ).filter_size(min_size=MIN_LENGTH
    ).minus(commercials
    ).filter_size(min_size=MIN_LENGTH)
    

    def select_second(p):
        """Return the second element of a payload pair."""
        return p[1]
    
    # Interview<Guest<height>>
    interview_with_guest = interviews.collect_by_interval(
        guest,
        T(overlaps()),
        filter_empty=True,
        time_window=0,
    ).map_payload(
        select_second)
    
    
    def total_time(intervals):
        """Sum `length()` over all intervals in the set."""
        return intervals.fold(lambda s, i: s+i.length(), 0)
    
    def filter_time(interview):
        """Keep an interview if the guest is on screen enough and not mostly small."""
        # This local `guest` shadows the outer parameter; here it is the
        # guest intervals collected into this interview's payload.
        guest = interview.payload
        small_guest = guest.filter_size(max_size=SMALL_FACE_THRESHOLD, axis='Y')
        small_guest_time = total_time(small_guest)
        total_guest_time = total_time(guest)
        segment_time = interview.length()
        # The second division only runs when the first comparison is true
        # (`and` short-circuits).
        return (total_guest_time / segment_time > MIN_GUEST_TIME_RATIO and
                small_guest_time / total_guest_time < MAX_SMALL_GUEST_RATIO)
    # Interview<Guest<height>>
    interviews = interview_with_guest.filter(filter_time)
    
    # Guest<height>
    guest_in_interviews = guest.filter_against(interviews, T(overlaps()), time_window=0)
    # HostAndGuest<(Host, Guest)>
    # NOTE(review): the payload tuple is built as (guest, host), the reverse of the comment above.
    guest_with_host = guest_in_interviews.join(
        hosts,
        T(overlaps()),
        lambda guest, host: [Interval3D(overlap_bound(guest.t, host.t), payload=(guest, host))],
        time_window=0)
    guest_only = guest_in_interviews.minus(guest_with_host)

    hosts_in_interviews = hosts.filter_against(interviews, T(overlaps()), time_window=0)
    hosts_only = hosts_in_interviews.minus(guest_with_host)
    
    # Attach the three interval groups to each interview, then flatten the
    # nested pairs into a single 3-tuple payload.
    interview_with_metadata = interviews.collect_by_interval(
        guest_only,
        T(overlaps()),
        filter_empty=False,
        time_window=0
    ).map_payload(select_second).collect_by_interval(
        hosts_only,
        T(overlaps()),
        filter_empty=False,
        time_window=0
    ).collect_by_interval(
        guest_with_host,
        T(overlaps()),
        filter_empty=False,
        time_window=0
    ).map_payload(lambda p: (p[0][0],p[0][1],p[1]))
    
    return interview_with_metadata

def get_interviews_for_vids(vids):
    """Run the interview query for every guest in GUEST_LIST over the given videos.

    Face intervals are fetched once for all hosts and guests (probability
    >= 0.7, bounding-box height >= 0.2). The host intervals are unioned into
    one collection, and the results of `interview_query` for each guest are
    unioned into the returned VideoIntervalCollection3D.
    """
    from rekall.video_interval_collection_3d import VideoIntervalCollection3D
    from tqdm import tqdm

    intervals_by_person = get_person_intervals_in_vids(
        HOST_LIST + GUEST_LIST, vids, probability=0.7, min_height=0.2)

    all_hosts = VideoIntervalCollection3D({})
    for host in HOST_LIST:
        all_hosts = all_hosts.union(intervals_by_person[host])

    commercials = get_commercial_intervals_in_vids(vids)

    all_interviews = VideoIntervalCollection3D({})
    for guest_name in tqdm(GUEST_LIST):
        guest_interviews = interview_query(
            intervals_by_person[guest_name], all_hosts, commercials)
        all_interviews = all_interviews.union(guest_interviews)
    return all_interviews

In [None]:
import ipyparallel as ipp
from esper.rekall_parallel import get_runtime_for_ipython_cluster
import pickle
# Connect to the local ipyparallel cluster and wrap it in a rekall runtime.
c = ipp.Client(profile='local')
rt = get_runtime_for_ipython_cluster(c)

# Run the interview query over (up to) the first 100000 videos, 15 videos per chunk.
vids = VIDEOS[:100000]
answer,_ = rt.run(get_interviews_for_vids, vids, randomize=False, chunksize=15, progress=True)

# Write through a context manager so the file is flushed and closed even if
# pickling fails; the output name carries the last video id processed.
with open('../data/interviews/interviews-{0}.pickle'.format(vids[-1]), 'wb') as out_file:
    pickle.dump(answer, out_file)

# Scratchpad

In [None]:
# Flat list of video ids. The `__in` filters downstream expect the ids
# themselves, not a list wrapped inside another list. Repeated ids are
# dropped because they do not change an `__in` match.
vids = [763, 3769, 5281, 8220, 9901, 12837, 13141, 26386, 33004, 34642, 38275, 42756,
        50164, 52075, 52945, 54377, 59122, 59398]

# Per-person face intervals (seconds) for these videos, then show one guest.
answer = get_person_intervals_in_vids(HOST_LIST + GUEST_LIST, vids, 0.7,0.2)
display_result(convert_to_1d_collection(second_to_frame_collection(answer['bernie sanders'])))

In [None]:
# Show the intervals stored under 'jake tapper'. Only valid while `answer` is
# still the name -> collection dict returned by get_person_intervals_in_vids.
display_result(convert_to_1d_collection(second_to_frame_collection(answer['jake tapper'])))

In [None]:
# Rebinds `answer`: from here on it is the interview collection for the
# current `vids`, no longer the per-person dict.
answer = get_interviews_for_vids(vids)

In [None]:
# Show the interviews in `answer`: seconds -> frames, then 1-D intervals, then the widget.
display_result(convert_to_1d_collection(second_to_frame_collection(answer)))

In [None]:
# Names of every Labeler that belongs to one of the three face-identity families.
ls = [
    labeler.name
    for labeler in Labeler.objects.all()
    if labeler.name.startswith(
        ('face-identity:', 'face-identity-converted:', 'face-identity-uncommon:'))
]

In [None]:
# Strip the '<prefix>:' part of each labeler name. Entries without a ':' are
# already stripped and pass through unchanged, so re-running this cell no
# longer raises IndexError.
ls=[l.split(':')[1] if ':' in l else l for l in ls]

In [None]:
# Print every guest that has no face-identity labeler in `ls`.
for g in GUEST_LIST:
    if g in ls:
        continue
    print(g)

In [None]:
# Print every host that has no face-identity labeler in `ls`.
for h in HOST_LIST:
    if h in ls:
        continue
    print(h)

In [None]:
# Display the current contents of `ls` (the full list, not a sample).
ls

In [None]:
# Display the host names alphabetically; HOST_LIST itself is left unsorted.
sorted(HOST_LIST)

In [None]:
# Original labelled interviews whose first guest is Bernie Sanders, annotated
# with fps * start and fps * end; then print their video ids.
interviews = (
    LabeledInterview.objects
    .annotate(fps=F('video__fps'))
    .annotate(min_frame=F('fps') * F('start'))
    .annotate(max_frame=F('fps') * F('end'))
    .filter(guest1="bernie sanders", original=True)
)
print([interview.video.id for interview in interviews])

In [None]:
# Number of entries in whichever `vids` was assigned most recently.
len(vids)

In [None]:
# Inspect the intervals stored under key 10; elsewhere in this notebook the
# keys of get_allintervals() are treated as video ids. Raises KeyError if absent.
answer.get_allintervals()[10]

In [None]:
# Display the repr of whatever `answer` currently holds.
answer

In [None]:
# Number of videos whose show has at least one host.
len(VIDEOS)

In [None]:
# First 100 video ids (VIDEOS is sorted ascending).
VIDEOS[:100]

In [None]:
# Total number of Video rows, for comparison with len(VIDEOS).
Video.objects.count()