# Setup

In [None]:
import os
import time
import string 

import datetime

import pickle
import itertools
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from gensim.models import KeyedVectors

from math import ceil

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from sentence_transformers import models, SentenceTransformer, util

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

matplotlib.rcParams['figure.figsize'] = [12, 5]

%matplotlib inline

In [None]:
# Load the INA segments table and the subtitles table, dropping the
# 'Unnamed: 0' column from both; subtitles are ordered by program, then start_s.
segs = (
    pd.read_csv('data/ina_subtitles/segments_subs.csv')
    .drop(columns=['Unnamed: 0'])
)
subs = (
    pd.read_csv('data/ina_subtitles/subbed_segments.csv')
    .drop(columns=['Unnamed: 0'])
    .sort_values(['program', 'start_s'])
)

In [None]:
def get_program_title(program_id, segs=segs):
    """Return the title of the first row of `segs` whose `segment_id` equals `program_id`.

    Raises IndexError when no row matches. The default `segs` is the table that
    was bound to the name `segs` when this function was defined.
    """
    matching_rows = segs.loc[segs['segment_id'] == program_id]
    return matching_rows.title.values[0]

In [None]:
# Sentence-embedding model used below to encode segment titles and subtitle text.
sbert = SentenceTransformer('distiluse-base-multilingual-cased') 

In [None]:
# Distinct program ids found in the segments table (reused later by the metrics loop).
unique_programs = segs.program.unique()
print(unique_programs)

In [None]:
# Preview the first rows of the segments table.
segs.head()

In [None]:
# First rows of the segments table for one example program.
segs[segs.program == '5269874_001'].head()

In [None]:
# First five subtitle rows of one example program.
subs[subs.program == '5266045_001'].head(5)

In [None]:
# Titles of the first five segment rows of the example program.
segs[segs['program'] == '5269874_001'][['title']].head(5)

# Visualization

In [None]:
from datetime import datetime
import matplotlib.ticker as ticker

In [None]:
program_id = '5269874_001'

# Pull the title and start-time columns for the rows of the selected program.
titles, times = (
    segs.loc[segs['program'] == program_id, column].values
    for column in ('title', 'start_s')
)

In [None]:
# Inspect the start times selected above.
times

In [None]:
def visualize_timeline(times, titles, program_id=None):
    """Plot segment start times as labelled markers on a horizontal timeline.

    Parameters
    ----------
    times : sequence of numbers
        Start time of each segment. The x-axis labels are computed as
        tick / 60 and suffixed with ' mins', i.e. values are treated as seconds.
    titles : sequence of str
        One label per entry of `times`; labels longer than 20 characters are
        truncated and suffixed with '...'.
    program_id : str, optional
        Identifier shown in the figure title. When omitted, the notebook-level
        variable `program_id` is used (the previous behaviour), or an empty
        string if no such variable exists.

    Raises
    ------
    ValueError
        If `times` is empty (min()/max() have nothing to work with).
    """
    if program_id is None:
        # Backward compatibility: earlier call sites relied on a global `program_id`.
        program_id = globals().get('program_id', '')

    # Label heights cycle through these values so neighbouring labels do not overlap.
    levels = np.array([-1, 1, -3, 3, -5, 5])
    fig, ax = plt.subplots(figsize=(15, 7))

    # Base line from the earliest to the latest start time.
    start, stop = min(times), max(times)
    ax.plot((start, stop), (0, 0), 'k', alpha=.5)

    # Annotate every segment on the timeline.
    for ii, (ititle, itime) in enumerate(zip(titles, times)):
        ititle = ititle[:20] + ('...' if len(ititle) > 20 else '')
        level = levels[ii % len(levels)]
        vert = 'top' if level < 0 else 'bottom'

        # Circle on the timeline
        ax.scatter(itime, 0, s=100, facecolor='w', edgecolor='k', zorder=9999)

        # Stem from the timeline up/down to the label
        ax.plot((itime, itime), (0, level), linewidth=2, c='c', alpha=.5)

        # Label with a faint background
        ax.text(itime, level, ititle,
                horizontalalignment='right', verticalalignment=vert, fontsize=10,
                backgroundcolor=(.2, .55, .6, .05))

    # Major tick every 600 units, minor tick every 60, labelled in minutes.
    ax.xaxis.set_major_locator(ticker.MultipleLocator(600))
    ax.xaxis.set_minor_locator(ticker.MultipleLocator(60))
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda tick, pos: str(int(tick/60)) + ' mins'))

    # Remove the figure border and y-axis (levels) components for a cleaner look
    plt.setp((ax.get_yticklabels() + ax.get_yticklines() +
              list(ax.spines.values())), visible=False)
    plt.title("Program " + program_id, y=-0.16, fontsize=14)
    plt.show()

In [None]:
program_id = '5269232_001'

# Draw the timeline of every segment row belonging to this program.
titles = segs.loc[segs['program'] == program_id, 'title'].values
times = segs.loc[segs['program'] == program_id, 'start_s'].values
visualize_timeline(times, titles)

# INA: Generating content representations

In [None]:
# Per-program segment timings, titles and SBERT title embeddings.
# The result is cached in ./segments_data.pickle so the encoding runs only once.
if os.path.exists('./segments_data.pickle'):
    # NOTE: unpickling can execute arbitrary code; only load a cache file you created yourself.
    with open('./segments_data.pickle', 'rb') as cache_file:
        segments_data = pickle.load(cache_file)

else:
    segments_data = {}
    for program_id in tqdm(segs.program.unique()):
        # Rows of this program, excluding the row whose segment_id is the program id itself.
        segments = segs[(segs['program'] == program_id) & (segs['segment_id'] != program_id)]
        segments_data[program_id] = {
            'n_segments': len(segments),
            'start': segments.start_s.values.tolist(),
            'duration': segments.duration_s.values.tolist(),
            'end': segments.end_s.values.tolist(),
            'text': segments.title.values.tolist(),
            # Titles are lower-cased before encoding.
            'embeddings': sbert.encode(segments.title.str.lower().values.tolist(), convert_to_tensor=True),
        }
    # `with` guarantees the cache file is flushed and closed.
    with open('./segments_data.pickle', 'wb') as cache_file:
        pickle.dump(segments_data, cache_file)

In [None]:
# Per-program subtitle timings, text and SBERT embeddings.
# The result is cached in ./subtitles_data.pickle so the encoding runs only once.
if os.path.exists('./subtitles_data.pickle'):
    # NOTE: unpickling can execute arbitrary code; only load a cache file you created yourself.
    with open('./subtitles_data.pickle', 'rb') as cache_file:
        subtitles_data = pickle.load(cache_file)

else:
    subtitles_data = {}
    for program_id in tqdm(subs.program.unique()):
        subtitles = subs[subs['program'] == program_id]
        subtitles_data[program_id] = {
            'n_subs': len(subtitles),
            'start': subtitles.start_s.values.tolist(),
            'duration': subtitles.duration_s.values.tolist(),
            'end': subtitles.end_s.values.tolist(),
            'text': subtitles.content.values.tolist(),
            'embeddings': sbert.encode(subtitles.content.values.tolist(), convert_to_tensor=True),
        }
    # The original dump call had an unbalanced parenthesis (SyntaxError);
    # `with` also guarantees the cache file is flushed and closed.
    with open('./subtitles_data.pickle', 'wb') as cache_file:
        pickle.dump(subtitles_data, cache_file)

In [None]:
# Cosine similarity between the segment-title embeddings and the subtitle
# embeddings of the same program, stored as a NumPy array per program.
sim_with_title = {}

for program_id in tqdm(segs.program.unique()):
    res = util.pytorch_cos_sim(segments_data[program_id]['embeddings'],
                               subtitles_data[program_id]['embeddings'])
    # .cpu() is a no-op for CPU tensors and lets .numpy() work when the embeddings live on a GPU.
    res = res.cpu().numpy()
    sim_with_title[program_id] = res

In [None]:
# Subtitle-to-subtitle cosine similarity matrix of every program, with one heatmap per program.
subtitles_sim = {}

for program_id in tqdm(segs.program.unique()):
    sub_embeddings = subtitles_data[program_id]['embeddings']
    # .cpu() is a no-op for CPU tensors and lets .numpy() work when the embeddings live on a GPU.
    res = util.pytorch_cos_sim(sub_embeddings, sub_embeddings).cpu().numpy()
    subtitles_sim[program_id] = res

    fig, ax = plt.subplots(figsize=(9, 9))
    ax.set_title(program_id)
    sns.heatmap(res, ax=ax)
    # Render and release each figure inside the loop so open figures do not pile up.
    plt.show()
    plt.close(fig)

In [None]:
def _draw_timeline_markers(ax, labelled_times, levels, edgecolor, linecolor, background):
    """Draw one circle, stem and text label on `ax` for each (label, time) pair."""
    for ii, (label, itime) in enumerate(labelled_times):
        level = levels[ii % len(levels)]
        vert = 'top' if level < 0 else 'bottom'

        # Circle on the timeline
        ax.scatter(itime, 0, s=100, facecolor='w', edgecolor=edgecolor, zorder=9999)

        # Stem from the timeline up/down to the label
        ax.plot((itime, itime), (0, level), linewidth=2, c=linecolor, alpha=.5)

        # Label with a faint background
        ax.text(itime, level, label,
                horizontalalignment='right', verticalalignment=vert, fontsize=10,
                backgroundcolor=background)


def visualize_segmentation(segments_starts, titles, hypothesis, scores, program_title='', program_id=None):
    """Plot reference segment starts and hypothesised boundaries on one timeline.

    Parameters
    ----------
    segments_starts : sequence of numbers
        Reference start times; drawn with black-edged circles and cyan stems.
    titles : sequence of str
        One label per reference start; truncated to 20 characters plus '...'.
    hypothesis : sequence of numbers
        Hypothesised boundary times; drawn with gold-edged circles.
    scores : sequence of (subtitle_index, score) pairs
        Aligned with `hypothesis`. Only the subtitle index and the position in
        this sequence are used, to build labels of the form 'sub_<index> (<position>)'.
    program_title : str, optional
        Appended to the figure title after ' - ' when non-empty.
    program_id : str, optional
        Identifier shown in the figure title. When omitted, the notebook-level
        variable `program_id` is used (the previous behaviour), or an empty
        string if no such variable exists.

    Raises
    ------
    ValueError
        If there is nothing to plot, or the latest time is not a finite number.
    """
    if program_id is None:
        # Backward compatibility: earlier call sites relied on a global `program_id`.
        program_id = globals().get('program_id', '')

    levels = np.array([-1, 1, -3, 3, -5, 5])
    levels_2 = np.array([-0.5, 0.5, -2.5, 2.5, -4.5, 4.5])

    # Time range covered by both marker sets; either one may be empty, but not both.
    all_times = [*segments_starts, *hypothesis]
    if not all_times:
        raise ValueError('nothing to plot: no segment starts and no hypothesised boundaries')
    start, stop = min(all_times), max(all_times)
    if not np.isfinite(stop):
        raise ValueError(f'cannot lay out the time axis, latest time is {stop!r}: {segments_starts}')

    fig, ax = plt.subplots(figsize=(15, 7))

    # Draw the timeline
    ax.plot((start, stop), (0, 0), 'k', alpha=.5)

    # Reference segments
    short_titles = [t[:20] + ('...' if len(t) > 20 else '') for t in titles]
    _draw_timeline_markers(ax, zip(short_titles, segments_starts), levels,
                           edgecolor='k', linecolor='c', background=(.2, .55, .6, .05))

    # Hypothesised boundaries, labelled by subtitle index and position, drawn in time order
    sub_labels = [f'sub_{sub_id} ({rank})' for rank, (sub_id, _score) in enumerate(scores)]
    hypothesis_markers = sorted(zip(sub_labels, hypothesis), key=lambda x: x[1])
    _draw_timeline_markers(ax, hypothesis_markers, levels_2,
                           edgecolor='gold', linecolor=(.92, .2, .4), background=(.92, .2, .4, .05))

    # Major tick spacing scales with the latest time. It is floored at 10 because
    # a spacing of 0 (any stop below 100) is not a valid locator base.
    major_locator = max(int(stop / 100) * 10, 10)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(major_locator))
    ax.xaxis.set_minor_locator(ticker.MultipleLocator(major_locator / 10))
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda tick, pos: str(int(tick/60)) + ' mins'))

    # Remove the figure border and y-axis (levels) components for a cleaner look
    plt.setp((ax.get_yticklabels() + ax.get_yticklines() +
              list(ax.spines.values())), visible=False)
    plt.title(program_id + (' - ' + program_title if program_title else ''), y=-0.16, fontsize=14)
    plt.show()

In [None]:
def process_program(program_id, window_size=3, scoring_method='avg', visualize=True,
                    segs=segs, subtitles_data=subtitles_data, segments_data=segments_data):
    """Rank the subtitles of a program by how similar each one is to what follows it.

    For every subtitle i, the similarities between i and the subtitles
    i .. i+window_size-1 (row i of the similarity matrix, including the
    self-similarity) are reduced to one score. Subtitles are then sorted by
    ascending score, so the lowest-scoring ones come first.

    Parameters
    ----------
    program_id : hashable
        Key into `subtitles_sim`, `subtitles_data` and `segments_data`.
    window_size : int
        Number of subtitles, starting at the current one, included in each score.
    scoring_method : {'avg', 'mul'}
        'avg' takes the mean of the window, 'mul' its product.
    visualize : bool
        When true, plot the reference segments against the first `n_segments`
        ranked subtitles.
    segs, subtitles_data, segments_data
        Data sources. The defaults are the objects bound to these names when
        this function was defined, whereas `subtitles_sim` is looked up in the
        notebook namespace at call time.

    Returns
    -------
    (times, minima_starts, minima, n_segments)
        `times`: start_s of every `segs` row of the program;
        `minima`: list of (subtitle_index, score) sorted by ascending score;
        `minima_starts`: start time of each subtitle in `minima` order;
        `n_segments`: stored segment count of the program plus one.

    Raises
    ------
    KeyError
        For an unknown `scoring_method` or `program_id`.
    ValueError
        If `window_size` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f'window_size must be >= 1, got {window_size}')

    # np.prod replaces np.product, which was deprecated and removed in NumPy 2.0.
    reducers = {'avg': np.mean, 'mul': np.prod}
    reduce_window = reducers[scoring_method]

    sim_matrix = subtitles_sim[program_id]
    # NOTE(review): the +1 presumably accounts for the program-level row that
    # `segments_data` excludes but `times` below includes; confirm.
    n_segments = segments_data[program_id]['n_segments'] + 1
    n_subs = sim_matrix.shape[0]

    # The slice is clipped at the end of the row, so the last windows are shorter.
    scores = [reduce_window(sim_matrix[i, i:i + window_size]) for i in range(n_subs)]
    minima = sorted(enumerate(scores), key=lambda x: x[1])

    sub_starts = subtitles_data[program_id]['start']
    minima_starts = [sub_starts[i] for i, _score in minima]

    program_rows = segs[segs['program'] == program_id]
    titles = program_rows['title'].values
    times = program_rows['start_s'].values

    if visualize:
        # The title is only needed for the plot; tolerate programs without a program-level row.
        title_rows = segs[segs['segment_id'] == program_id]['title'].values
        program_title = title_rows[0] if len(title_rows) else ''
        visualize_segmentation(times, titles, minima_starts[:n_segments], minima[:n_segments], program_title)

    return times, minima_starts, minima, n_segments

In [None]:
# Plot the segmentation of one program using the mean-of-window score; the return value is discarded.
_ = process_program('5265997_001', window_size=3, scoring_method='avg')

In [None]:
# Same program with the product-of-window score. The result is assigned to `_`
# so the full ranking of every subtitle is not dumped as cell output.
_ = process_program('5265997_001', window_size=3, scoring_method='mul')

# Metrics

In [None]:
from nltk.metrics.segmentation import pk, windowdiff

In [None]:
def compute_metrics(program_id, window_size=3, k=50, scoring_method='avg'):
    """Print Pk and WindowDiff for the predicted segmentation of one program.

    Reference and hypothesis are encoded as strings with one character per
    second ('1' at a boundary, '0' elsewhere) and compared with
    `pk(..., k)` and `windowdiff(..., k)`.

    Only the `n_segments` best-ranked subtitle start times returned by
    `process_program` are used as hypothesised boundaries. Previously a second
    loop marked every subtitle start as a boundary, which made the result
    independent of `window_size` and `scoring_method`.

    Parameters
    ----------
    program_id : hashable
        Program to evaluate. Its duration is read from the notebook-level `segs`
        row whose `segment_id` equals `program_id`.
    window_size, scoring_method
        Forwarded to `process_program`.
    k : int
        Window size passed to both metrics.
    """
    gt_times, hypo_times, _scores, n_segments = process_program(
        program_id, window_size=window_size, scoring_method=scoring_method, visualize=False)

    def to_seconds(times):
        # Whole-second positions; non-finite values cannot be placed on the timeline.
        return [int(t) for t in times if np.isfinite(t)]

    # Same diagnostic the original printed when fewer boundaries than expected were available.
    if len(gt_times) < n_segments or len(hypo_times) < n_segments:
        print(program_id, gt_times, n_segments)

    gt_seconds = to_seconds(gt_times)
    gt_boundaries = to_seconds(gt_times[:n_segments])
    hypo_boundaries = to_seconds(hypo_times[:n_segments])

    # The timeline must be long enough for every boundary that will be marked.
    duration = int(segs[segs.segment_id == program_id]['duration_s'].values[0])
    duration = max([duration, *gt_seconds, *hypo_boundaries]) + 1

    seg_true = ['0'] * duration
    seg_hypo = ['0'] * duration
    for second in gt_boundaries:
        seg_true[second] = '1'
    for second in hypo_boundaries:
        seg_hypo[second] = '1'

    seg_true = ''.join(seg_true)
    seg_hypo = ''.join(seg_hypo)

    print(program_id)
    print('Pk:         ', pk(seg_true, seg_hypo, k))
    print('WindowDiff: ', windowdiff(seg_true, seg_hypo, k))

In [None]:
# Mean duration_s over the rows whose segment_id differs from their program id.
segs[segs.segment_id != segs.program]['duration_s'].mean()

In [None]:
# Print Pk / WindowDiff for every program collected in `unique_programs`, with metric window k=3.
for program_id in unique_programs:
    compute_metrics(program_id,  k=3)

In [None]:
# Plot the segmentation of program 5266518_001 (mean-of-window score).
_ = process_program('5266518_001', window_size=3, scoring_method='avg')

In [None]:
# Pk / WindowDiff for the same program with metric window k=3.
compute_metrics('5266518_001',  k=3)

In [None]:
# Plot the segmentation of program 5269476_001 (mean-of-window score).
_ = process_program('5269476_001', window_size=3, scoring_method='avg')

In [None]:
# Pk / WindowDiff for the same program with metric window k=42.
compute_metrics('5269476_001',  k=42)

In [None]:
# Plot the segmentation of program 5265990_001 (mean-of-window score).
_ = process_program('5265990_001', window_size=3, scoring_method='avg')

In [None]:
# Pk / WindowDiff for the same program with metric window k=42.
compute_metrics('5265990_001',  k=42)

In [None]:
# Plot the segmentation of program 5269507_001 (mean-of-window score).
_ = process_program('5269507_001', window_size=3, scoring_method='avg')

In [None]:
# Pk / WindowDiff for the same program with metric window k=42.
compute_metrics('5269507_001',  k=42)

In [None]:
# Show the row(s) whose segment_id is 5270036_001.
segs[segs.segment_id == '5270036_001']

# Yle: Generating content representations

In [None]:
# Switch to the Yle data: `subs` and `segs` are rebound here, replacing the INA tables loaded earlier.
# NOTE(review): process_program captured the earlier `segs` as a default argument when it was
# defined, so `segs=` must be passed explicitly when calling it on this data.
subs = pd.read_csv('data/yle_subtitles/urheiluruutu_subs.csv')
segs = pd.read_csv('data/yle_subtitles/urheiluruutu_segs.csv').sort_values(['program', 'start_s'])

In [None]:
# Preview the first rows of the Yle segments table.
segs.head()

In [None]:
# Distinct program ids in the Yle segments table.
segs.program.unique()

In [None]:
# Per-program segment timings, titles and SBERT title embeddings for the Yle data,
# cached in ./segments_data_yle.pickle.
# The existence check was inverted before: it tried to load the cache when the
# file was missing and recomputed everything when it was present.
if os.path.exists('./segments_data_yle.pickle'):
    # NOTE: unpickling can execute arbitrary code; only load a cache file you created yourself.
    with open('./segments_data_yle.pickle', 'rb') as cache_file:
        segments_data = pickle.load(cache_file)

else:
    segments_data = {}
    for program_id in tqdm(segs.program.unique()):
        try:
            # Rows of this program, excluding the row whose segment_id is the program id itself.
            segments = segs[(segs['program'] == program_id) & (segs['segment_id'] != program_id)]
            segments_data[program_id] = {'n_segments': len(segments)}
            segments_data[program_id]['start'] = segments.start_s.values.tolist()
            segments_data[program_id]['duration'] = segments.duration_s.values.tolist()
            segments_data[program_id]['end'] = segments.end_s.values.tolist()
            segments_data[program_id]['text'] = segments.title.values.tolist()
            segments_data[program_id]['embeddings'] = sbert.encode(segments.title.str.lower().values.tolist(), convert_to_tensor=True)
        except Exception as e:
            # Best effort: report the program and continue; its entry may be left incomplete.
            print(program_id, str(e))
    # `with` guarantees the cache file is flushed and closed.
    with open('./segments_data_yle.pickle', 'wb') as cache_file:
        pickle.dump(segments_data, cache_file)

In [None]:
# Per-program subtitle timings, text and SBERT embeddings for the Yle data,
# cached in ./subtitles_data_yle.pickle.
# The existence check was inverted before: it tried to load the cache when the
# file was missing and recomputed everything when it was present.
if os.path.exists('./subtitles_data_yle.pickle'):
    # NOTE: unpickling can execute arbitrary code; only load a cache file you created yourself.
    with open('./subtitles_data_yle.pickle', 'rb') as cache_file:
        subtitles_data = pickle.load(cache_file)

else:
    subtitles_data = {}
    for program_id in tqdm(subs.program.unique()):
        subtitles = subs[subs['program'] == program_id]
        subtitles_data[program_id] = {
            'n_subs': len(subtitles),
            'start': subtitles.start_s.values.tolist(),
            'duration': subtitles.duration_s.values.tolist(),
            'end': subtitles.end_s.values.tolist(),
            'text': subtitles.content.values.tolist(),
            'embeddings': sbert.encode(subtitles.content.values.tolist(), convert_to_tensor=True),
        }
    # `with` guarantees the cache file is flushed and closed.
    with open('./subtitles_data_yle.pickle', 'wb') as cache_file:
        pickle.dump(subtitles_data, cache_file)

In [None]:
# Cosine similarity between the segment-title embeddings and the subtitle
# embeddings of the same program; programs that fail are reported and skipped.
sim_with_title = {}

for program_id in tqdm(segs.program.unique()):
    try:
        res = util.pytorch_cos_sim(segments_data[program_id]['embeddings'],
                                   subtitles_data[program_id]['embeddings'])
        # .cpu() is a no-op for CPU tensors and lets .numpy() work when the embeddings live on a GPU.
        res = res.cpu().numpy()
        sim_with_title[program_id] = res
    except Exception as e:
        print(program_id, str(e))

In [None]:
# Subtitle-to-subtitle cosine similarity matrix of every program, with one heatmap per program.
subtitles_sim = {}

for program_id in tqdm(segs.program.unique()):
    sub_embeddings = subtitles_data[program_id]['embeddings']
    # .cpu() is a no-op for CPU tensors and lets .numpy() work when the embeddings live on a GPU.
    res = util.pytorch_cos_sim(sub_embeddings, sub_embeddings).cpu().numpy()
    subtitles_sim[program_id] = res

    fig, ax = plt.subplots(figsize=(9, 9))
    ax.set_title(program_id)
    sns.heatmap(res, ax=ax)
    # Render and release each figure inside the loop so open figures do not pile up.
    plt.show()
    plt.close(fig)

In [None]:
# Plot the segmentation of every Yle program; a failing program is reported and skipped.
for program_id in segs.program.unique():
    print(program_id)
    try:
        process_program(program_id, window_size=3, scoring_method='avg', segs=segs, subtitles_data=subtitles_data, segments_data=segments_data)
    except Exception as e:
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working,
        # and printing the message shows why the program failed.
        print(program_id, str(e))

In [None]:
# Number of segments stored for one example program.
segments_data['PROG_2020_00823937']['n_segments']

In [None]:
# Title of the row whose segment_id is the example program id.
segs[segs['segment_id'] == 'PROG_2020_00823937']['title'].values[0]

In [None]:
# Keep only the start/end timestamps of each program's segments.
segments_data_jorma = {}
for key, program_segments in segments_data.items():
    segments_data_jorma[key] = {
        'start': program_segments['start'],
        'end': program_segments['end'],
    }

In [None]:
# All segment rows of one example program.
segs[segs['program'] == 'PROG_2020_00823938']

In [None]:
# Display the timestamp-only segment dictionary (this prints every program's entry).
segments_data_jorma

In [None]:
# Subtitle start/end/duration timestamps for every program that has an entry in segments_data.
subtitles_data_jorma = {}
for key in segments_data:
    program_subtitles = subtitles_data[key]
    subtitles_data_jorma[key] = {
        field: program_subtitles[field] for field in ('start', 'end', 'duration')
    }

In [None]:
# Export the timestamp-only dictionaries. `with` guarantees each file is flushed and closed.
with open('yle_subtitles_timestamps.pickle', 'wb') as out_file:
    pickle.dump(subtitles_data_jorma, out_file)
with open('yle_parts_timestamps.pickle', 'wb') as out_file:
    pickle.dump(segments_data_jorma, out_file)

In [None]:
# Export the per-program subtitle similarity matrices. `with` guarantees the file is flushed and closed.
with open('yle_subtitle_neighborhood_similarity.pickle', 'wb') as out_file:
    pickle.dump(subtitles_sim, out_file)