# The Boys - Screentime Analysis

Simple screentime analysis of **The Boys** (seasons 1-3) <br>
Content of the analysis:
- **Screentime**: per character, per episode
- **Runtime**
- **Shared screentime** between characters

Some visuals: [Tableau Public](https://public.tableau.com/app/profile/mattia4114/viz/boys_16621399122060/TheBoys-ScreentimeAnalysis)

In [40]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Data

__Data Source__:   _Amazon Prime Video_ <br>
__Collection and Preparation__:   _[link](https://www.curiousgnu.com/movie-character-screen-time)_

In [5]:
# load a single episode file as an example of the raw data layout
example = pd.read_csv('./Data/s01e01.csv')

In [6]:
# each row holds an actor id (nconst), a character name and a start/end pair
example

Unnamed: 0,nconst,character,start,end
0,nm8488639,Benjy,32000,155000
1,nm4240263,Jamie,33000,155000
2,nm1069800,Queen Maeve,82000,155000
3,nm0651456,Desperate Thief #1,100000,155000
4,nm1102278,Homelander,130000,155000
...,...,...,...,...
81,nm1307435,Translucent,3165000,3423000
82,nm0881631,Billy Butcher,3226000,3423000
83,nm5092703,Mason,3426000,3490000
84,nm0637992,Mayor of Baltimore,3431000,3490000


## All characters in the show

In [7]:
def get_characters_from_episode(file):
    '''
    Return the distinct cast of a single episode file.

    The file is read as CSV; only the first row of every `nconst` is kept,
    and the result is reduced to the `nconst` and `character` columns.
    '''
    episode_rows = pd.read_csv(file)
    first_appearances = episode_rows.drop_duplicates(subset=['nconst'])

    # only the id and the name of each character are needed
    return first_appearances.loc[:, ['nconst', 'character']]

In [8]:
# os.listdir returns entries in arbitrary order and may also list non-data
# entries (e.g. checkpoint folders); keep only the episode CSV files, sorted,
# so that every run processes the same files in the same order.
episodes = sorted(name for name in os.listdir('./Data') if name.endswith('.csv'))

In [9]:
# gather the cast of every episode file, then keep one row per actor id

characters = (
    pd.concat([
        get_characters_from_episode(os.path.join('./Data', episode))
        for episode in episodes
    ])
    .drop_duplicates(subset=['nconst'])
    .set_index('nconst')
)
characters

Unnamed: 0_level_0,character
nconst,Unnamed: 1_level_1
nm8488639,Benjy
nm4240263,Jamie
nm1069800,Queen Maeve
nm0651456,Desperate Thief #1
nm1102278,Homelander
...,...
nm8493111,Starlighter
nm7624031,Hometeamer
nm0498083,Doctor
nm6364830,Stormchaser


In [10]:
# save the id -> character name lookup table (index `nconst` is written as the first column)
characters.to_csv('characters.csv')

## Screentime per episode

In [11]:
def get_character_screentime(file, characters):
    '''
    Compute the screentime (in seconds) of every known character in one episode.

    Parameters
    ----------
    file : str
        Path to an episode CSV with at least the columns `nconst`, `start`
        and `end`; `start` and `end` are divided by 1000 to get seconds.
    characters : pandas.DataFrame
        Frame indexed by `nconst`; the result has one row per index entry.

    Returns
    -------
    pandas.DataFrame
        Indexed like `characters`, with the columns `screentime` (seconds,
        rounded to 2 decimals, 0 for characters absent from the episode) and
        `season_episode` (the file name without directory and extension).
    '''
    df = pd.read_csv(file)

    # convert from millisecond to second
    df['start'] = df['start']/1000
    df['end'] = df['end']/1000

    # compute time on the screen
    df['screentime'] = df['end'] - df['start']

    # compute for each character the screentime for the episode
    per_character = df.groupby(by=['nconst'])[['screentime']].sum()

    # not all character in every episode, join so they show up and put 0 screentime for the episode
    result = characters.join(per_character)
    result = result.fillna(0)

    # explicit copy so that adding a column below does not write into a slice
    result = result[['screentime']].copy()

    # episode label taken from the file name itself, independent of the
    # directory the file lives in or the path separator in use
    result['season_episode'] = os.path.splitext(os.path.basename(file))[0]
    result = result.round(2)

    return result

In [12]:
# one screentime table per episode, stacked into a single long table
screentime = pd.concat([
    get_character_screentime(os.path.join('./Data', episode), characters)
    for episode in episodes
])
screentime

Unnamed: 0_level_0,screentime,season_episode
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1
nm8488639,123.0,s01e01
nm4240263,122.0,s01e01
nm1069800,510.0,s01e01
nm0651456,55.0,s01e01
nm1102278,253.0,s01e01
...,...,...
nm8493111,24.0,s03e08
nm7624031,46.0,s03e08
nm0498083,53.0,s03e08
nm6364830,15.0,s03e08


In [13]:
# episode labels look like "s01e03": characters 1-2 are the season number,
# the last two characters are the episode number
episode_label = screentime['season_episode']
screentime = screentime.assign(
    season=episode_label.map(lambda label: int(label[1:3])),
    episode=episode_label.map(lambda label: int(label[-2:])),
)

# running episode number across seasons (usable as an episode id);
# the formula counts 8 episodes for every season
screentime = screentime.assign(
    cum_episode=screentime['episode'] + ((screentime['season']-1)*8)
)

screentime = screentime.drop(columns=['season_episode'])
screentime

Unnamed: 0_level_0,screentime,season,episode,cum_episode
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
nm8488639,123.0,1,1,1
nm4240263,122.0,1,1,1
nm1069800,510.0,1,1,1
nm0651456,55.0,1,1,1
nm1102278,253.0,1,1,1
...,...,...,...,...
nm8493111,24.0,3,8,24
nm7624031,46.0,3,8,24
nm0498083,53.0,3,8,24
nm6364830,15.0,3,8,24


In [14]:
# save the per-episode screentime table (index `nconst` is written as the first column)
screentime.to_csv('screentime_per_episode.csv')

In [15]:
# ids are hard to search by, so attach the character name to every row
screentime_with_name = screentime.join(characters)

# example lookup: all episodes of one character
is_butcher = screentime_with_name['character'].eq('Billy Butcher')
screentime_with_name.loc[is_butcher]

Unnamed: 0_level_0,screentime,season,episode,cum_episode,character
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0881631,1207.0,1,1,1,Billy Butcher
nm0881631,1374.0,1,2,2,Billy Butcher
nm0881631,2134.0,1,3,3,Billy Butcher
nm0881631,1604.0,1,4,4,Billy Butcher
nm0881631,751.0,1,5,5,Billy Butcher
nm0881631,1283.0,1,6,6,Billy Butcher
nm0881631,990.0,1,7,7,Billy Butcher
nm0881631,1222.0,1,8,8,Billy Butcher
nm0881631,61.0,2,1,9,Billy Butcher
nm0881631,1018.0,2,2,10,Billy Butcher


## Total screentime

In [16]:
# sum the screentime of each character over all episodes, largest first
seconds_per_character = (
    screentime
    .groupby('nconst')[['screentime']]
    .sum()
    .sort_values(by='screentime', ascending=False)
)

# attach the character name so the table is readable
total_screentime = seconds_per_character.join(characters)

# seconds -> whole minutes
total_screentime['screentime'] = (total_screentime['screentime']/60).round()

total_screentime

Unnamed: 0_level_0,screentime,character
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1
nm4425051,466.0,Hughie Campbell
nm0881631,447.0,Billy Butcher
nm3929195,342.0,Starlight
nm1102278,341.0,Homelander
nm0022306,320.0,Mother’s Milk
...,...,...
nm5045671,0.0,Guy
nm2555508,0.0,Samantha
nm0035334,0.0,Mamasan Matron
nm0640669,0.0,Self


## The Seven vs The Boys Screentime

In [17]:
# members of each group, written exactly as they appear in the `character`
# column (the lists are matched against that column with `isin`)
the_boys = ['Hughie Campbell', 'Billy Butcher', 'Mother’s Milk', 'Frenchie', 'The Female']
the_seven = ['Starlight', 'Homelander', 'Queen Maeve', 'A-Train', 'The Deep', 'Stormfront', 'Black Noir', 'Translucent']

In [24]:
# combined screentime (minutes) of all members of The Boys
is_the_boys = total_screentime['character'].isin(the_boys)
the_boys_total = total_screentime.loc[is_the_boys, 'screentime'].sum()

the_boys_minutes = round(the_boys_total)
the_boys_hours = round(the_boys_total/60)
print(f'The Boys total screentime in minutes: {the_boys_minutes}')
print(f'The Boys total screentime in hours: {the_boys_hours}')

The Boys total screentime in minutes: 1755
The Boys total screentime in hours: 29


In [25]:
# combined screentime (minutes) of all members of The Seven
is_the_seven = total_screentime['character'].isin(the_seven)
the_seven_total = total_screentime.loc[is_the_seven, 'screentime'].sum()

the_seven_minutes = round(the_seven_total)
the_seven_hours = round(the_seven_total/60)
print(f'The Seven total screentime in minutes: {the_seven_minutes}')
print(f'The Seven total screentime in hours: {the_seven_hours}')

The Seven total screentime in minutes: 1252
The Seven total screentime in hours: 21


## Shared Screentime between characters

In [26]:
# we will use the following data structure
# d = {
#       char_1: {
#                   char_2: x,
#                   char_3: y,
#                   ....
#               },
#       char_2: {},
#       ....
# }
# so for each character we can see the shared screentime with every other character


In [28]:
# nested dictionaries as data structure:
# one (initially empty) inner dictionary per character id
shared_screentime = {nconst: {} for nconst in characters.index}

In [38]:
def get_shared_screentime(df, shared_screentime):
    '''
    Add, for one episode, the time each pair of characters is on screen together.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per appearance, indexed by character id, with numeric
        `start` and `end` columns expressed in the same time unit.
    shared_screentime : dict
        Nested mapping `{id_a: {id_b: shared_time}}`. It is updated in place,
        symmetrically (a->b and b->a), and missing ids are created on demand.

    Returns
    -------
    dict
        The same `shared_screentime` object, with this episode's overlaps added.
    '''
    # The early exit in the inner loop is only valid when appearances are
    # ordered by start time, so enforce it (stable: ties keep their order).
    ordered = df.sort_values('start', kind='stable')

    # Plain lists give unambiguous positional access; integer lookups on a
    # label-indexed Series are deprecated in pandas.
    ids = ordered.index.tolist()
    starts = ordered['start'].tolist()
    ends = ordered['end'].tolist()

    for i in range(len(ids)):
        first = ids[i]
        for j in range(i + 1, len(ids)):
            if not ends[i] > starts[j]:
                # every later row starts even later: nothing else overlaps row i
                break

            second = ids[j]
            if first == second:
                # two overlapping rows of the same id are not a pair of characters
                continue

            # row j starts no earlier than row i, so the overlap runs from
            # starts[j] to whichever of the two appearances ends first
            shared = min(ends[i], ends[j]) - starts[j]
            if not shared > 0:
                continue

            first_row = shared_screentime.setdefault(first, {})
            second_row = shared_screentime.setdefault(second, {})
            first_row[second] = first_row.get(second, 0) + shared
            second_row[first] = second_row.get(first, 0) + shared

    return shared_screentime

In [41]:
# NOTE: each run of this loop adds to the existing totals in `shared_screentime`
for episode in episodes:
    episode_path = os.path.join('./Data', episode)
    intervals = pd.read_csv(episode_path).set_index('nconst')[['start', 'end']]

    # milliseconds -> seconds before accumulating the overlaps
    shared_screentime = get_shared_screentime(intervals/1000, shared_screentime)


In [43]:
# flatten the nested dictionary into one record per (character, partner) pair
pair_records = [
    (first_id, second_id, seconds)
    for first_id, partners in shared_screentime.items()
    for second_id, seconds in partners.items()
]

# turn the records into a three-column dataframe
shared_screentime = pd.DataFrame({
    'First_character' : [record[0] for record in pair_records],
    'Second_character' : [record[1] for record in pair_records],
    'Shared_time' : [record[2] for record in pair_records]
})

In [44]:
# one row per ordered pair of character ids
shared_screentime

Unnamed: 0,First_character,Second_character,Shared_time
0,nm8488639,nm4240263,488.0
1,nm8488639,nm1069800,292.0
2,nm8488639,nm0651456,220.0
3,nm8488639,nm1102278,100.0
4,nm4240263,nm8488639,488.0
...,...,...,...
3167,nm7624031,nm7805172,24.0
3168,nm0498083,nm0881631,212.0
3169,nm6364830,nm7624031,16.0
3170,nm5045671,nm1102278,36.0


In [45]:
# sort for max shared time
shared_screentime = shared_screentime.sort_values(by = ['Shared_time'], ascending = False)

# replace each id with the character's name.
# `characters` is indexed by id, so mapping through its `character` column is
# a label-based lookup (no positional integer indexing) done in one pass.
id_to_name = characters['character']
shared_screentime['First_character'] = shared_screentime['First_character'].map(id_to_name)
shared_screentime['Second_character'] = shared_screentime['Second_character'].map(id_to_name)

# in minutes
shared_screentime['Shared_time'] = round(shared_screentime['Shared_time']/60)

In [46]:
# same table, now with character names and shared time in minutes
shared_screentime

Unnamed: 0,First_character,Second_character,Shared_time
830,Billy Butcher,Hughie Campbell,1043.0
220,Hughie Campbell,Billy Butcher,1043.0
1249,Mother’s Milk,Hughie Campbell,832.0
234,Hughie Campbell,Mother’s Milk,832.0
848,Billy Butcher,Mother’s Milk,811.0
...,...,...,...
2938,Claudio,Young Black Noir,0.0
2939,Claudio,Soldier Boy,0.0
2943,Young Black Noir,Claudio,0.0
3001,Blue Hawk,Doctor,0.0


In [47]:
# renumber the rows 0..n-1 in their current order, discarding the old index
shared_screentime = shared_screentime.reset_index(drop = True)

In [48]:
# each pair appears twice (A-B and B-A); keep only its first row

# build an order-independent key for each pair: the two names in sorted order.
# Comparing whole names (not the letters they are made of) guarantees that two
# different pairs never end up with the same key.
first_name = shared_screentime['First_character']
second_name = shared_screentime['Second_character']
needs_swap = first_name > second_name
pair_key = pd.DataFrame({
    'low': first_name.where(~needs_swap, second_name),
    'high': second_name.where(~needs_swap, first_name),
})

# remove rows whose pair key was already seen
# NOTE(review): pairs are identified by character name here; if two different
# ids share a name their rows are merged - confirm this is acceptable.
shared_screentime = shared_screentime[~pair_key.duplicated()]

shared_screentime

Unnamed: 0,First_character,Second_character,Shared_time
0,Billy Butcher,Hughie Campbell,1043.0
2,Mother’s Milk,Hughie Campbell,832.0
4,Billy Butcher,Mother’s Milk,811.0
6,Mother’s Milk,Frenchie,792.0
8,Frenchie,The Female,735.0
...,...,...,...
3160,Female News Anchor,Billy Butcher,0.0
3164,The Female,Little Nina,0.0
3166,Soldier Boy,Claudio,0.0
3167,Claudio,Young Black Noir,0.0


In [49]:
# save the de-duplicated pair table (the row index is written as the first column)
shared_screentime.to_csv('shared_screentime.csv')