# Analyse subtitles of Harry Potter 4

In this notebook we'll go through the Harry Potter 4 subtitles to see whether we can find entities in the text and map them to the movie's characters.

In [1]:
%load_ext autoreload
%autoreload 2

## Import packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

from bechdelai.nlp.process_srt import load_srt
from bechdelai.data.tmdb import get_movie_cast_from_id
from bechdelai.nlp.analyse_srt import extract_person_references_in_srt

2023-01-19 10:41:56,319 loading file C:\Users\natha\.flair\models\ner-english\4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
2023-01-19 10:41:58,184 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [3]:
pd.set_option('display.max_rows', 100)

## Load data

Srt file

In [4]:
# Subtitle file path, relative to the notebook's working directory
fpath = "../../../data/srt/harry_potter_4.srt"
# Parsed subtitle entries; each one prints as an index, a time range and its text (see next cell)
srt_list = load_srt(fpath)

In [5]:
for srt in srt_list[:2]:
    print(srt)

1
00:01:46,125 --> 00:01:48,543
Bloody kids.

2
00:02:35,007 --> 00:02:38,676
How fastidious you've become,
Wormtail.



TMDB metadata

In [6]:
# TMDB id used to fetch the credits of the film analysed here; only the "cast" entry of the response is kept
tmdb_id = "674"
cast = get_movie_cast_from_id(tmdb_id)["cast"]

In [7]:
cast_df = pd.DataFrame(cast)
cast_df.head()

Unnamed: 0,adult,gender,id,known_for_department,name,original_name,popularity,profile_path,cast_id,character,credit_id,order
0,False,2,10980,Acting,Daniel Radcliffe,Daniel Radcliffe,43.271,/iPg0J9UzAlPj1fLEJNllpW9IhGe.jpg,1,Harry Potter,52fe4268c3a36847f801c21d,0
1,False,2,10989,Acting,Rupert Grint,Rupert Grint,24.562,/q2KZZ0ltTEl7Sf8volNFV1JDEP4.jpg,2,Ron Weasley,52fe4268c3a36847f801c221,1
2,False,1,10990,Acting,Emma Watson,Emma Watson,45.967,/tvPPRGzAzdQFhlKzLbMO1EpuTJI.jpg,3,Hermione Granger,52fe4268c3a36847f801c225,2
3,False,2,1923,Acting,Robbie Coltrane,Robbie Coltrane,15.623,/jOHs3xvlwRiiG2CLtso5zzmGCXg.jpg,7,Rubeus Hagrid,52fe4268c3a36847f801c235,3
4,False,2,5469,Acting,Ralph Fiennes,Ralph Fiennes,76.417,/tJr9GcmGNHhLVVEH3i7QYbj6hBi.jpg,4,Lord Voldemort,52fe4268c3a36847f801c229,4


In [28]:
# Lower-cased character name of every cast entry.
# NOTE(review): match_entities_with_cast builds its own lower-cased Series from cast_df,
# so this list looks unused by the matching code below — confirm before removing.
cast_characters = [member["character"].lower() for member in cast]

## Get entities and nouns

In [102]:
%%time
# NOTE(review): the second argument (2) presumably limits how many subtitles are processed —
# the debug output shows two "====" separated groups; confirm against extract_person_references_in_srt.
results = extract_person_references_in_srt(srt_list, 2)

[('kids', 1, 2, 'NOUN')]
[]
[]
[]
====
[('you', 2, 3, 'PRON'), ('Wormtail', 7, 8, 'PROPN')]
[Span[6:7]: "Wormtail" → MISC (0.4482)]
[7]
[('Wormtail', 6, 7, 'MISC')]
====
Wall time: 491 ms


In [103]:
results

Unnamed: 0,srt_id,text,start_sec,end_sec,ent,start_idx,end_idx,ent_type,gender
0,1,"How fastidious you've become,\nWormtail.",155,158,Wormtail,6,7,MISC,unknown
1,1,"How fastidious you've become,\nWormtail.",155,158,you,2,3,PRON,unknown
2,1,"How fastidious you've become,\nWormtail.",155,158,Wormtail,7,8,PROPN,unknown


In [None]:
from rapidfuzz.distance import Levenshtein
from itertools import permutations
import itertools
from string import punctuation

# Load the English stop-word list: the file content is split on "\n" and the
# pieces are collected into a set used for membership tests when cleaning entities.
with open("../../../data/nlp/stop_words_english.txt", mode="r", encoding="utf-8") as stopwords_file:
    stopwords = {word for word in stopwords_file.read().split("\n")}
    

def compute_dist_on_each_words(s1, s2):
    """Return the smallest Levenshtein distance between a word of ``s1`` and a word of ``s2``.

    Both strings are split on whitespace and every (word of ``s1``, word of
    ``s2``) pair is compared. ``min`` raises ``ValueError`` when there is no
    pair to compare, i.e. when either string contains no word.
    """
    word_pairs = itertools.product(s1.split(), s2.split())
    return min(Levenshtein.distance(left, right) for left, right in word_pairs)
    

def match_one_entity_with_cast(txt: str, cast_df: pd.DataFrame, cast_characters: pd.Series) -> pd.DataFrame:
    """Find the cast entry whose character name matches an entity text.

    An exact match on the full character name is tried first; otherwise the
    first character having ``txt`` as one of its whitespace-separated words is
    used.

    Parameters
    ----------
    txt : str
        Entity text, expected to be normalised the same way as
        ``cast_characters`` (the caller lower-cases both).
    cast_df : pd.DataFrame
        Cast table with at least the ``character`` and ``gender`` columns.
    cast_characters : pd.Series
        Character names to compare against, sharing ``cast_df``'s index.

    Returns
    -------
    pd.DataFrame
        A single row with the columns ``character`` and ``gender``; both are
        empty strings when no character matches.
    """
    columns = ["character", "gender"]

    # `txt in series` tests the index labels, not the values, so the values
    # are compared explicitly to detect a full-name match.
    exact_match = cast_characters == txt
    if exact_match.any():
        return cast_df.loc[exact_match, columns].iloc[0].to_frame().T

    for name in cast_characters:
        # A missing character name (None/NaN) cannot be split into words.
        if isinstance(name, str) and txt in name.split():
            return cast_df.loc[cast_characters == name, columns].iloc[0].to_frame().T

    return pd.DataFrame([("", "")], columns=columns)

def remove_stopwords_and_punctuation(txt: str) -> str:
    """Drop stop words from ``txt``, then delete every punctuation character.

    Words are the whitespace-separated tokens of ``txt``, compared as-is with
    the module-level ``stopwords`` set. The kept words are re-joined with
    single spaces before the ``string.punctuation`` characters are removed.
    """
    kept_words = []
    for word in txt.split():
        if word in stopwords:
            continue
        kept_words.append(word)

    punctuation_remover = str.maketrans('', '', punctuation)
    return " ".join(kept_words).translate(punctuation_remover)

def match_entities_with_cast(entities_found: pd.DataFrame, cast_df: pd.DataFrame) -> pd.DataFrame:
    """Attach the matching character name and gender to each extracted entity.

    The ``ent`` text is normalised (lower-cased, stop words and punctuation
    removed, stripped) for matching only; the returned ``ent`` column keeps
    its original text. Entities whose normalised text is empty are not matched.

    Parameters
    ----------
    entities_found : pd.DataFrame
        Extracted entities with at least the ``ent`` and ``gender`` columns.
        It is not modified.
    cast_df : pd.DataFrame
        Cast table with at least the ``character`` and ``gender`` columns,
        where gender ``1`` is mapped to ``"woman"`` and ``2`` to ``"man"``.

    Returns
    -------
    pd.DataFrame
        A copy of ``entities_found`` with a new ``character_found`` column
        (empty string when nothing matched) and ``gender`` overwritten where
        the matched cast entry provides one.
    """
    # Normalised entity text, used only to look characters up.
    ent_clean = (
        entities_found["ent"]
        .str.lower()
        .apply(remove_stopwords_and_punctuation)
        .str.strip()
    )
    cast_characters = cast_df["character"].str.lower()

    character = pd.Series("", index=entities_found.index, dtype=object)
    gender = pd.Series("", index=entities_found.index, dtype=object)

    to_match = ent_clean != ""
    # pd.concat raises on an empty list, so only match when something is left
    # after cleaning (covers empty input and stop-word-only entities).
    if to_match.any():
        matches = pd.concat(
            [
                match_one_entity_with_cast(ent, cast_df, cast_characters)
                for ent in ent_clean[to_match]
            ]
        )
        character.loc[to_match] = matches["character"].values
        gender_codes = matches["gender"].values
        gender.loc[to_match] = np.where(
            gender_codes == 1, "woman", np.where(gender_codes == 2, "man", "")
        )

    # Work on a copy so the caller's frame (already displayed in earlier
    # cells) is left untouched.
    result = entities_found.copy()
    result["character_found"] = character
    result["gender"] = np.where(gender != "", gender, result["gender"])

    return result

# Preview the first 50 rows of the entity table enriched with the matched character and gender
match_entities_with_cast(results, cast_df).head(50)

Unnamed: 0,srt_id,text,start_sec,end_sec,ent,start_idx,end_idx,ent_type,gender,character_found
0,1,"How fastidious you've become,\nWormtail.",155,158,Wormtail,7,8,MISC,unknown,
1,1,"How fastidious you've become,\nWormtail.",155,158,you,2,3,PRON,unknown,
2,2,"As I recall, you once called\nthe nearest gutt...",158,163,I,1,2,PRON,unknown,
3,2,"As I recall, you once called\nthe nearest gutt...",158,163,you,4,5,PRON,unknown,
4,3,Could it be that the task of nursing me\nhas b...,163,168,it,1,2,PRON,unknown,
5,3,Could it be that the task of nursing me\nhas b...,163,168,me,8,9,PRON,unknown,
6,3,Could it be that the task of nursing me\nhas b...,163,168,you,14,15,PRON,unknown,
7,4,"Oh, no. No, no, my Lord Voldemort.",168,171,Voldemort,11,12,PER,man,Lord Voldemort
8,4,"Oh, no. No, no, my Lord Voldemort.",168,171,Lord,9,10,PROPN,man,Lord Voldemort
9,4,"Oh, no. No, no, my Lord Voldemort.",168,171,Voldemort,10,11,PROPN,man,Lord Voldemort


In [23]:
len(stopwords)

851

In [9]:
# Random preview of 20 rows; no random_state is passed, so the selection changes from run to run
results.sample(20)

Unnamed: 0,srt_id,text,start_sec,end_sec,ent,start_idx,end_idx,ent_type,gender
17,8,It cannot be done without him.\nAnd it will be...,178,182,him,6,7,PRON,man
143,94,Krum!,548,549,Krum,0,1,PROPN,unknown
1,1,"How fastidious you've become,\nWormtail.",155,158,you,2,3,PRON,unknown
129,87,It's the Irish! There's Troy!,506,508,There,5,6,PRON,unknown
135,89,Ireland! Ireland! Ireland!,516,519,Ireland,4,5,PROPN,unknown
123,85,Come on up. Take your seats. I told you\nthese...,497,502,I,8,9,PRON,unknown
140,93,Krum! Krum! Krum!,540,543,Krum,0,1,PROPN,unknown
50,28,"- Ron, where are we actually going?\n- Don't k...",259,262,we,5,6,PRON,unknown
85,53,"I'll bet that cleared your sinuses, eh?",357,359,I,0,1,PRON,unknown
90,63,"- Cedric.\n- Ced, come on.",399,400,Cedric,2,3,PER,unknown


TODO :
- Match with character name OK

- Handle idx problem
- remove other notebooks 
- remove subtitles.py
- remove unused function
- get path syn dict from path
- documentation
- Test ?