# Bechdel test on subtitles

In [1]:
%load_ext autoreload
%autoreload 2

## Import packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import chain
from itertools import permutations

In [3]:
import spacy
import neuralcoref

In [4]:
from flair.data import Sentence
from flair.models import SequenceTagger

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
pd.set_option('display.max_rows', 100)

## Get data

- Use flair for NER ==> identify PER 
    https://huggingface.co/flair/ner-english-large?text=George+Washington+went+to+Washington
- Use neuralcoref for pronoun corresp ==> identify if talking about a person
    https://github.com/huggingface/neuralcoref
- mapping of entities (clustering)
- Map character gender
    - with neuralcoref output to rerun it
    - Use TMDB to map character and gender (apriori)

- identification when a person is mentioned in a sentence (pronoun)
- Correspondence with gender

- Missing : who speak ? which scene ?

## Test neuralcoref

In [6]:
nlp = spacy.load('en_core_web_sm')

In [8]:
# Let's try before using the conversion dictionary:
neuralcoref.add_to_pipe(nlp)
doc = nlp(u'Deepika has a dog. She loves him. The movie star has always been fond of animals')
doc._.coref_clusters
# >>> [Deepika: [Deepika, She, him, The movie star]]
doc._.coref_resolved
# >>> 'Deepika has a dog. Deepika loves Deepika. Deepika has always been fond of animals'

'Deepika has a dog. Deepika loves Deepika. Deepika has always been fond of animals'

In [9]:
# Here are three ways we can add the conversion dictionary
nlp.remove_pipe("neuralcoref")
neuralcoref.add_to_pipe(nlp, conv_dict={'Deepika': ['woman', 'actress']})
# or
nlp.remove_pipe("neuralcoref")
coref = neuralcoref.NeuralCoref(nlp.vocab, conv_dict={'Deepika': ['woman', 'actress']})

In [10]:
nlp.add_pipe(coref, name='neuralcoref')
# or after NeuralCoref is already in SpaCy's pipe, by modifying NeuralCoref in the pipeline
nlp.get_pipe('neuralcoref').set_conv_dict({'Deepika': ['woman', 'actress']})

# Let's try agin with the conversion dictionary:
doc = nlp(u'Deepika has a dog. She loves him. The movie star has always been fond of animals')
doc._.coref_clusters
# >>> [Deepika: [Deepika, She, The movie star], a dog: [a dog, him]]
# >>> 'Deepika has a dog. Deepika loves a dog. Deepika has always been fond of animals'
# >>> A lot better!

[Deepika: [Deepika, She, The movie star], a dog: [a dog, him]]

## Test flair

In [7]:
%%time
# load the NER tagger
tagger = SequenceTagger.load('ner')

2022-05-24 21:05:37,475 loading file C:\Users\natha\.flair\models\ner-english\4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
2022-05-24 21:05:40,607 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
CPU times: total: 5.47 s
Wall time: 1min 23s


In [109]:
s = "I love Daft Punk! This is the best music band of the world" 

In [128]:
# make a sentence
sentence = Sentence(s)

# run NER over sentence
tagger.predict(sentence)

# iterate over entities and print each
for entity in sentence.get_spans('ner'):
    print(entity)
    print(entity.start_position, entity.end_position)    
    print(entity.tag)
    print(entity.text)  
    print(entity.tokens)
    print() 

Span[2:5]: "Daft Punk !" → MISC (0.7194)
7 17
MISC
Daft Punk !
[Token[2]: "Daft", Token[3]: "Punk", Token[4]: "!"]
2 5


## Load data

In [8]:
from subtitles import load_srt, format_srt, get_dist_between_2_srt

In [9]:
fpath = "data/Femme Fatale 2002 [1080pBR-Remux-playBD] Forced.srt"
fpath = "C:/Users/natha/Downloads/Mulan.1998.Disney.Classics.Timeless.Collection.1080p.BluRay.x264-OPUSLAW.hi.srt"

In [10]:
srt_list = load_srt(fpath)

In [11]:
def create_txt_blocks(srt_list, block_gap=2):
    prev_t = srt_list[0].end
    txt = format_srt(srt_list[0].text)
    blocks = []
    
    for i, srt in enumerate(srt_list[1:]):
        txt_ = format_srt(srt.text)
        t_gap = get_dist_between_2_srt(prev_t, srt.start)
        prev_t = srt.end
        
        if (t_gap > block_gap) or (i == len(srt_list) - 1):
            blocks.append(f"{txt} {txt_}")
            txt = ""
        
        else:
            txt = f"{txt} {txt_}"
            
    blocks = map(lambda x: x.strip().replace("  ", " "), blocks)
    blocks = list(filter(lambda x: x != "", blocks))
        
    return blocks  

In [14]:
srt_blocks = create_txt_blocks(srt_list, block_gap=0)

In [16]:
# for srt in srt_blocks:
#     print(srt)
    

## Explore pipeline

In [13]:
neuralcoref.add_to_pipe(nlp)
# doc = nlp(u'Deepika has a dog. She loves him. The movie star has always been fond of animals')
# doc._.coref_clusters
# >>> [Deepika: [Deepika, She, him, The movie star]]
# doc._.coref_resolved

<spacy.lang.en.English at 0x1f4bb9d9910>

In [65]:
s = "Fa Li, is your daughter here yet?"
s = 'please help Mulan impress the Matchmaker today.'
s = "please help Mulan! She impressed the Matchmaker today."

# make a sentence
sentence = Sentence(s)

# run NER over sentence
tagger.predict(sentence)
res = nlp(s)

https://www.dictionary.com/browse/
- person
- female, male

In [138]:
def merge_if_in(elements):
    delete_i = []
    for i,j in permutations(range(len(elements)), 2):
        if i in delete_i:
            continue
        e1 = elements[i]
        e2 = elements[j]
        
        start1, end1 = e1[1], e1[2]
        start2, end2 = e2[1], e2[2]
        
        if (start1 <= start2 <= end1) & (start1 <= end2 <= end1):
            delete_i.append(j)
    
    return [e for (i, e) in enumerate(elements) if i not in delete_i]
    
def extract_keyword(txt):
    # make a sentence
    sentence = Sentence(txt)

    # run NER over sentence
    tagger.predict(sentence)
    
    # Run spacy en + neuralcoref
    doc = nlp(txt)
    
    vocab = list(
        map(
            lambda x: (x.text, x.start, x.end, x.root.pos_), # x.root.pos_
            doc._.coref_scores.keys()
        )
    )
    vocab = list(
        filter(
            lambda x: x[3] in ["NOUN", "PRON", "PROPN"], vocab
        )
    )
    entities = list(
        filter(
            lambda x: x.tag in ["PER", "MISC"], 
            sentence.get_spans('ner')
        )
    )
    entities = [
        (e.text, e.tokens[0].idx - 1, e.tokens[-1].idx, e.tag) for e in entities
    ] 
    entities += vocab
    entities = merge_if_in(entities)
    
    return entities

In [140]:
%%time
entities = []
for srt in srt_blocks:
    entities.append(extract_keyword(srt))

CPU times: total: 39min 21s
Wall time: 4min 56s


In [141]:
len(entities)

967

In [143]:
entities

[[('We', 0, 1, 'PRON'),
  ('attack', 3, 4, 'NOUN'),
  ('the signal', 6, 8, 'NOUN')],
 [('the fire', 1, 3, 'NOUN'), ('the signal', 6, 8, 'NOUN')],
 [('China', 3, 4, 'PROPN')],
 [],
 [('Your Majesty', 0, 2, 'PROPN'),
  ('the Huns', 3, 5, 'NOUN'),
  ('our northern border', 7, 10, 'NOUN')],
 [('No one', 2, 4, 'NOUN'), ('the Great Wall', 7, 10, 'PROPN')],
 [('Shan-Yu', 0, 3, 'PROPN'), ('them', 5, 6, 'PRON')],
 [('We', 0, 1, 'PRON'), ('defenses around your palace', 4, 8, 'NOUN')],
 [('Chi Fu', 9, 11, 'PER'),
  ('your troops', 3, 5, 'NOUN'),
  ('my people', 7, 9, 'NOUN'),
  ('Chi Fu', 10, 12, 'PROPN')],
 [('Your Highness ?', 2, 5, 'MISC')],
 [('conscription notices', 1, 3, 'NOUN'), ('all the provinces', 4, 7, 'NOUN')],
 [('reserves', 2, 3, 'NOUN'),
  ('many new recruits as possible', 5, 10, 'NOUN')],
 [('Your Majesty', 3, 5, 'PROPN'),
  ('my troops', 9, 11, 'NOUN'),
  ('him', 13, 14, 'PRON')],
 [('any chances', 4, 6, 'NOUN'), ('General', 7, 8, 'PROPN')],
 [('A single grain of rice', 0, 5, 'NO

In [142]:
# for txt in srt_blocks:
#     # make a sentence
#     sentence = Sentence(txt)

#     # run NER over sentence
#     tagger.predict(sentence)

#     entities = list(filter(lambda x: x.tag in ["PER", "MISC"], sentence.get_spans('ner')))
#     res = nlp(txt)
    
#     propn = list(
#         map(
#             lambda x: [
#                 (c, c.root.pos_) 
#                 for c 
#                 in filter(
#                     lambda y: y.root.pos_.startswith("PRO"), x.noun_chunks
#                 )
#             ], 
#             res._.coref_scores.keys()
#         )
#     )
#     propn = list(chain(*propn))
    
#     print(txt)
#     print(res._.coref_clusters)
#     print(propn)
#     print(entities)
    
#     print()
    
    
#     # iterate over entities and print each
#     # for entity in entities:
#         # print(sentence)
#         # print(entity)
#         # print(entity.start_position, entity.end_position)    
#         # print(entity.tag)
#         # print(entity.text)    
    
    