In [None]:
import glob
import re
from nltk.corpus import stopwords
import nltk
import stanza
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from itertools import combinations
from collections import Counter
import pandas as pd
import json
import string

In [None]:
# Lookup table of character names and genders used by get_gender().
# Names are lower-cased; the 'M'/'F' codes become 'male'/'female'
# (any other code maps to NaN).
names_df = pd.read_csv('disney_names_and_genders.csv', header=0).assign(
    Name=lambda frame: frame['Name'].str.lower(),
    Gender=lambda frame: frame['Gender'].map({'M': 'male', 'F': 'female'}),
)

In [None]:
# Glob pattern selecting every movie script/plot text file.
movies_path = "movies/*.txt"
# glob.glob() returns paths in file-system order; sort them so the file_idx
# values printed while processing are the same on every run and machine.
files = sorted(glob.glob(movies_path))

In [None]:
# English stanza pipeline: tokenisation, POS tagging, lemmatisation and NER.
nlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma,ner")

# The imports cell only downloads 'punkt_tab' and the perceptron tagger, so on
# a fresh environment the stopwords corpus is missing and NLTK raises
# LookupError. Fetch it on demand instead of failing.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))

In [None]:
def fix_gpe(tokens, entities):
    """
    Merge multi-word entities in a token list into single dash-joined tokens.

    tokens: list of words in a sentence (``None`` items are dropped)
    entities: list/set of (entity_text, entity_type)
    Returns:
        - new token list with multi-word entities joined by '-'
        - updated set of entities including merged versions

    Matching is case-sensitive: tokens and entity texts must use the same case.
    """
    tokens = [tok for tok in tokens if tok is not None]
    text_str = " ".join(tokens)
    merged_entities = set()

    # Longest entities first: otherwise merging "new york" turns the text into
    # "new-york city" and "new york city" can no longer match. Sorting also
    # makes the result independent of set iteration order.
    ordered_entities = sorted(entities, key=lambda ent: (-len(ent[0]), ent[0]))

    for entity_text, ent_type in ordered_entities:
        if " " in entity_text:  # only merge multi-word entities
            merged_text = entity_text.replace(" ", "-")
            # Look-arounds instead of \b: \b never matches after an entity that
            # ends in punctuation (e.g. "washington d.c.").
            pattern = r'(?<!\w)' + re.escape(entity_text) + r'(?!\w)'
            # A callable replacement keeps backslashes in the entity literal.
            text_str = re.sub(pattern, lambda _m, rep=merged_text: rep, text_str)
            merged_entities.add((merged_text, ent_type))
        else:
            merged_entities.add((entity_text, ent_type))

    # "".split(" ") would yield [''], so return an empty list for empty input.
    new_tokens = text_str.split(" ") if tokens else []
    return new_tokens, merged_entities


In [None]:
def get_gender(name):
    """
    Look up the gender of a character name in the module-level ``names_df``.

    name: character name (any case, surrounding whitespace ignored)
    Returns:
        the 'Gender' value of the first row whose 'Name' equals the name;
        otherwise the most frequent gender among rows whose 'Name' contains
        any whitespace-separated part of the name (ties go to the gender
        encountered first); 'Unknown' when the name is missing, empty, not a
        string, or nothing usable matches.
    """
    if not isinstance(name, str):
        return 'Unknown'

    name = name.lower().strip()
    if name == '':
        return 'Unknown'

    # Exact match first. An explicit emptiness check replaces the former bare
    # ``except`` so unrelated errors are no longer swallowed.
    exact = names_df.loc[names_df['Name'] == name, 'Gender']
    if len(exact) > 0:
        gender = exact.iloc[0]
        # Rows whose gender code was not mapped hold NaN rather than a string.
        return gender if isinstance(gender, str) else 'Unknown'

    # Fallback: collect the genders of every row containing a part of the name.
    genders = []
    for part in name.split():
        # regex=False: the part is literal text, not a pattern.
        # na=False: rows with a missing name simply do not match.
        mask = names_df['Name'].str.contains(part, regex=False, na=False)
        genders += [g for g in names_df.loc[mask, 'Gender'].to_list() if isinstance(g, str)]

    gender_counts = Counter(genders)
    if not gender_counts:
        return 'Unknown'
    # max() returns the first key with the highest count, i.e. ties go to the
    # gender that was encountered first.
    return max(gender_counts, key=gender_counts.get)


In [None]:
def clean_token(token):
    """
    Drop every character of ``token`` that is not an ASCII letter, a digit or
    one of the accented letters in the ranges À-Ö, Ø-ö, ø-ÿ.

    Returns the cleaned string, or None when ``token`` is None.
    """
    if token is not None:
        # Each run of disallowed characters is replaced by the empty string.
        kept = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ0-9]+", "", token)
        return kept.strip()
    return None

In [None]:
def find_window_words(tokens, target_word, window_size=5):
    """
    Collect the words that occur near ``target_word`` in ``tokens``.

    For every position where a token equals ``target_word``, the tokens up to
    ``window_size`` positions before and after it are gathered; the matched
    position itself is left out.

    Returns a set of the gathered tokens (empty if the target never occurs).
    """
    neighbours = set()
    total = len(tokens)
    hits = (pos for pos, tok in enumerate(tokens) if tok == target_word)
    for pos in hits:
        lower = max(0, pos - window_size)
        upper = min(total, pos + window_size + 1)
        neighbours.update(tokens[j] for j in range(lower, upper) if j != pos)
    return neighbours


In [None]:
def merge_person_tokens(tokens, person_names):
    """
    Merge multi-word person names in a token list into single tokens with dash.

    tokens: list of token strings (not modified; a new list is returned)
    person_names: list or set of full PERSON entity strings
    Returns:
        new token list in which every run of consecutive tokens equal to the
        words of a name is replaced by the words joined with '-'
    """
    merged_tokens = list(tokens)

    # Keep only names with at least two words:
    #  - an empty or whitespace-only name splits to [], which matched at every
    #    position and made the old loop insert '' forever;
    #  - a single-word name merges a token with itself, which is a no-op.
    word_sequences = {tuple(name.split()) for name in person_names if name}
    multi_word = [list(seq) for seq in word_sequences if len(seq) > 1]
    # Longest names first so "snow white queen" is not pre-empted by
    # "snow white"; the secondary key keeps the order deterministic for sets.
    multi_word.sort(key=lambda seq: (-len(seq), seq))

    for words in multi_word:  # e.g. ["snow", "white"]
        span = len(words)
        joined = "-".join(words)
        i = 0
        while i <= len(merged_tokens) - span:
            # check if consecutive tokens match the entity
            if merged_tokens[i:i + span] == words:
                merged_tokens[i:i + span] = [joined]
            i += 1
    return merged_tokens


In [None]:
# Everything after the "PLOT:" marker is treated as the plot text.
pattern = r"PLOT:\s*(.*)"
female_pairs = set()   # word pairs from windows around female characters only
pos_tags = {}          # token -> {'pos': POS tag or entity type, 'gender': gender or None}
male_pairs = set()     # word pairs from windows around male characters only
mixed_pairs = set()    # word pairs from windows containing both genders

# Universal POS tags that are never kept as context words.
skipped_upos = {"PART", "PUNCT", "NUM", "DET"}

for file_idx, file_path in enumerate(files):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    match = re.search(pattern, text, re.S)
    if not match:
        print(f"file {file_idx} skipped (no PLOT): {file_path}")
        continue

    plot = match.group(1).strip()
    doc = nlp(plot)

    persons = set()  # (dash-joined name, entity type, gender) for gendered characters
    gpe = set()      # (lower-cased entity text, entity type)
    names = set()    # raw lower-cased PERSON texts, e.g. "snow white"

    # Collect the PERSON and GPE entities of the whole plot.
    for ent in doc.ents:
        if ent.type == "PERSON":
            pers = ent.text.lower()
            names.add(pers)
            gender = get_gender(pers)
            concat_pers = "-".join(pers.split())  # "snow white" -> "snow-white"
            if gender in ['female', 'male']:  # ignore characters of unknown gender
                persons.add((concat_pers, ent.type, gender))
                pos_tags[concat_pers] = {'pos': ent.type, 'gender': gender}
        elif ent.type == "GPE":
            # Sentence tokens are lower-cased below, so the entity text must be
            # lower-cased too or the multi-word merge can never match.
            gpe.add((ent.text.lower(), ent.type))

    female = [person for person, _, gender in persons if gender == 'female']
    male = [person for person, _, gender in persons if gender == 'male']

    # Go through each sentence, keeping only the relevant tokens.
    for sent in doc.sentences:
        sent_tokens = []  # relevant tokens, used to find surrounding words
        for token in sent.tokens:
            word = token.words[0]  # first word of the token
            token_text = word.text.lower()
            if (token_text in stop_words
                    or token_text in string.punctuation
                    or word.upos in skipped_upos):
                continue
            sent_tokens.append(token_text)
            if token_text not in pos_tags:
                pos_tags[token_text] = {'pos': word.pos, 'gender': None}

        # Merge "snow white" -> "snow-white" so tokens match the names in `persons`.
        # NOTE(review): stop words are removed before merging, so a name that
        # contains one (e.g. "the beast") presumably never matches - confirm.
        sent_tokens = merge_person_tokens(sent_tokens, names)

        # Merge multi-word places, e.g. "new york city" -> "new-york-city".
        tokenized_text, merged_entities = fix_gpe(sent_tokens, gpe)
        for entity, ent_type in merged_entities:
            key = entity.lower()
            existing = pos_tags.get(key)
            # Check the same key that is written. The entity type replaces a
            # plain token tag, but never an entry that already has a gender.
            if existing is None or existing['gender'] is None:
                pos_tags[key] = {'pos': ent_type, 'gender': None}

        # For each person, look at the 5 words before and after each mention.
        for (person, _, gender) in persons:
            surrounding_words = find_window_words(tokenized_text, person, 5)
            if not surrounding_words:
                continue  # person not in this sentence, or no neighbours

            opposite_gender = female if gender == "male" else male
            same_gender = not any(name in surrounding_words for name in opposite_gender)

            surrounding_words.add(person)
            if not same_gender:
                # a character of the opposite gender is inside the window
                target_pairs = mixed_pairs
            elif gender == "male":
                target_pairs = male_pairs
            else:
                target_pairs = female_pairs

            # Sorting puts every pair in one canonical order, so (a, b) and
            # (b, a) are not stored as two different pairs.
            target_pairs.update(combinations(sorted(surrounding_words), 2))

    print(f'file {file_idx} completed')

In [None]:
# Save files
def _write_pairs(path, pairs):
    """Write one ``w1,w2`` line per pair to ``path``.

    Pairs are written in sorted order so the file content does not depend on
    set iteration order and is identical across runs.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(f"{w1},{w2}\n" for w1, w2 in sorted(pairs))


_write_pairs("male_pairs.txt", male_pairs)
_write_pairs("female_pairs.txt", female_pairs)
_write_pairs("mixed_pairs.txt", mixed_pairs)

# Save POS tags
with open("pos_tags.json", "w", encoding="utf-8") as f:
    json.dump(pos_tags, f, indent=2)