In [None]:
import glob
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import nltk
import json
import stanza
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from itertools import combinations
from collections import Counter
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to /Users/germa/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/germa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [8]:
names_df = pd.read_csv('disney_names_and_genders.csv', header=0)
names_df['Name'] = names_df['Name'].str.lower()
names_df['Gender'] = names_df['Gender'].map({'M': 'male', 'F': 'female'})
names_df.head(3)

Unnamed: 0.1,Unnamed: 0,Name,Gender
0,0,.giffany,female
1,1,22,female
2,2,3rd street bandits,male


In [9]:
movies_path = "movies/*.txt"
files = glob.glob(movies_path)

In [None]:
nlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma,ner")
stop_words = set(stopwords.words("english"))

2025-11-17 10:04:04 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 43.4MB/s]                    
2025-11-17 10:04:05 INFO: Downloaded file to /Users/germa/stanza_resources/resources.json
2025-11-17 10:04:05 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| pos       | combined_charlm           |
| lemma     | combined_nocharlm         |
| ner       | ontonotes-ww-multi_charlm |

2025-11-17 10:04:05 INFO: Using device: cpu
2025-11-17 10:04:05 INFO: Loading: tokenize
2025-11-17 10:04:05 INFO: Loading: mwt
2025-11-17 10:04:05 INFO: Loading: pos
2025-11-17 10

In [11]:
def fix_gpe(tokens, entities):
    """
    tokens: list of words in a sentence
    entities: list/set of (entity_text, entity_type)
    Returns:
        - new token list with multi-word entities joined by '-'
        - updated set of entities including merged versions
    """
    tokens = [tok for tok in tokens if tok is not None]
    text_str = " ".join(tokens)
    merged_entities = set()

    for entity_text, ent_type in entities:
        if " " in entity_text:  # only merge multi-word entities
            merged_text = entity_text.replace(" ", "-")
            pattern = r'\b' + re.escape(entity_text) + r'\b'
            text_str = re.sub(pattern, merged_text, text_str)
            merged_entities.add((merged_text, ent_type))
        else:
            merged_entities.add((entity_text, ent_type))

    new_tokens = text_str.split(" ")
    return new_tokens, merged_entities


In [13]:
def get_gender(name):

    # check if name is not an empty string
    if name is None:
        return 'Unknown'

    try:
        name = name.lower().strip()
        if name == '':
            return 'Unknown'
    
        # if exact match is found
        gender = names_df.loc[names_df['Name'] == name]['Gender'].values[0]

    except:
        # else try to split name and find parts
        genders = []
        name_parts = name.lower().split(' ')
        for part in name_parts:
            if part != '':
                # if part of the name is found in any of the rows, get name and gender
                gender = names_df[names_df.Name.str.contains(part)]['Gender'].to_list()
                genders += gender
        # count gender occurrences 
        gender_counts = Counter(genders)
        if len(gender_counts.keys()) == 1:
            gender = list(gender_counts.keys())[0]
        else:
            if len(gender_counts.keys()) > 1:
                gender = sorted(gender_counts.items(), key=lambda x: x[1], reverse=True)[0][0]
            else:
                # length of unq genders is 0
                gender = 'Unknown'
    
    return gender


In [12]:
def clean_token(token):
    if token is None:
        return None
    # Remove everything except letters, numbers, and basic accented characters
    token = re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ0-9]+", "", token)
    return token.strip()

In [24]:
pattern = r"PLOT:\s*(.*)"  
window_size = 2
pairs = set()
pos_tags = {}

for file_idx, file_path in enumerate(files):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    match = re.search(pattern, text, re.S)

    if match:
        plot = match.group(1).strip()
        doc = nlp(plot)
        # Collect all GPE entities with type
        # entities = set((ent.text, ent.type) for ent in doc.ents if ent.type =="GPE")
        person_or_gpe = []
        for ent in doc.ents:
            if ent.type == "PERSON":
                # Take only the first token of the PERSON entity
                first_word = re.split(r"['\-]", ent.text.split()[0])[0]
                person_or_gpe.append((first_word, ent.type))
            elif ent.type == "GPE":
                # For GPE, keep full entity
                person_or_gpe.append((ent.text, ent.type))
        entities = list(set(person_or_gpe))

        for sent in doc.sentences:
            sent = sent.to_dict()
            sent_tokens = []
            for word in sent:
                lemma = word.get('lemma')
                lemma = clean_token(lemma)
                pos = word.get('upos')
                ner = word.get('ner')

                #detect gender of beginning of person names or single token person names
                if ner in {"B-PERSON", "S-PERSON"}:
                    gender = get_gender(lemma)
                else:
                    gender = None

                # Filter tokens
                if pos not in {"PUNCT", "PART", "I-PERSON"} and lemma not in stop_words and pos_tags.get(lemma) is None:
                    sent_tokens.append(lemma) 
                    if lemma is not None:
                        pos_tags[lemma.lower()] = {"pos": pos, "gender": gender}

            sent_tokens, entities_merged = fix_gpe(sent_tokens, entities)
     
            for ent, pos in entities_merged:
                gender_ent = get_gender(ent) if pos == "PERSON" else None
                pos_tags[ent.lower()] = {"pos": pos, "gender": gender_ent}
            
            n = len(sent_tokens)
            for i in range(n):
                window = sent_tokens[i : i + window_size]
                for w1, w2 in combinations(window, 2):
                    pairs.add((w1, w2))
    print(f"file {file_idx} completed: {file_path}")

file 0 completed: movies/Jungle 2 Jungle.txt
file 1 completed: movies/The Princess and the Frog.txt
file 2 completed: movies/The Ice Age Adventures of Buck Wild.txt
file 3 completed: movies/Beverly Hills Chihuahua 2.txt
file 4 completed: movies/Secretariat (film).txt
file 5 completed: movies/Eight Below.txt
file 6 completed: movies/Planes: Fire & Rescue.txt
file 7 completed: movies/Ratatouille (film).txt
file 8 completed: movies/Diary of a Wimpy Kid: Rodrick Rules (2022 film).txt
file 9 completed: movies/The Fox and the Hound.txt
file 10 completed: movies/Luca (2021 film).txt
file 11 completed: movies/The Adventures of Bullwhip Griffin.txt
file 12 completed: movies/The Straight Story.txt
file 13 completed: movies/Squanto: A Warrior's Tale.txt
file 14 completed: movies/Brother Bear.txt
file 15 completed: movies/Snow White and the Seven Dwarfs (1937 film).txt
file 16 completed: movies/One Little Indian (film).txt
file 17 completed: movies/The Three Caballeros.txt
file 18 completed: movie

In [25]:
# ---- Save pairs ----
with open("cooccurrence_pairs_window2.txt", "w", encoding="utf-8") as f:
    for w1, w2 in pairs:
        f.write(f"{w1}\t{w2}\n")

# ---- Save POS tags ----
with open("pos_tags.json", "w", encoding="utf-8") as f:
    json.dump(pos_tags, f, indent=2)