In [None]:
import glob
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import nltk
import json
import stanza
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from itertools import combinations
import gender_guesser.detector as gender

In [None]:
# Glob pattern for the movie text files, relative to the notebook's working directory.
movies_path = "movies/*.txt"
# glob.glob gives no ordering guarantee; sort so that file indices and the
# processing order are the same on every run.
files = sorted(glob.glob(movies_path))

In [None]:
# English Stanza pipeline with tokenization, POS tagging, lemmatization and NER.
nlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma,ner")

# The import cell downloads 'punkt_tab' and the perceptron tagger but not the
# 'stopwords' corpus, so on a fresh environment stopwords.words() raises
# LookupError. Fetch the corpus on demand and retry once.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))

# gender_guesser detector used to label PERSON names.
d = gender.Detector()

In [None]:
def fix_gpe(tokens, entities):
    """
    Join multi-word entities inside a tokenized sentence into single
    hyphenated tokens.

    tokens: list of words in a sentence; None and empty-string entries are
        dropped before matching
    entities: list/set of (entity_text, entity_type)
    Returns:
        - new token list with multi-word entities joined by '-'
          (an empty list when no tokens remain)
        - updated set of entities including merged versions
    """
    # Drop None and empty tokens so they cannot produce blank entries in the output.
    tokens = [tok for tok in tokens if tok]
    text_str = " ".join(tokens)
    merged_entities = set()

    # Longest entity first: otherwise "New York" could be merged before
    # "New York City" and the longer entity would never match. The text is a
    # tie-breaker so the result does not depend on set iteration order.
    ordered = sorted(entities, key=lambda item: (-len(item[0]), item[0]))

    for entity_text, ent_type in ordered:
        # Split on any whitespace so entity text containing newlines or
        # repeated spaces still lines up with the single-space token string.
        parts = entity_text.split()
        if len(parts) > 1:  # only merge multi-word entities
            merged_text = "-".join(parts)
            # Lookarounds instead of \b: \b needs a word character on one
            # side, so it never matches after an entity ending in
            # punctuation such as "Washington D.C.".
            pattern = (
                r"(?<!\w)"
                + " ".join(re.escape(part) for part in parts)
                + r"(?!\w)"
            )
            # A callable replacement is inserted verbatim; a plain string
            # would have its backslashes interpreted as escapes by re.sub.
            text_str = re.sub(pattern, lambda _m, rep=merged_text: rep, text_str)
            merged_entities.add((merged_text, ent_type))
        else:
            merged_entities.add((entity_text, ent_type))

    # split() without an argument returns [] for an empty string,
    # whereas split(" ") would return [""].
    new_tokens = text_str.split()
    return new_tokens, merged_entities


In [None]:
# Everything after the "PLOT:" marker is treated as the plot text
# (re.S in the search below lets '.' run across newlines).
pattern = r"PLOT:\s*(.*)"
# Size of the sliding token window used to form co-occurrence pairs.
window_size = 2
# Unique (token, token) pairs collected across all files.
pairs = set()
# Maps a token or entity string to {"pos": ..., "gender": ...}.
# "pos" holds the UPOS tag for plain tokens and the entity type for entities;
# later writes overwrite earlier ones.
pos_tags = {}

# UPOS tags that are never kept as tokens.
skipped_upos = {"PUNCT", "PART"}
# NER tags of the non-initial tokens of a PERSON mention. These are NER tags,
# not UPOS tags, so they must be tested against `ner`. The B-/S- tags used
# below imply a BIOES scheme, where the last token of a multi-token mention is
# tagged E- rather than I-, so both are listed.
person_continuation_ner = {"I-PERSON", "E-PERSON"}
# NER tags marking the first (or only) token of a PERSON mention.
person_start_ner = {"B-PERSON", "S-PERSON"}

for file_idx, file_path in enumerate(files):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    match = re.search(pattern, text, re.S)

    if match:
        plot = match.group(1).strip()
        doc = nlp(plot)

        # Entities of interest for this plot: PERSON mentions reduced to the
        # first word (cut at an apostrophe or hyphen), GPE mentions kept whole.
        person_or_gpe = set()
        for ent in doc.ents:
            if ent.type == "PERSON":
                first_word = re.split(r"['\-]", ent.text.split()[0])[0]
                # A mention starting with "'" or "-" leaves nothing before the
                # separator; skip it rather than record an empty name.
                if first_word:
                    person_or_gpe.add((first_word, ent.type))
            elif ent.type == "GPE":
                person_or_gpe.add((ent.text, ent.type))
        entities = list(person_or_gpe)

        # Merged entities seen in this plot; tagged once after all sentences.
        doc_entities = set()

        for sent in doc.sentences:
            sent_tokens = []
            for word in sent.to_dict():
                lemma = word.get('lemma')
                # Entries without a lemma cannot be used as tokens and would
                # otherwise end up as a None key in pos_tags.
                if lemma is None:
                    continue
                lemma = lemma.lstrip('-')
                if not lemma:
                    continue
                pos = word.get('upos')
                ner = word.get('ner')

                # Filter tokens
                if (
                    pos in skipped_upos
                    or ner in person_continuation_ner
                    or lemma in stop_words
                ):
                    continue

                # Detect gender only for the first (or only) token of a person
                # name. The result is stored under a name that does not
                # shadow the imported `gender` module.
                token_gender = d.get_gender(lemma) if ner in person_start_ner else None

                sent_tokens.append(lemma)
                pos_tags[lemma] = {"pos": pos, "gender": token_gender}

            sent_tokens, entities_merged = fix_gpe(sent_tokens, entities)
            doc_entities.update(entities_merged)

            # Slide a window over the sentence and record every pair inside it.
            for i in range(len(sent_tokens)):
                window = sent_tokens[i : i + window_size]
                pairs.update(combinations(window, 2))

        # Entity tags are written after the token tags of the whole plot, so
        # for a string that is both a token and an entity the entity entry
        # wins, exactly as when this was repeated after every sentence.
        for ent_text, ent_type in doc_entities:
            ent_gender = d.get_gender(ent_text) if ent_type == "PERSON" else None
            pos_tags[ent_text] = {"pos": ent_type, "gender": ent_gender}

    print(f"file {file_idx} completed: {file_path}")

In [None]:
# ---- Save pairs ----
with open("cooccurrence_pairs_window2.txt", "w", encoding="utf-8") as f:
    for w1, w2 in pairs:
        f.write(f"{w1}\t{w2}\n")

# ---- Save POS tags ----
with open("pos_tags.json", "w", encoding="utf-8") as f:
    json.dump(pos_tags, f, indent=2)