In [8]:
import os
import re
import pandas as pd

def extract_dialogue(script_text, character_name):
    """Return the lines spoken by ``character_name`` in a transcript.

    A transcript line counts as dialogue when, after stripping surrounding
    whitespace, it starts with ``character_name`` (matched case-insensitively),
    followed by optional whitespace and a colon. Everything after the colon is
    the spoken text.

    Stage directions are removed from the spoken text: any span that starts
    with one of ``( [ { <`` and runs to the next ``) ] } >``. The opener and
    closer are not required to be of the same kind. Whitespace left behind by a
    removed span is collapsed to a single space, so ``"Hello (laughs) there"``
    yields ``"Hello there"``.

    Parameters
    ----------
    script_text : str
        Full transcript text, one utterance per line.
    character_name : str
        Speaker label to look for; treated literally (regex characters are
        escaped).

    Returns
    -------
    list of str
        Cleaned dialogue lines in transcript order. Lines that are empty after
        cleaning are omitted.
    """
    # Speaker label anchored at the start of the (stripped) line.
    speaker_pattern = re.compile(
        rf'^{re.escape(character_name)}\s*:\s*(.*)', re.IGNORECASE
    )
    dialogue_lines = []

    for line in script_text.splitlines():
        match = speaker_pattern.match(line.strip())
        if not match:
            continue

        # Remove anything in (), [], {}, or <>
        dialogue = re.sub(r'[\(\[\{<][^)\]\}>]*[\)\]\}>]', '', match.group(1))

        # Dropping a mid-sentence direction leaves two adjacent spaces;
        # split/join collapses those runs and trims both ends.
        dialogue = ' '.join(dialogue.split())

        if dialogue:
            dialogue_lines.append(dialogue)

    return dialogue_lines

# Directory containing your transcripts. Set the PRINCESS_TRANSCRIPT_DIR
# environment variable to run the notebook on another machine; the original
# local path is kept as the fallback.
transcript_dir = os.environ.get(
    "PRINCESS_TRANSCRIPT_DIR",
    r"C:\Users\21dan\OneDrive\Desktop\Code\DojaCode\data_hacks_2025\Code_space\transcripts",
)
output_rows = []

# Loop through all .txt files. os.listdir returns names in arbitrary order, so
# sort them to keep the row order (and the exported CSV) reproducible.
for filename in sorted(os.listdir(transcript_dir)):
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(transcript_dir, filename)

    # Assume the princess name is the file name (without extension)
    princess_name = os.path.splitext(filename)[0]

    with open(file_path, "r", encoding="utf-8") as file:
        script_text = file.read()

    # One output row per cleaned dialogue line spoken by this princess.
    output_rows.extend(
        {"princess": princess_name, "dialogue": line}
        for line in extract_dialogue(script_text, princess_name)
    )

# Create DataFrame and calculate word counts. Naming the columns explicitly
# keeps the frame well-formed (and the lines below working) even when no
# dialogue was found.
df = pd.DataFrame(output_rows, columns=["princess", "dialogue"])
df["word_count"] = df["dialogue"].str.split().str.len()
df = df.set_index("princess")

# To display grouped word counts per princess: select the numeric column and
# actually call sum() so the cell shows totals rather than a bound method.
df.groupby("princess")["word_count"].sum()

princess
anna          1984
ariel          515
belle         1944
cinderella    1006
elsa           796
merida        1859
rapunzel      1611
snow white     798
tiana         1307
Name: word_count, dtype: int64

In [3]:
# Write the dialogue table to transcript.csv in the current working directory.
df.to_csv(path_or_buf="transcript.csv")