In [78]:
import pandas as pd

# Read the text file
with open('data/OtGW_transcript.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Split into episodes and lines
episodes = text.split('\n\nChapter')  # Split on chapter boundaries

# Create lists to store data
char_names = []
episode_nums = []
lines = []

# Parse each episode
for episode in episodes:
    if episode.strip():  # Skip empty episodes
        lines_in_episode = episode.split('\n')
        
        # Extract episode number from first line
        if episode.startswith('Chapter'):
            episode_num = lines_in_episode[0]  # First line is full chapter heading
        else:
            episode_num = 'Chapter' + lines_in_episode[0]  # Need to add 'Chapter' back
            
        # Process each line of dialogue
        current_char = None
        current_dialogue = []
        
        for line in lines_in_episode[1:]:  # Skip episode number line
            if ':' in line:
                # If we have accumulated dialogue from previous character, save it
                if current_char and current_dialogue:
                    char_names.append(current_char)
                    episode_nums.append(episode_num.strip())
                    lines.append(' '.join(current_dialogue))
                
                # Start new character dialogue
                current_char, dialogue = line.split(':', 1)
                current_char = current_char.strip()
                current_dialogue = [dialogue.strip()]
            else:
                # Continue previous character's dialogue
                if current_char and line.strip():
                    current_dialogue.append(line.strip())
        
        # Don't forget to add the last character's dialogue
        if current_char and current_dialogue:
            char_names.append(current_char)
            episode_nums.append(episode_num.strip())
            lines.append(' '.join(current_dialogue))

# Create DataFrame
df = pd.DataFrame({
    'char_name': char_names,
    'episode': episode_nums,
    'line': lines
})


In [79]:
# Add episode numbers for each chunk of dialogue
# Episode 1: The Old Grist Mill
df.loc[0:128, 'episode'] = "Episode 1: The Old Grist Mill"

# Episode 2: Hard Times at the Huskin' Bee  
df.loc[129:306, 'episode'] = "Episode 2: Hard Times at the Huskin' Bee" 

# Episode 3: Schooltown Follies
df.loc[307:455, 'episode'] = "Episode 3: Schooltown Follies"

# Episode 4: Songs of the Dark Lantern
df.loc[456:639, 'episode'] = "Episode 4: Songs of the Dark Lantern"

# Episode 5: Mad Love
df.loc[640:817, 'episode'] = "Episode 5: Mad Love"

# Episode 6: Lullaby in Frogland
df.loc[818:975, 'episode'] = "Episode 6: Lullaby in Frogland"

# Episode 7: The Ringing of the Bell
df.loc[976:1157, 'episode'] = "Episode 7: The Ringing of the Bell"

# Episode 8: Babes in the Wood  
df.loc[1158:1261, 'episode'] = "Episode 8: Babes in the Wood"

# Episode 9: Into the Unknown
df.loc[1262:1457, 'episode'] = "Episode 9: Into the Unknown"

# Episode 10: The Unknown
df.loc[1458:1621, 'episode'] = "Episode 10: The Unknown"

# Verify the episode numbers were added correctly
print("\nNumber of rows per episode:")
print(df.groupby('episode').size())



Number of rows per episode:
episode
Episode 10: The Unknown                     164
Episode 1: The Old Grist Mill               129
Episode 2: Hard Times at the Huskin' Bee    178
Episode 3: Schooltown Follies               149
Episode 4: Songs of the Dark Lantern        184
Episode 5: Mad Love                         178
Episode 6: Lullaby in Frogland              158
Episode 7: The Ringing of the Bell          182
Episode 8: Babes in the Wood                104
Episode 9: Into the Unknown                 196
dtype: int64


### Basic Cleaning

In [80]:

df = df[~df['char_name'].str.contains('Chapter', na=False)]
df = df[~df['char_name'].str.startswith('[', na=False)]

# Remove text within square brackets from the 'line' column while keeping other text
df['line'] = df['line'].str.replace(r'\[.*?\]', '', regex=True).str.strip()
df['line'] = df['line'].str.replace(r'\(.*?\)', '', regex=True).str.strip()

# Remove '>>' from the start of character names
df['char_name'] = df['char_name'].str.replace(r'^>>\s*', '', regex=True)


df['char_name'] = df['char_name'].str.replace(r'\(.*?\)', '', regex=True).str.strip()
# Remove rows where 'line' is empty or null after cleaning
df = df[df['line'].notna() & (df['line'] != '')]


# Reset the index after filtering
df = df.reset_index(drop=True)


## Character Names Cleaning

In [81]:
# Print all unique character names
print("All unique character names:")
print(sorted(df['char_name'].unique()))


All unique character names:
["A deer tries to blow into a trumpet, but it's taken away", 'Adelaide', 'All', 'Angel', 'Angel Sings', 'Auntie Whispers', 'Auxiliary Committee', 'BWirt', 'Beast', 'Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", "Beatrice's Mother", "Beatrice's mom", 'Beatrice, backing away', 'Beatrice, flying', 'Beatrice, from atop a branch', 'Beatrice, from within the bush', 'Beatrice, offscreen', 'Beatrice, on the ground by the crops', "Beatrice, sitting on Greg's teapot hat", 'Beatrice, speaking to the horse', 'Beatrice, taking off', 'Both', 'Both Wirt and Beatrice', 'Braids Villager', 'Carriage Driver', 'Cherub', 'Cleaning frog crew member', 'Commitee Sings', 'Committee Number Three', 'Constable frog', 'Enoch', 'Fred the Horse', 'Frog', 'Frog Bassoon Player', 'Frog Constable', 'Frog Crewmember', 'Frog Crowd', 'Frog Lady', 'George Washington', 'George Washington sings', 'George Washingtong continues singing', 'Good Samaritan', 'Gorilla', 'Gramps Villager', 'Gr

### Wirt

In [82]:
# Print character names containing 'wirt'
print("Character names containing 'Wirt':")
print(sorted(df[df['char_name'].str.contains('wirt', case=False, na=False)]['char_name'].unique()))


Character names containing 'Wirt':
['BWirt', 'Both Wirt and Beatrice', 'Greg, joining Wirt', 'Wirt', 'Wirt & Greg', 'Wirt and Greg', 'Wirt joins in', 'Wirt whispers in dread', 'Wirt, backing away', 'Wirt, halfway out the ditch, lying in the dirt', 'Wirt, hunched over and out of breath', 'Wirt, offscreen', 'Wirt, waking from his sleep']


In [83]:
# Standardize variations of 'Wirt' to just 'Wirt'
wirt_variations = [' Wirt', 'BWirt', '>>Wirt', 'Wirt joins in', 'Wirt whispers in dread', 'Wirt, backing away', 'Wirt, halfway out the ditch, lying in the dirt', 'Wirt, hunched over and out of breath', 'Wirt, offscreen', 'Wirt, waking from his sleep']
df.loc[df['char_name'].str.contains('|'.join(wirt_variations), case=False, na=False), 'char_name'] = 'Wirt'

# Print to verify changes
print("\nUnique character names containing 'Wirt' after standardization:")
print(sorted(df[df['char_name'].str.contains('wirt', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Wirt' after standardization:
['Wirt', 'Wirt & Greg', 'Wirt and Greg']


### Greg

In [84]:
# Print character names containing 'wirt'
print("Character names containing 'Greg':")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))


Character names containing 'Greg':
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg Sings', 'Greg and Beatrice', 'Greg contines singing', 'Greg continues singing', 'Greg salutes the Opossum', 'Greg sings', 'Greg starts singing', 'Greg starts to sing', 'Greg, after thinking for a moment', 'Greg, crawling out of his ditch', 'Wirt & Greg', 'Wirt and Greg']


In [85]:
# Standardize variations of 'Greg' to just 'Greg'
greg_variations = [f'Greg (threateningly)', 'Greg starts singing', '>>Greg', 'Greg, after thinking for a moment', 'Greg, crawling out of his ditch', 'Greg (threateningly)', 'Greg (threateningly)']
df.loc[df['char_name'].str.contains('|'.join(greg_variations), case=False, na=False), 'char_name'] = 'Greg'

# Print to verify changes
print("\nUnique character names containing 'Greg' after standardization:")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Greg' after standardization:
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg Sings', 'Greg and Beatrice', 'Greg contines singing', 'Greg continues singing', 'Greg salutes the Opossum', 'Greg sings', 'Greg starts to sing', 'Wirt & Greg', 'Wirt and Greg']


  df.loc[df['char_name'].str.contains('|'.join(greg_variations), case=False, na=False), 'char_name'] = 'Greg'


### Wirt and Greg

In [86]:
# Print character names containing 'wirt'
print("Character names containing 'Wirt and Greg':")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))


Character names containing 'Wirt and Greg':
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg Sings', 'Greg and Beatrice', 'Greg contines singing', 'Greg continues singing', 'Greg salutes the Opossum', 'Greg sings', 'Greg starts to sing', 'Wirt & Greg', 'Wirt and Greg']


In [87]:
# Standardize variations of 'Greg' to just 'Greg'
greg_variations = [f'Wirt & Greg']
df.loc[df['char_name'].str.contains('|'.join(greg_variations), case=False, na=False), 'char_name'] = 'Wirt and Greg'

# Print to verify changes
print("\nUnique character names containing 'Greg' after standardization:")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Greg' after standardization:
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg Sings', 'Greg and Beatrice', 'Greg contines singing', 'Greg continues singing', 'Greg salutes the Opossum', 'Greg sings', 'Greg starts to sing', 'Wirt and Greg']


### Beatrice

In [88]:
# Print character names containing 'wirt'
print("Character names containing 'Beatrice':")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))


Character names containing 'Beatrice':
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", "Beatrice's Mother", "Beatrice's mom", 'Beatrice, backing away', 'Beatrice, flying', 'Beatrice, from atop a branch', 'Beatrice, from within the bush', 'Beatrice, offscreen', 'Beatrice, on the ground by the crops', "Beatrice, sitting on Greg's teapot hat", 'Beatrice, speaking to the horse', 'Beatrice, taking off', 'Greg and Beatrice']


In [89]:
# Standardize variations of 'Beatrice' to just 'Beatrice'
beatrice_variations = ['Beatrice, backing away', 'Beatrice, flying', 'Beatrice, from atop a branch', 'Beatrice, from within the bush', 'Beatrice, offscreen', 'Beatrice, on the ground by the crops', "Beatrice, sitting on Greg's teapot hat", 'Beatrice, speaking to the horse', 'Beatrice, taking off']
df.loc[df['char_name'].str.contains('|'.join(beatrice_variations), case=False, na=False), 'char_name'] = 'Beatrice'

# Print to verify changes
print("\nUnique character names containing 'Beatrice' after standardization:")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Beatrice' after standardization:
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", "Beatrice's Mother", "Beatrice's mom", 'Greg and Beatrice']


### Beatrice's Mom

In [90]:
# Print character names containing 'wirt'
print("Character names containing 'Beatrice's Mom':")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))


Character names containing 'Beatrice's Mom':
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", "Beatrice's Mother", "Beatrice's mom", 'Greg and Beatrice']


In [91]:
# Standardize variations of 'Beatrice' to just 'Beatrice'
beatrice_variations = ["Beatrice's Mother", "Beatrice's mom"]
df.loc[df['char_name'].str.contains('|'.join(beatrice_variations), case=False, na=False), 'char_name'] = "Beatrice's Mom"

# Print to verify changes
print("\nUnique character names containing 'Beatrice' after standardization:")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Beatrice' after standardization:
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", 'Greg and Beatrice']


## Save to CSV

In [92]:
# Save the cleaned dataframe to a CSV file
df.to_csv('data/cleaned_over_the_garden_wall_dialogue.csv', index=False)
print("Data saved to cleaned_over_the_garden_wall_dialogue.csv")


Data saved to cleaned_over_the_garden_wall_dialogue.csv
