In [270]:
import pandas as pd

# Read the text file
with open('data/OtGW_transcript.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Split into episodes and lines
episodes = text.split('\n\nChapter')  # Split on chapter boundaries

# Create lists to store data
char_names = []
episode_nums = []
lines = []

# Parse each episode
for episode in episodes:
    if episode.strip():  # Skip empty episodes
        lines_in_episode = episode.split('\n')
        
        # Extract episode number from first line
        if episode.startswith('Chapter'):
            episode_num = lines_in_episode[0]  # First line is full chapter heading
        else:
            episode_num = 'Chapter' + lines_in_episode[0]  # Need to add 'Chapter' back
            
        # Process each line of dialogue
        current_char = None
        current_dialogue = []
        
        for line in lines_in_episode[1:]:  # Skip episode number line
            if ':' in line:
                # If we have accumulated dialogue from previous character, save it
                if current_char and current_dialogue:
                    char_names.append(current_char)
                    episode_nums.append(episode_num.strip())
                    lines.append(' '.join(current_dialogue))
                
                # Start new character dialogue
                current_char, dialogue = line.split(':', 1)
                current_char = current_char.strip()
                current_dialogue = [dialogue.strip()]
            else:
                # Continue previous character's dialogue
                if current_char and line.strip():
                    current_dialogue.append(line.strip())
        
        # Don't forget to add the last character's dialogue
        if current_char and current_dialogue:
            char_names.append(current_char)
            episode_nums.append(episode_num.strip())
            lines.append(' '.join(current_dialogue))

# Create DataFrame
df = pd.DataFrame({
    'char_name': char_names,
    'episode': episode_nums,
    'line': lines
})


In [271]:
# Add episode numbers for each chunk of dialogue
# Episode 1: The Old Grist Mill
df.loc[0:128, 'episode'] = "Episode 1: The Old Grist Mill"

# Episode 2: Hard Times at the Huskin' Bee  
df.loc[129:306, 'episode'] = "Episode 2: Hard Times at the Huskin' Bee" 

# Episode 3: Schooltown Follies
df.loc[307:455, 'episode'] = "Episode 3: Schooltown Follies"

# Episode 4: Songs of the Dark Lantern
df.loc[456:639, 'episode'] = "Episode 4: Songs of the Dark Lantern"

# Episode 5: Mad Love
df.loc[640:817, 'episode'] = "Episode 5: Mad Love"

# Episode 6: Lullaby in Frogland
df.loc[818:975, 'episode'] = "Episode 6: Lullaby in Frogland"

# Episode 7: The Ringing of the Bell
df.loc[976:1157, 'episode'] = "Episode 7: The Ringing of the Bell"

# Episode 8: Babes in the Wood  
df.loc[1158:1261, 'episode'] = "Episode 8: Babes in the Wood"

# Episode 9: Into the Unknown
df.loc[1262:1457, 'episode'] = "Episode 9: Into the Unknown"

# Episode 10: The Unknown
df.loc[1458:1621, 'episode'] = "Episode 10: The Unknown"

# Verify the episode numbers were added correctly
print("\nNumber of rows per episode:")
print(df.groupby('episode').size())



Number of rows per episode:
episode
Episode 10: The Unknown                     164
Episode 1: The Old Grist Mill               129
Episode 2: Hard Times at the Huskin' Bee    178
Episode 3: Schooltown Follies               149
Episode 4: Songs of the Dark Lantern        184
Episode 5: Mad Love                         178
Episode 6: Lullaby in Frogland              158
Episode 7: The Ringing of the Bell          182
Episode 8: Babes in the Wood                104
Episode 9: Into the Unknown                 196
dtype: int64


### Basic Cleaning

In [272]:

df = df[~df['char_name'].str.contains('Chapter', na=False)]
df = df[~df['char_name'].str.startswith('[', na=False)]

# Remove text within square brackets from the 'line' column while keeping other text
df['line'] = df['line'].str.replace(r'\[.*?\]', '', regex=True).str.strip()
df['line'] = df['line'].str.replace(r'\(.*?\)', '', regex=True).str.strip()

# Remove '>>' from the start of character names
df['char_name'] = df['char_name'].str.replace(r'^>>\s*', '', regex=True)


df['char_name'] = df['char_name'].str.replace(r'\(.*?\)', '', regex=True).str.strip()
# Remove rows where 'line' is empty or null after cleaning
df = df[df['line'].notna() & (df['line'] != '')]


# Reset the index after filtering
df = df.reset_index(drop=True)


## Character Names Cleaning

In [273]:
# Print all unique character names
print("\nAll unique character names:")
print(*sorted(df['char_name'].unique()), sep="\n")



All unique character names:
A deer tries to blow into a trumpet, but it's taken away
Adelaide
All
Angel
Angel Sings
Auntie Whispers
Auxiliary Committee
BWirt
Beast
Beatrice
Beatrice & Fred the Horse
Beatrice's Mom
Beatrice's Mother
Beatrice's mom
Beatrice, backing away
Beatrice, flying
Beatrice, from atop a branch
Beatrice, from within the bush
Beatrice, offscreen
Beatrice, on the ground by the crops
Beatrice, sitting on Greg's teapot hat
Beatrice, speaking to the horse
Beatrice, taking off
Both
Both Wirt and Beatrice
Braids Villager
Carriage Driver
Cherub
Cleaning frog crew member
Commitee Sings
Committee Number Three
Constable frog
Enoch
Fred the Horse
Frog
Frog Bassoon Player
Frog Constable
Frog Crewmember
Frog Crowd
Frog Lady
George Washington
George Washington sings
George Washingtong continues singing
Good Samaritan
Gorilla
Gramps Villager
Gramps Villager, offscreen
Greg
Greg Sings
Greg and Beatrice
Greg contines singing
Greg continues singing
Greg salutes the Opossum
Greg sings

In [274]:
# Print all lines where character name contains 'villager'
print("\nLines with 'villager' in character name:")
villager_lines = df[df['char_name'].str.contains('student', case=False, na=False)]
print(villager_lines[['char_name', 'line']].to_string())



Lines with 'villager' in character name:
     char_name                        line
1292   Student           STOP! YOU HIT ME!
1302   Student         WHAT ARE YOU DOING?
1303   Student                      JASON.
1304   Student                  OH, JASON.
1312   Student                 LITTLE GUY.
1315   Student                THERE HE IS.
1316   Student  WIRT, WE CAN SEE YOU, MAN.


In [275]:
# Standardize variations of Misc Characters
variations = ["A deer tries to blow into a trumpet, but it's taken away"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'School House Deer'


# Change 'All' to specific crowd names based on episode number
df.loc[(df['char_name'] == 'All') & (df['episode'] == 'Episode 4: Songs of the Dark Lantern'), 'char_name'] = 'Tavern Crowd'
df.loc[(df['char_name'] == 'All') & (df['episode'] == 'Episode 6: Lullaby in Frogland'), 'char_name'] = 'Frog Crowd' 
df.loc[(df['char_name'] == 'All' ) & (df['episode'] == 'Episode 8: Babes in the Wood'), 'char_name'] = 'Cloud Crowd'

variations = ["Angel", "Angel Sings"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Angel'

df.loc[(df['char_name'] == 'Both') & (df['episode'] == 'Episode 6: Lullaby in Frogland'), 'char_name'] = 'Wirt and Greg'
df.loc[(df['char_name'] == 'Both') & (df['episode'] == 'Episode 7: The Ringing of the Bell') & (df.index < 1030), 'char_name'] = 'Wirt and Lorna'
df.loc[(df['char_name'] == 'Both') & (df['episode'] == 'Episode 7: The Ringing of the Bell') & (df.index > 1030), 'char_name'] = 'Wirt and Greg'    
df.loc[(df['char_name'] == 'Both') & (df['episode'] == 'Episode 9: Into the Unknown'), 'char_name'] = 'Kathleen and Rhondi'

variations = ["Braids Villager", "Pilgrim Pumpkin", "Pilgrim Villager", "Gramps Villager", "Villager"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Pottsfield Villager'



variations = ["Frog Lady"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Frog Passenger'

variations = ["Constable frog", "Frog Constable", "Cleaning frog crew member"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Frog Crewmember'

variations = ["Frog", "George Washington", "George Washington sings"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = '(Frog) Jason Funderberker'

variations = ["Good Samaritan"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Concert Guest'

variations = ["High School Boy"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Partygoer'

variations = ["Jason Funderberker", "Jason F", "Jason"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False) & ~df['char_name'].str.contains('Frog', case=False, na=False), 'char_name'] = 'Jason Funderberker'

variations = ["Miss Langtree starts to sing", "Miss Langtree, now on the floor and in lament, continues singing", "Miss. Langtree"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Miss Langtree'

variations = ["Mr. Langtree begins to take the instruments from the animals"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Mr. Langtree'

variations = ["Reception committee"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Reception Committee'

variations = ["Tavern Lady, modeled after Betty Boop", "The Tavern Lady starts to sing"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Tavern Lady'

variations = ["The Highwayman starts to sing"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'The Highwayman'

variations = ["The Old North Wind SIngs"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Old North Wind'

variations = ["The Toy Maker starts to sing", "The Toy Maker"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'The Toymaker'

variations = ["Unknown Voice", "Beast"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'The Beast'

variations = ["Student"]
df.loc[df['char_name'].str.contains('|'.join(variations), case=False, na=False), 'char_name'] = 'Student at Graveyard'

# Print lines where character name is 'All'
print("\nLines where character is '':")
df = df[df['char_name'] != 'The song kicks up with the first verse once they look inside the barn']
print("\nRemoved rows with invalid character name")




Lines where character is '':

Removed rows with invalid character name


### Wirt

In [276]:
# Print character names containing 'wirt'
print("Character names containing 'Wirt':")
print(sorted(df[df['char_name'].str.contains('wirt', case=False, na=False)]['char_name'].unique()))


Character names containing 'Wirt':
['BWirt', 'Both Wirt and Beatrice', 'Greg, joining Wirt', 'Wirt', 'Wirt & Greg', 'Wirt and Greg', 'Wirt and Lorna', 'Wirt joins in', 'Wirt whispers in dread', 'Wirt, backing away', 'Wirt, halfway out the ditch, lying in the dirt', 'Wirt, hunched over and out of breath', 'Wirt, offscreen', 'Wirt, waking from his sleep']


In [277]:
# Standardize variations of 'Wirt' to just 'Wirt'
wirt_variations = [' Wirt', 'BWirt', '>>Wirt', 'Wirt joins in', 'Wirt whispers in dread', 'Wirt, backing away', 'Wirt, halfway out the ditch, lying in the dirt', 'Wirt, hunched over and out of breath', 'Wirt, offscreen', 'Wirt, waking from his sleep']
df.loc[df['char_name'].str.contains('|'.join(wirt_variations), case=False, na=False), 'char_name'] = 'Wirt'

# Print to verify changes
print("\nUnique character names containing 'Wirt' after standardization:")
print(sorted(df[df['char_name'].str.contains('wirt', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Wirt' after standardization:
['Wirt', 'Wirt & Greg', 'Wirt and Greg', 'Wirt and Lorna']


### Greg

In [278]:
# Print character names containing 'wirt'
print("Character names containing 'Greg':")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))


Character names containing 'Greg':
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg Sings', 'Greg and Beatrice', 'Greg contines singing', 'Greg continues singing', 'Greg salutes the Opossum', 'Greg sings', 'Greg starts singing', 'Greg starts to sing', 'Greg, after thinking for a moment', 'Greg, crawling out of his ditch', 'Wirt & Greg', 'Wirt and Greg']


In [279]:
# Standardize variations of 'Greg' to just 'Greg'
greg_variations = [f'Greg (threateningly)', 'Greg starts singing', '>>Greg', 'Greg, after thinking for a moment', 'Greg, crawling out of his ditch', 'Greg (threateningly)', 'Greg (threateningly)', 'Greg Sings', 'Greg contines singing', 'Greg continues singing', 'Greg salutes the Opossum', 'Greg sings', 'Greg starts to sing']
df.loc[df['char_name'].str.contains('|'.join(greg_variations), case=False, na=False), 'char_name'] = 'Greg'

# Print to verify changes
print("\nUnique character names containing 'Greg' after standardization:")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Greg' after standardization:
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg and Beatrice', 'Wirt & Greg', 'Wirt and Greg']


  df.loc[df['char_name'].str.contains('|'.join(greg_variations), case=False, na=False), 'char_name'] = 'Greg'


### Wirt and Greg

In [280]:
# Print character names containing 'wirt'
print("Character names containing 'Wirt and Greg':")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))


Character names containing 'Wirt and Greg':
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg and Beatrice', 'Wirt & Greg', 'Wirt and Greg']


In [281]:
# Standardize variations of 'Greg' to just 'Greg'
greg_variations = [f'Wirt & Greg']
df.loc[df['char_name'].str.contains('|'.join(greg_variations), case=False, na=False), 'char_name'] = 'Wirt and Greg'

# Print to verify changes
print("\nUnique character names containing 'Greg' after standardization:")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Greg' after standardization:
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg and Beatrice', 'Wirt and Greg']


### Beatrice

In [282]:
# Print character names containing 'wirt'
print("Character names containing 'Beatrice':")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))


Character names containing 'Beatrice':
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", "Beatrice's Mother", "Beatrice's mom", 'Beatrice, backing away', 'Beatrice, flying', 'Beatrice, from atop a branch', 'Beatrice, from within the bush', 'Beatrice, offscreen', 'Beatrice, on the ground by the crops', "Beatrice, sitting on Greg's teapot hat", 'Beatrice, speaking to the horse', 'Beatrice, taking off', 'Greg and Beatrice']


In [283]:
# Standardize variations of 'Beatrice' to just 'Beatrice'
beatrice_variations = ['Beatrice, backing away', 'Beatrice, flying', 'Beatrice, from atop a branch', 'Beatrice, from within the bush', 'Beatrice, offscreen', 'Beatrice, on the ground by the crops', "Beatrice, sitting on Greg's teapot hat", 'Beatrice, speaking to the horse', 'Beatrice, taking off']
df.loc[df['char_name'].str.contains('|'.join(beatrice_variations), case=False, na=False), 'char_name'] = 'Beatrice'

# Print to verify changes
print("\nUnique character names containing 'Beatrice' after standardization:")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Beatrice' after standardization:
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", "Beatrice's Mother", "Beatrice's mom", 'Greg and Beatrice']


### Beatrice's Mom

In [284]:
# Print character names containing 'wirt'
print("Character names containing 'Beatrice's Mom':")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))


Character names containing 'Beatrice's Mom':
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", "Beatrice's Mother", "Beatrice's mom", 'Greg and Beatrice']


In [285]:
# Standardize variations of 'Beatrice' to just 'Beatrice'
beatrice_variations = ["Beatrice's Mother", "Beatrice's mom"]
df.loc[df['char_name'].str.contains('|'.join(beatrice_variations), case=False, na=False), 'char_name'] = "Beatrice's Mom"

# Print to verify changes
print("\nUnique character names containing 'Beatrice' after standardization:")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Beatrice' after standardization:
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", 'Greg and Beatrice']


In [286]:
### Woodsman
# Print character names containing 'woodsman'
print("Character names containing 'Woodsman':")
print(sorted(df[df['char_name'].str.contains('woodsman', case=False, na=False)]['char_name'].unique()))

# Standardize variations of 'Woodsman' to just 'Woodsman'
woodsman_variations = ['Woodsman', 'The Wooodsman', 'The Woodsman']
df.loc[df['char_name'].str.contains('|'.join(woodsman_variations), case=False, na=False), 'char_name'] = 'The Woodsman'


Character names containing 'Woodsman':
['The Woodsman', "The Woodsman's Daughter", 'Woodsman']


## Save to CSV

In [288]:
# Save the cleaned dataframe to a CSV file
df.to_csv('data/cleaned_over_the_garden_wall_dialogue.csv', index=False)
print("Data saved to cleaned_over_the_garden_wall_dialogue.csv")


Data saved to cleaned_over_the_garden_wall_dialogue.csv
