In [6]:
import pandas as pd

# Read the text file
with open('data/OtGW_transcript.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Split into episodes and lines
episodes = text.split('\n\n')

# Create lists to store data
char_names = []
episode_nums = []
lines = []

# Parse each episode
for episode in episodes:
    if episode.strip():  # Skip empty episodes
        lines_in_episode = episode.split('\n')
        episode_num = lines_in_episode[0]  # First line contains episode number
        
        # Process each line of dialogue
        for line in lines_in_episode[1:]:  # Skip episode number line
            if ':' in line:
                char_name, dialogue = line.split(':', 1)
                char_names.append(char_name.strip())
                episode_nums.append(episode_num.strip())
                lines.append(dialogue.strip())

# Create DataFrame
df = pd.DataFrame({
    'char_name': char_names,
    'episode': episode_nums,
    'line': lines
})


### Basic Cleaning

In [36]:

df = df[~df['char_name'].str.contains('Chapter', na=False)]
df = df[~df['char_name'].str.startswith('[', na=False)]

# Remove text within square brackets from the 'line' column while keeping other text
df['line'] = df['line'].str.replace(r'\[.*?\]', '', regex=True).str.strip()
df['line'] = df['line'].str.replace(r'\(.*?\)', '', regex=True).str.strip()

# Remove '>>' from the start of character names
df['char_name'] = df['char_name'].str.replace(r'^>>\s*', '', regex=True)


df['char_name'] = df['char_name'].str.replace(r'\(.*?\)', '', regex=True).str.strip()
# Remove rows where 'line' is empty or null after cleaning
df = df[df['line'].notna() & (df['line'] != '')]


# Reset the index after filtering
df = df.reset_index(drop=True)


## Character Names Cleaning

In [45]:
# Print all unique character names
print("All unique character names:")
print(sorted(df['char_name'].unique()))


All unique character names:
['Adelaide', 'All', 'Angel', 'Angel Sings', 'Auntie Whispers', 'Auxiliary Committee', 'Beast', 'Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", 'Both', 'Braids Villager', 'Carriage Driver', 'Cherub', 'Cleaning frog crew member', 'Commitee Sings', 'Committee Number Three', 'Enoch', 'Fred the Horse', 'Frog', 'Frog Bassoon Player', 'Frog Constable', 'Frog Crewmember', 'Frog Crowd', 'Frog Lady', 'George Washington', 'George Washington sings', 'George Washingtong continues singing', 'Gramps Villager', 'Gramps Villager, offscreen', 'Greg', 'Greg and Beatrice', 'High School Boy', 'Jason', 'Jason F', 'Jason Funderberker', 'Jimmy', 'Kathleen', 'Kathleen and Rhondi', 'Larry', 'Lorna', 'Margueritte Grey', 'Mrs. Daniels', 'Narrator', 'Nerdy Girl', 'Old North Wind', 'Partygoer', 'Peacock', 'Pilgrim Pumpkin', 'Pilgrim Villager', 'Pilgrim Villager, nodding', 'Police Officer', 'Ponytails Villager', 'Queen of the Clouds', 'Quincy Endicott', 'Quincy Endicott & Margu

### Wirt

In [39]:
# Print character names containing 'wirt'
print("Character names containing 'Wirt':")
print(sorted(df[df['char_name'].str.contains('wirt', case=False, na=False)]['char_name'].unique()))


Character names containing 'Wirt':
['Wirt', 'Wirt and Greg']


In [33]:
# Standardize variations of 'Wirt' to just 'Wirt'
wirt_variations = [' Wirt', 'BWirt', '>>Wirt', 'Wirt joins in', 'Wirt whispers in dread', 'Wirt, backing away', 'Wirt, halfway out the ditch, lying in the dirt', 'Wirt, hunched over and out of breath', 'Wirt, offscreen', 'Wirt, waking from his sleep']
df.loc[df['char_name'].str.contains('|'.join(wirt_variations), case=False, na=False), 'char_name'] = 'Wirt'

# Print to verify changes
print("\nUnique character names containing 'Wirt' after standardization:")
print(sorted(df[df['char_name'].str.contains('wirt', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Wirt' after standardization:
['Wirt', 'Wirt & Greg', 'Wirt and Greg']


### Greg

In [34]:
# Print character names containing 'wirt'
print("Character names containing 'Greg':")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))


Character names containing 'Greg':
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg and Beatrice', 'Wirt & Greg', 'Wirt and Greg']


In [35]:
# Standardize variations of 'Greg' to just 'Greg'
greg_variations = [f'Greg (threateningly)', 'Greg starts singing', '>>Greg', 'Greg, after thinking for a moment', 'Greg, crawling out of his ditch', 'Greg (threateningly)', 'Greg (threateningly)']
df.loc[df['char_name'].str.contains('|'.join(greg_variations), case=False, na=False), 'char_name'] = 'Greg'

# Print to verify changes
print("\nUnique character names containing 'Greg' after standardization:")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Greg' after standardization:
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg and Beatrice', 'Wirt & Greg', 'Wirt and Greg']


  df.loc[df['char_name'].str.contains('|'.join(greg_variations), case=False, na=False), 'char_name'] = 'Greg'


### Wirt and Greg

In [37]:
# Print character names containing 'wirt'
print("Character names containing 'Wirt and Greg':")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))


Character names containing 'Wirt and Greg':
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg and Beatrice', 'Wirt & Greg', 'Wirt and Greg']


In [38]:
# Standardize variations of 'Greg' to just 'Greg'
greg_variations = [f'Wirt & Greg']
df.loc[df['char_name'].str.contains('|'.join(greg_variations), case=False, na=False), 'char_name'] = 'Wirt and Greg'

# Print to verify changes
print("\nUnique character names containing 'Greg' after standardization:")
print(sorted(df[df['char_name'].str.contains('greg', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Greg' after standardization:
["Beatrice, sitting on Greg's teapot hat", 'Greg', 'Greg and Beatrice', 'Wirt and Greg']


### Beatrice

In [40]:
# Print character names containing 'wirt'
print("Character names containing 'Beatrice':")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))


Character names containing 'Beatrice':
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", "Beatrice's Mother", "Beatrice's mom", 'Beatrice, backing away', 'Beatrice, flying', 'Beatrice, from atop a branch', 'Beatrice, from within the bush', 'Beatrice, offscreen', 'Beatrice, on the ground by the crops', "Beatrice, sitting on Greg's teapot hat", 'Beatrice, speaking to the horse', 'Beatrice, taking off', 'Greg and Beatrice']


In [42]:
# Standardize variations of 'Beatrice' to just 'Beatrice'
beatrice_variations = ['Beatrice, backing away', 'Beatrice, flying', 'Beatrice, from atop a branch', 'Beatrice, from within the bush', 'Beatrice, offscreen', 'Beatrice, on the ground by the crops', "Beatrice, sitting on Greg's teapot hat", 'Beatrice, speaking to the horse', 'Beatrice, taking off']
df.loc[df['char_name'].str.contains('|'.join(beatrice_variations), case=False, na=False), 'char_name'] = 'Beatrice'

# Print to verify changes
print("\nUnique character names containing 'Beatrice' after standardization:")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Beatrice' after standardization:
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", "Beatrice's Mother", "Beatrice's mom", 'Greg and Beatrice']


### Beatrice's Mom

In [43]:
# Print character names containing 'wirt'
print("Character names containing 'Beatrice's Mom':")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))


Character names containing 'Beatrice's Mom':
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", "Beatrice's Mother", "Beatrice's mom", 'Greg and Beatrice']


In [44]:
# Standardize variations of 'Beatrice' to just 'Beatrice'
beatrice_variations = ["Beatrice's Mother", "Beatrice's mom"]
df.loc[df['char_name'].str.contains('|'.join(beatrice_variations), case=False, na=False), 'char_name'] = "Beatrice's Mom"

# Print to verify changes
print("\nUnique character names containing 'Beatrice' after standardization:")
print(sorted(df[df['char_name'].str.contains('beatrice', case=False, na=False)]['char_name'].unique()))



Unique character names containing 'Beatrice' after standardization:
['Beatrice', 'Beatrice & Fred the Horse', "Beatrice's Mom", 'Greg and Beatrice']


## Save to CSV

In [47]:
# Save the cleaned dataframe to a CSV file
df.to_csv('data/cleaned_over_the_garden_wall_dialogue.csv', index=False)
print("Data saved to cleaned_over_the_garden_wall_dialogue.csv")


Data saved to cleaned_over_the_garden_wall_dialogue.csv
