In [2]:
import re
import os

# Read the script file
with open('script.txt', 'r', encoding='utf-8') as file:
    script_text = file.read()

print("Original script length:", len(script_text))


Original script length: 164837


In [3]:
def clean_script(text):
    # Split the text into lines
    lines = text.split('\n')
    cleaned_lines = []
    current_character = None
    
    # Patterns to remove
    skip_patterns = [
        r'^CONTINUED',
        r'^FADE IN',
        r'^EXT\.',
        r'^INT\.',
        r'^\(.*\)$',  # Stage directions in parentheses
        r'^\[.*\]$',  # Notes in brackets
        r'^CUT TO:',
        r'^DISSOLVE TO:',
        r'^ANGLE ON',
        r'^CLOSE ON',
        r'^WIDE ON',
        r'^POV',
        r'^SCENE',
    ]
    
    skip_pattern = '|'.join(skip_patterns)
    
    for line in lines:
        # Strip whitespace
        line = line.strip()
        
        # Skip empty lines
        if not line:
            continue
            
        # Skip lines matching our patterns
        if re.match(skip_pattern, line, re.IGNORECASE):
            continue
            
        # Check if line is a character name (all caps)
        if line.isupper() and len(line) > 1 and not line.startswith('('):
            current_character = line
            cleaned_lines.append(line)
        # If it's dialogue and we have a character speaking
        elif current_character and line and not line.isupper():
            cleaned_lines.append(line)
            
    return '\n'.join(cleaned_lines)

# Clean the script
cleaned_text = clean_script(script_text)

# Print a sample of the cleaned text
print("Sample of cleaned text:")
print(cleaned_text[:500])


Sample of cleaned text:
HARRY POTTER AND
THE CHAMBER OF SECRETS
screenplay by STEVEN KLOVES
based on the novel by
J.K. ROWLING
1
1
WIDE HELICOPTER SHOT. Privet Drive. CAMERA CRANES DOWN,
DOWN, OVER the rooftops, FINDS the SECOND FLOOR WINDOW of
NUMBER 4. HARRY POTTER sits in the window.
2
OMITTED
2
3
3
Harry pages through a SCRAPBOOK, stops on a MOVING PHOTO
of Ron and Hermione. SQUAWK! Harry jumps. HEDWIG pecks
at the LOCK slung through her cage door, then glowers at
Harry.
HARRY
I can't, Hedwig. I'm not allowed
to us


In [4]:
# Save the cleaned text to a new file
with open('cleaned_script.txt', 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

print("Cleaned script has been saved to 'cleaned_script.txt'")


Cleaned script has been saved to 'cleaned_script.txt'


In [5]:
def improved_clean_script(text):
    # Split the text into lines
    lines = text.split('\n')
    cleaned_lines = []
    current_character = None
    
    # Additional patterns to remove
    skip_patterns = [
        r'^CONTINUED',
        r'^FADE IN',
        r'^EXT\.',
        r'^INT\.',
        r'^\(.*\)$',  # Stage directions in parentheses
        r'^\[.*\]$',  # Notes in brackets
        r'^CUT TO:',
        r'^DISSOLVE TO:',
        r'^ANGLE ON',
        r'^CLOSE ON',
        r'^WIDE ON',
        r'^POV',
        r'^SCENE',
        r'^\d+$',  # Scene numbers
        r'^OMITTED$',
        r'^THE CHAMBER OF SECRETS',  # Script title headers
        r'^Rev\.',  # Revision dates
        r'^\d+\.\d+\.$',  # Page numbers
        r'^HARRY POTTER AND',  # Title
        r'^screenplay by',
        r'^based on',
        r'O\.S\.',  # Off-screen indicators
        r'CAMERA',
        r'HELICOPTER SHOT',
    ]
    
    skip_pattern = '|'.join(skip_patterns)
    
    # Flag to skip the header section
    header_passed = False
    
    for line in lines:
        # Strip whitespace
        line = line.strip()
        
        # Skip empty lines
        if not line:
            continue
        
        # Skip header until we find actual dialogue
        if not header_passed:
            if 'HARRY POTTER AND' in line or 'screenplay by' in line or 'based on' in line:
                continue
            else:
                header_passed = True
        
        # Skip lines matching our patterns
        if re.match(skip_pattern, line, re.IGNORECASE):
            continue
        
        # Remove stage directions within dialogue (text in parentheses)
        line = re.sub(r'\(.*?\)', '', line).strip()
        
        # Skip if line became empty after removing parentheses
        if not line:
            continue
            
        # Check if line is a character name (all caps)
        if line.isupper() and len(line) > 1 and not line.startswith('('):
            # Remove any numbers or special characters from character names
            current_character = re.sub(r'[^A-Z\s]', '', line).strip()
            if current_character:  # Only add if there's still a name after cleaning
                cleaned_lines.append(current_character)
        # If it's dialogue and we have a character speaking
        elif current_character and line and not line.isupper():
            # Clean up the dialogue line
            dialogue = line.strip()
            if dialogue:  # Only add non-empty dialogue
                cleaned_lines.append(dialogue)
    
    return '\n'.join(cleaned_lines)

# Clean the script with improved function
improved_cleaned_text = improved_clean_script(script_text)

# Save the improved cleaned text
with open('improved_cleaned_script.txt', 'w', encoding='utf-8') as file:
    file.write(improved_cleaned_text)

# Print a sample of the improved cleaned text
print("Sample of improved cleaned text:")
print(improved_cleaned_text[:500])


Sample of improved cleaned text:
JK ROWLING
WIDE HELICOPTER SHOT. Privet Drive. CAMERA CRANES DOWN,
DOWN, OVER the rooftops, FINDS the SECOND FLOOR WINDOW of
NUMBER 4. HARRY POTTER sits in the window.
Harry pages through a SCRAPBOOK, stops on a MOVING PHOTO
of Ron and Hermione. SQUAWK! Harry jumps. HEDWIG pecks
at the LOCK slung through her cage door, then glowers at
Harry.
HARRY
I can't, Hedwig. I'm not allowed
to use magic outside of school.
Besides, if Uncle Vernon -At the sound of the name, HEDWIG SQUAWKS again, LOUDER.
UNC


In [6]:
# Analyze the cleaned text
def analyze_script(text):
    lines = text.split('\n')
    character_lines = {}
    current_character = None
    
    for line in lines:
        if line.isupper():
            current_character = line
            if current_character not in character_lines:
                character_lines[current_character] = 0
        elif current_character:
            character_lines[current_character] += 1
    
    print("Number of characters:", len(character_lines))
    print("\nTop 10 characters by number of lines:")
    sorted_chars = sorted(character_lines.items(), key=lambda x: x[1], reverse=True)
    for char, lines in sorted_chars[:10]:
        print(f"{char}: {lines} lines")

print("Analysis of the improved cleaned script:")
analyze_script(improved_cleaned_text)


Analysis of the improved cleaned script:
Number of characters: 137

Top 10 characters by number of lines:
HARRY: 930 lines
RON: 633 lines
HERMIONE: 402 lines
GILDEROY LOCKHART: 283 lines
DUMBLEDORE: 223 lines
TOM RIDDLE: 213 lines
HAGRID: 156 lines
LUCIUS MALFOY: 148 lines
DRACO: 143 lines
DOBBY: 113 lines
