In [109]:
import re
import requests
import bs4

In [199]:
NAME_DICT = {'BEN': "OBI-WAN",
             'THREEPIO': "C-3PO",
             'CREATURE': "YODA",
             'DARTH SlDIOUS': "DARTH SIDIOUS",
             'PALPATINE': "DARTH SIDIOUS",
             'EMPEROR': "DARTH SIDIOUS",
             'FlRESHIP PILOT': "FIRESHIP PILOT",
             'GlDDEAN DANU': "GIDDEAN DANU",
             'GiDDEAN DANU': "GIDDEAN DANU",
             'Kl-ADI-MUNDI': "KI-ADI-MUNDI",
             'TlON MEDON': "TION MEDON",
             'MACE WlNDU': "MACE WINDU",
             'MACE WiNDU': "MACE WINDU",
             'FANGZAR': "FANG ZAR",
             'QUI -GON': "QUI-GON",
             'PADMÉ': "PADME",
             'CORDÉ': "CORDE"}

In [200]:
def line_type(line, num_spaces):
    """
    Given a line, determines its type rouoghly by counting leading white
    spaces. Returns True if it equals to the desired number.

    Inputs:
        - line (string): a line from the list to check
        - num_spaces (int): number of spaces desired

    Returns:
        (Bool) True if the number of leading white spaces equals to our
        desired number
    """
    return len(line) - len(line.lstrip()) == num_spaces

In [201]:
def clean_name(line, name_party_line=NAME_DICT):
    """
    Given a line containing a name, parses the line and returns the name.

    Inputs:
        - line (string): a line from the list
        - name_party_line (dict): takes care of wrong spellings

    Returns:
        (string) cleaned name
    """
    name = line.lstrip()
    if ("'S") in name:
        name = re.search(r"^[A-Z0-9\s]+[^']", name).group().strip()

    name = name_party_line.get(name, name)

    name = re.search(r"^[A-Z0-9\s-]+[^a-z]", name)
    if not name:
        return None

    return re.sub(r"[\(\[].*?[\)\]]", "", name.group()).strip()

## Star Wars Episode IV: A New Hope

In [202]:
with open('StarWars Scripts/StarWars_EpisodeIV_script.txt', 'r') as f:
    script = [line.rstrip("\n") for line in f.readlines()]

In [203]:
NAME_WHITESPACE = 20
DIALOGUE_WHITESPACE = 10

In [204]:
f = open('StarWars Dialogues/StarWars_EpisodeIV_dialogues.tsv', 'w')

for line in script[50:]:
    if line_type(line, NAME_WHITESPACE):
        name = clean_name(line, {key: val for key, val in NAME_DICT
                                 if key != "CREATURE"})
        if name:
            f.write(name + '\t')

    elif line_type(line, DIALOGUE_WHITESPACE):
        if len(line) == len(line.rstrip()):
            f.write(line.strip() + '\n')
        else:
            f.write(line.lstrip())

f.close()

ValueError: too many values to unpack (expected 2)

## Star Wars Episode V: The Empire Strikes Back

In [188]:
with open('StarWars Scripts/StarWars_EpisodeV_script.txt', 'r') as f:
    script = [line.rstrip("\n") for line in f.readlines()]

In [189]:
f = open('StarWars Dialogues/StarWars_EpisodeV_dialogues.tsv', 'w')

for i in range(60, len(script) - 1):
    line = script[i]
    next_line = script[i + 1]
    
    if (":") in line:
        name, dialogue = line.split(":")
        if name in ["INTERIOR", "EXTERIOR"]:
            continue

        name = clean_name(name, NAME_DICT)
        if name:
            dialogue = re.sub("[\(\[].*?[\)\]]", "", dialogue).strip()
            if not next_line:
                f.write(name + "\t" + dialogue + "\n")
            else:
                f.write(name + "\t" + dialogue + " ")
    
    elif line and len(line) == len(line.lstrip()):
        if not next_line or len(next_line) != len(next_line.lstrip()):
            f.write(line + "\n")
        else:
            f.write(line + " ")

f.close()

## Star Wars Episode VI: Return of the Jedi

In [24]:
with open('StarWars Scripts/StarWars_EpisodeVI_script.txt', 'r') as f:
    script = [line.rstrip("\n") for line in f.readlines()]

In [25]:
NAME_WHITESPACE = 30
DIALOGUE_WHITESPACE = 15

In [190]:
f = open('StarWars Dialogues/StarWars_EpisodeVI_dialogues.txt', 'w')

for i in range(70, len(script) - 1):
    line = script[i]

    if line_type(line, NAME_WHITESPACE):
        name = clean_name(line, NAME_DICT)
        f.write(name + "\t")

    elif line_type(line, DIALOGUE_WHITESPACE):
        if not script[i + 1]:
            f.write(line.lstrip() + "\n")
        else:
            f.write(line.lstrip() + " ")

f.close()

## Star Wars Episode III: Revenge of the Sith

In [27]:
url = "https://www.imsdb.com/scripts/Star-Wars-Revenge-of-the-Sith.html"
request = requests.get(url)
text = request.text.encode('iso-8859-1')
soup = bs4.BeautifulSoup(text, "html5lib")
text = soup.find('td', class_ = "scrtext").text

with open('StarWars Scripts/StarWars_EpisodeIII_script.txt', 'w') as f:
    f.write(text.replace("\xa0", " "))

In [28]:
with open('StarWars Scripts/StarWars_EpisodeIII_script.txt', 'r') as f:
    script = [line.rstrip("\n") for line in f.readlines()]

In [29]:
f = open('StarWars Dialogues/StarWars_EpisodeIII_dialogues.txt', 'w')

for line in script:
    if (":") in line and len(line) == len(line.lstrip()):
        name, dialogue = line.split(":", 1)
        name = NAME_DICT.get(name, name)
        name = re.search("^[A-Z0-9\s-]+[^a-z]", name).group()
        f.write(name.strip() + "\t" + re.sub("[\(\[].*?[\)\]]", "", dialogue).strip() + "\n")

f.close()

## Star Wars Episode II: Attack of the Clones

In [30]:
url = "https://www.imsdb.com/scripts/Star-Wars-Attack-of-the-Clones.html"
request = requests.get(url)
text = request.text.encode('iso-8859-1')
soup = bs4.BeautifulSoup(text, "html5lib")
text = soup.find('td', class_ = "scrtext").text

with open('StarWars Scripts/StarWars_EpisodeII_script.txt', 'wb') as f:
    f.write(text.replace("\t", "    ").encode('utf8'))

In [31]:
with open('StarWars Scripts/StarWars_EpisodeII_script.txt', 'r', encoding='utf-8') as f:
    script = [line.rstrip("\n") for line in f.readlines()]

In [32]:
NAME_WHITESPACE = 16
DIALOGUE_WHITESPACE = 12

In [33]:
f = open('StarWars Dialogues/StarWars_EpisodeII_dialogues.txt', 'w', encoding='utf-8')

for i in range(30, len(script) - 1):
    line = script[i]
    next_line = script[i + 1]

    if len(line) - len(line.lstrip()) == NAME_WHITESPACE and ("(") not in line:
        name = line.strip()
        name = NAME_DICT.get(name, name)
        f.write(name + "\t")

    elif len(line) - len(line.lstrip()) == DIALOGUE_WHITESPACE:
        if not next_line:
            f.write(line.lstrip() + "\n")
        else:
            f.write(line.lstrip() + " ")

f.close()

## Star Wars Episode I: The Phantom Menace

In [66]:
url = "https://www.imsdb.com/scripts/Star-Wars-The-Phantom-Menace.html"
request = requests.get(url)
text = request.text.encode('iso-8859-1')
soup = bs4.BeautifulSoup(text, "html5lib")
text = soup.find('td', class_ = "scrtext").text

with open('StarWars Scripts/StarWars_EpisodeI_script.txt', 'w') as f:
    f.write(text.replace("\xa0", " "))

In [67]:
with open('StarWars Scripts/StarWars_EpisodeI_script.txt', 'r', encoding='utf-8') as f:
    script = [line.rstrip("\n") for line in f.readlines()]

In [68]:
f = open('StarWars Dialogues/StarWars_EpisodeI_dialogues.txt', 'w')

for line in script[30:]:
    if all([(":") in line, re.search("^[A-Z]+", line),
            len(line) == len(line.lstrip())]):
        name, dialogue = line.split(":", 1)
        name = NAME_DICT.get(name.strip(), name.strip())
        if name in ("A", "B", "FODE/BEED", "(O.S) A"):
            continue
        name = re.search("^[A-Z0-9\s-]+[^a-z]", name).group()
        f.write(name.strip() + "\t" + re.sub("[\(\[].*?[\)\]]", "", dialogue).strip() + "\n")

f.close()