In [185]:
import re
import requests
import bs4

In [186]:
# Lookup table from a speaker name as it is spelled in a script to the single
# canonical name used in the dialogue files. It covers aliases (BEN, EMPEROR),
# misspellings where a lowercase "l" or "i" stands in for a capital "I", a
# missing space, and an accented spelling.
NAME_DICT = {
    'BEN': "OBI-WAN",
    'THREEPIO': "C-3PO",
    'DARTH SlDIOUS': "DARTH SIDIOUS",
    'PALPATINE': "DARTH SIDIOUS",
    'EMPEROR': "DARTH SIDIOUS",
    'FlRESHIP PILOT': "FIRESHIP PILOT",
    'GlDDEAN DANU': "GIDDEAN DANU",
    'GiDDEAN DANU': "GIDDEAN DANU",
    'Kl-ADI-MUNDI': "KI-ADI-MUNDI",
    'TlON MEDON': "TION MEDON",
    'MACE WlNDU': "MACE WINDU",
    'MACE WiNDU': "MACE WINDU",
    'FANGZAR': "FANG ZAR",
    "PADMÉ": "PADME",
}

## Star Wars Episode IV: A New Hope

In [187]:
# Read the Episode IV script into `script`: one string per line, with the
# trailing newline removed.
with open('StarWars Scripts/StarWars_EpisodeIV_script.txt', 'r') as script_file:
    script = [raw_line.rstrip("\n") for raw_line in script_file]

In [188]:
# Leading-whitespace widths used to classify lines of the Episode IV script.
# NOTE: both names are rebound with different values for later episodes, so
# re-run this cell before re-running the Episode IV parsing cell.
NAME_WHITESPACE = 20  # indentation of a speaker-name line
DIALOGUE_WHITESPACE = 10  # indentation of a dialogue line

In [189]:
# Write the Episode IV dialogue file as "NAME<TAB>dialogue" records.
#
# A line indented by NAME_WHITESPACE characters is a speaker cue; a line
# indented by DIALOGUE_WHITESPACE characters is dialogue. The first 50 script
# lines are skipped. The output is opened in a `with` block so the handle is
# closed even if parsing raises part-way through.
with open('StarWars Dialogues/StarWars_EpisodeIV_dialogues.txt', 'w') as dialogue_file:
    for line in script[50:]:
        indent = len(line) - len(line.lstrip())

        if indent == NAME_WHITESPACE:
            name = line.strip()
            # Take care of e.g. BEN'S VOICE, RED TEN'S VOICE: keep only the
            # part before the possessive.
            if "'S" in name:
                possessive = re.search(r"^[A-Z\s]+[^']", name)
                # Leave the cue untouched when the pattern does not match
                # instead of failing on `None.group()`.
                if possessive:
                    name = possessive.group()
            name = NAME_DICT.get(name, name)
            dialogue_file.write(name + '\t')

        elif indent == DIALOGUE_WHITESPACE:
            if len(line) == len(line.rstrip()):
                # No trailing whitespace: this line terminates the record.
                dialogue_file.write(line.strip() + '\n')
            else:
                # Trailing whitespace is kept and no newline is written, so the
                # next dialogue line continues the same record (presumably the
                # script marks wrapped lines this way -- confirm against data).
                dialogue_file.write(line.lstrip())

## Star Wars Episode V: The Empire Strikes Back

In [190]:
# Read the Episode V script into `script`: one string per line, with the
# trailing newline removed.
with open('StarWars Scripts/StarWars_EpisodeV_script.txt', 'r') as script_file:
    script = [raw_line.rstrip("\n") for raw_line in script_file]

In [191]:
# Write the Episode V dialogue file as "NAME<TAB>dialogue" records.
#
# In this script a speaker cue and the start of the dialogue share one line,
# separated by a colon ("NAME: text"); continuation lines start in column 0.
# A record ends when the following line is empty (or, for continuation lines,
# indented). Processing starts at line index 60, and the final script line is
# only ever used as a look-ahead, never processed itself.
with open('StarWars Dialogues/StarWars_EpisodeV_dialogues.txt', 'w') as dialogue_file:
    for i in range(60, len(script) - 1):
        line = script[i]
        next_line = script[i + 1]

        if ":" in line:
            # Split on the first colon only: a second colon inside the dialogue
            # would otherwise make the two-name unpacking raise ValueError.
            name, dialogue = line.split(":", 1)
            if name in ["INTERIOR", "EXTERIOR"]:
                continue
            if "'S" in name:
                # e.g. "LUKE'S VOICE" -> "LUKE"
                possessive = re.search(r"^[A-Z\s]+[^']", name)
                if possessive:
                    name = possessive.group()
            name = name.strip()
            name = NAME_DICT.get(name, name)
            # Drop parenthesised / bracketed stage directions from the text.
            spoken = re.sub(r"[\(\[].*?[\)\]]", "", dialogue).strip()
            # An empty next line closes the record; otherwise it continues.
            terminator = " " if next_line else "\n"
            dialogue_file.write(name + "\t" + spoken + terminator)

        elif line and len(line) == len(line.lstrip()):
            # Unindented, non-empty line without a colon: dialogue continuation.
            if not next_line or len(next_line) != len(next_line.lstrip()):
                dialogue_file.write(line + "\n")
            else:
                dialogue_file.write(line + " ")

## Star Wars Episode VI: Return of the Jedi

In [192]:
# Read the Episode VI script into `script`: one string per line, with the
# trailing newline removed.
with open('StarWars Scripts/StarWars_EpisodeVI_script.txt', 'r') as script_file:
    script = [raw_line.rstrip("\n") for raw_line in script_file]

In [193]:
# Leading-whitespace widths used to classify lines of the Episode VI script.
# NOTE: these rebind the names used for other episodes; re-run this cell
# before re-running the Episode VI parsing cell.
NAME_WHITESPACE = 30  # indentation of a speaker-name line
DIALOGUE_WHITESPACE = 15  # indentation of a dialogue line

In [194]:
# Write the Episode VI dialogue file as "NAME<TAB>dialogue" records.
#
# A line indented by NAME_WHITESPACE characters is a speaker cue; a line
# indented by DIALOGUE_WHITESPACE characters is dialogue. Dialogue lines are
# joined with spaces until the following script line is empty. Processing
# starts at line index 70; the final script line is only used as look-ahead.
with open('StarWars Dialogues/StarWars_EpisodeVI_dialogues.txt', 'w') as dialogue_file:
    for i in range(70, len(script) - 1):
        line = script[i]
        indent = len(line) - len(line.lstrip())

        if indent == NAME_WHITESPACE:
            # Keep the leading name and stop before a "(" or "#" suffix.
            cue = re.search(r"^[A-Z][\w\s]+[^\(#]", line.lstrip())
            if not cue:
                # Not shaped like a speaker cue; ignore the line.
                continue
            name = cue.group().strip()
            name = NAME_DICT.get(name, name)
            dialogue_file.write(name + "\t")

        elif indent == DIALOGUE_WHITESPACE:
            # An empty next line closes the record; otherwise it continues.
            terminator = " " if script[i + 1] else "\n"
            dialogue_file.write(line.lstrip() + terminator)

## Star Wars Episode III: Revenge of the Sith

In [205]:
# Download the Episode III script page and save the script text locally.
url = "https://www.imsdb.com/scripts/Star-Wars-Revenge-of-the-Sith.html"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Stop on an HTTP error status instead of trying to parse an error page.
response.raise_for_status()
# NOTE(review): re-encoding as ISO-8859-1 presumably recovers the raw bytes so
# BeautifulSoup can detect the real encoding itself -- confirm.
text = response.text.encode('iso-8859-1')
soup = bs4.BeautifulSoup(text, "html5lib")
script_cell = soup.find('td', class_ = "scrtext")
if script_cell is None:
    # Make a changed page layout obvious rather than an opaque AttributeError.
    raise ValueError("No <td class='scrtext'> element found at " + url)
text = script_cell.text

with open('StarWars Scripts/StarWars_EpisodeIII_script.txt', 'w') as script_file:
    # Replace non-breaking spaces with ordinary spaces before saving.
    script_file.write(text.replace("\xa0", " "))

In [206]:
# Read the saved Episode III script into `script`: one string per line, with
# the trailing newline removed.
with open('StarWars Scripts/StarWars_EpisodeIII_script.txt', 'r') as script_file:
    script = [raw_line.rstrip("\n") for raw_line in script_file]

In [207]:
# Write the Episode III dialogue file as "NAME<TAB>dialogue" records.
#
# Only unindented lines containing a colon are treated as dialogue
# ("NAME: text"); each such line becomes one complete record.
with open('StarWars Dialogues/StarWars_EpisodeIII_dialogues.txt', 'w') as dialogue_file:
    for line in script:
        if ":" not in line or len(line) != len(line.lstrip()):
            continue

        name, dialogue = line.split(":", 1)
        # Look the name up by its stripped form so stray spaces before the
        # colon do not defeat normalisation; unknown names pass through as-is.
        name = NAME_DICT.get(name.strip(), name)
        # Keep the leading run of capitals/digits/hyphens (plus one following
        # non-lowercase character), dropping any lowercase tail.
        cue = re.search(r"^[A-Z0-9\s-]+[^a-z]", name)
        if not cue:
            # The text before the colon is not shaped like a speaker cue;
            # skip the line rather than failing on `None.group()`.
            continue
        # Drop parenthesised / bracketed stage directions from the text.
        spoken = re.sub(r"[\(\[].*?[\)\]]", "", dialogue).strip()
        dialogue_file.write(cue.group().strip() + "\t" + spoken + "\n")

## Star Wars Episode II: Attack of the Clones

In [208]:
# Download the Episode II script page and save the script text locally.
url = "https://www.imsdb.com/scripts/Star-Wars-Attack-of-the-Clones.html"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Stop on an HTTP error status instead of trying to parse an error page.
response.raise_for_status()
# NOTE(review): re-encoding as ISO-8859-1 presumably recovers the raw bytes so
# BeautifulSoup can detect the real encoding itself -- confirm.
text = response.text.encode('iso-8859-1')
soup = bs4.BeautifulSoup(text, "html5lib")
script_cell = soup.find('td', class_ = "scrtext")
if script_cell is None:
    # Make a changed page layout obvious rather than an opaque AttributeError.
    raise ValueError("No <td class='scrtext'> element found at " + url)
text = script_cell.text

with open('StarWars Scripts/StarWars_EpisodeII_script.txt', 'wb') as script_file:
    # Expand tabs to four spaces, then store the text as UTF-8 bytes.
    script_file.write(text.replace("\t", "    ").encode('utf8'))

In [209]:
# Read the saved Episode II script (UTF-8) into `script`: one string per line,
# with the trailing newline removed.
with open('StarWars Scripts/StarWars_EpisodeII_script.txt', 'r', encoding='utf-8') as script_file:
    script = [raw_line.rstrip("\n") for raw_line in script_file]

In [210]:
# Leading-whitespace widths used to classify lines of the Episode II script.
# NOTE: these rebind the names used for other episodes; re-run this cell
# before re-running the Episode II parsing cell.
NAME_WHITESPACE = 16  # indentation of a speaker-name line
DIALOGUE_WHITESPACE = 12  # indentation of a dialogue line

In [211]:
# Write the Episode II dialogue file (UTF-8) as "NAME<TAB>dialogue" records.
#
# A line indented by NAME_WHITESPACE characters that contains no "(" is a
# speaker cue; a line indented by DIALOGUE_WHITESPACE characters is dialogue.
# Dialogue lines are joined with spaces until the following script line is
# empty. Processing starts at line index 30; the final script line is only
# used as look-ahead. The `with` block closes the file even if parsing raises.
with open('StarWars Dialogues/StarWars_EpisodeII_dialogues.txt', 'w', encoding='utf-8') as dialogue_file:
    for i in range(30, len(script) - 1):
        line = script[i]
        next_line = script[i + 1]
        indent = len(line) - len(line.lstrip())

        if indent == NAME_WHITESPACE and "(" not in line:
            name = line.strip()
            name = NAME_DICT.get(name, name)
            dialogue_file.write(name + "\t")

        elif indent == DIALOGUE_WHITESPACE:
            # An empty next line closes the record; otherwise it continues.
            terminator = " " if next_line else "\n"
            dialogue_file.write(line.lstrip() + terminator)