In [3]:
import re
import requests
import bs4

In [4]:
# Lookup table from speaker labels as they appear in the script files to the
# name that should be written to the dialogue files. clean_name() uses it as
# its default correction table.
#
# Several keys contain a lower-case 'l' or 'i' where the mapped value has a
# capital 'I' (e.g. 'MACE WlNDU') -- presumably transcription artifacts in the
# source scripts; verify against the script files before removing any of them.
# Accented spellings are mapped to unaccented ones.
NAME_DICT = {'BEN': "OBI-WAN",
             'THREEPIO': "C-3PO",
             'CREATURE': "YODA",  # the Episode IV cell filters this key out
             'DARTH SlDIOUS': "DARTH SIDIOUS",
             'PALPATINE': "DARTH SIDIOUS",
             'EMPEROR': "DARTH SIDIOUS",
             'FlRESHIP PILOT': "FIRESHIP PILOT",
             'GlDDEAN DANU': "GIDDEAN DANU",
             'GiDDEAN DANU': "GIDDEAN DANU",
             'Kl-ADI-MUNDI': "KI-ADI-MUNDI",
             'TlON MEDON': "TION MEDON",
             'MACE WlNDU': "MACE WINDU",
             'MACE WiNDU': "MACE WINDU",
             'FANGZAR': "FANG ZAR",
             'QUI -GON': "QUI-GON",
             'PADMÉ': "PADME",
             'CORDÉ': "CORDE",
             'FODE/BEED': "FODE & BEED",
             # Single-letter labels; NOTE(review): presumably how one script
             # labels the two speakers FODE and BEED -- confirm.
             'A': "FODE",
             'B': "BEED"}

In [5]:
def read_script(name, encoding=None):
    """
    Load a script text file from the 'StarWars Scripts/' folder.

    Inputs:
        name (string): file name (with extension) inside 'StarWars Scripts/'
        encoding (string): text encoding handed to open(), i.e. 'gbk';
            None lets open() pick its default

    Outputs:
        (list of strings) one entry per line of the file, with the trailing
        newline char removed
    """
    script_path = 'StarWars Scripts/' + name
    script_lines = []
    with open(script_path, 'r', encoding=encoding) as script_file:
        # Iterating the handle yields the same lines readlines() would.
        for raw_line in script_file:
            script_lines.append(raw_line.rstrip("\n"))
    return script_lines

In [6]:
def line_type(line, num_spaces):
    """
    Classify a script line roughly by its indentation: report whether the
    amount of leading whitespace matches the expected width.

    Inputs:
        - line (string): a line from the script
        - num_spaces (int): expected number of leading whitespace chars

    Returns:
        (Bool) True when the line starts with exactly num_spaces leading
        whitespace chars, False otherwise
    """
    without_indent = line.lstrip()
    indent_width = len(line) - len(without_indent)
    return indent_width == num_spaces

In [7]:
def clean_name(line, name_party_line=None):
    """
    Given a line containing a name, parses the line and returns the name.

    The line is stripped, a possessive suffix (text from the first
    apostrophe on, when "'S" is present) and a parenthetical (text from the
    first "(" on) are cut off, the result is looked up in the correction
    table, and finally only the leading upper-case part is kept.

    Inputs:
        - line (string): a line from the list
        - name_party_line (dict): takes care of wrong spellings; when None
          (the default) the module-level NAME_DICT is used

    Returns:
        (string) cleaned name, or None when the text does not start with a
        run of upper-case letters / digits / '&' / '-' / whitespace that is
        at least two characters long
    """
    if name_party_line is None:
        # Resolved at call time so that re-running the NAME_DICT cell takes
        # effect without having to re-define this function.
        name_party_line = NAME_DICT

    name = line.strip()

    # "BEN'S VOICE" -> "BEN". Cutting at the apostrophe keeps hyphenated
    # names whole and cannot fail the way an unmatched regex does.
    if "'S" in name:
        name = name.split("'", 1)[0].strip()

    # "RED LEADER (over radio)" -> "RED LEADER". Everything before the first
    # parenthesis is the name, including multi-word and hyphenated names.
    if "(" in name:
        name = name.split("(", 1)[0].strip()

    name = name_party_line.get(name, name)

    # Keep the leading upper-case part; the last matched character may be
    # anything except a lower-case letter or '#'.
    match = re.search(r"^[A-Z0-9&\s-]+[^a-z#]", name)
    if not match:
        return None

    return match.group().strip()

In [15]:
def write_dialogue(condition, cond_line, write_line, file):
    """
    Write one stripped script line to the output file, ending it with a
    newline when the condition holds and with a single space otherwise.

    Inputs:
        - condition (a function): called with cond_line; its truthiness
          decides which terminator is written
        - cond_line (string): the line handed to condition
        - write_line (string): the line to write (surrounding whitespace is
          stripped first)
        - file (TextIOWrapper): connection to the .tsv file

    Returns:
        (None)
    """
    terminator = "\n" if condition(cond_line) else " "
    file.write(write_line.strip() + terminator)

## Star Wars Episode IV: A New Hope

In [32]:
script = read_script('StarWars_EpisodeIV_script.txt')

# This episode uses the correction table without the 'CREATURE' entry.
# Built once here instead of on every name line.
# NOTE(review): presumably 'CREATURE' is not Yoda in this script -- confirm.
episode_iv_names = {key: val for key, val in NAME_DICT.items()
                    if key != "CREATURE"}

# The with-block closes the output file even if parsing raises part-way.
with open('StarWars Dialogues/StarWars_EpisodeIV_dialogues.tsv', 'w') as f:
    # Starts at line 50 (presumably to skip front matter -- confirm); the
    # final line is only ever used as next_line.
    for i in range(50, len(script) - 1):
        line, next_line = script[i], script[i + 1]

        # 20 leading spaces: speaker name line.
        if line_type(line, 20):
            name = clean_name(line, episode_iv_names)
            if not name:
                continue
            f.write(name + '\t')

        # 10 leading spaces: dialogue line; the row ends when the following
        # line is blank, otherwise the dialogue continues on the same row.
        elif line_type(line, 10):
            write_dialogue(lambda x: not x.strip(), next_line, line, f)

## Star Wars Episode V: The Empire Strikes Back

In [34]:
script = read_script('StarWars_EpisodeV_script.txt', encoding=None)

# The with-block closes the output file even if parsing raises part-way.
with open('StarWars Dialogues/StarWars_EpisodeV_dialogues.tsv', 'w') as f:
    # Starts at line 60 (presumably to skip front matter -- confirm); the
    # final line is only ever used as next_line.
    for i in range(60, len(script) - 1):
        line, next_line = script[i], script[i + 1]

        if ":" in line:
            # Split on the first colon only, so a colon inside the dialogue
            # does not raise "too many values to unpack".
            name, dialogue = line.split(":", 1)
            if name in ["INTERIOR", "EXTERIOR"]:
                continue

            name = clean_name(name, NAME_DICT)
            if not name:
                continue

            # Drop (...) and [...] asides from the spoken text.
            dialogue = re.sub(r"[\(\[].*?[\)\]]", "", dialogue).strip()
            # The row ends when the next line is empty.
            write_dialogue(lambda x: not x, next_line,
                           name + "\t" + dialogue, f)

        # Non-empty, unindented line without a colon: written as a
        # continuation of the current row. The row ends when the next line
        # is blank or indented.
        elif line and line_type(line, 0):
            write_dialogue(lambda x: not x.strip() or not line_type(x, 0),
                           next_line, line, f)

## Star Wars Episode VI: Return of the Jedi

In [35]:
script = read_script('StarWars_EpisodeVI_script.txt', encoding=None)

# The with-block closes the output file even if parsing raises part-way.
with open('StarWars Dialogues/StarWars_EpisodeVI_dialogues.tsv', 'w') as f:
    # Starts at line 70 (presumably to skip front matter -- confirm); the
    # final line is only ever used as next_line.
    for i in range(70, len(script) - 1):
        line, next_line = script[i], script[i + 1]

        # 30 leading spaces: speaker name line. "FADE OUT" shares that
        # indentation and is skipped explicitly.
        if line_type(line, 30):
            name = clean_name(line, NAME_DICT)
            if not name or name == "FADE OUT":
                continue
            f.write(name + "\t")

        # 15 leading spaces: dialogue line; the row ends when the following
        # line is blank.
        elif line_type(line, 15):
            write_dialogue(lambda x: not x.strip(), next_line, line, f)

## Star Wars Episode III: Revenge of the Sith

In [27]:
# Download the Episode III script page and save its text to the scripts folder.
url = "https://www.imsdb.com/scripts/Star-Wars-Revenge-of-the-Sith.html"
# A timeout keeps the cell from hanging forever on a stalled connection.
request = requests.get(url, timeout=30)
# Fail here with a clear HTTP error instead of parsing an error page below.
request.raise_for_status()
# NOTE(review): re-encoding the decoded text as ISO-8859-1 presumably hands
# the original bytes back to BeautifulSoup so it can detect the encoding
# itself -- confirm.
text = request.text.encode('iso-8859-1')
soup = bs4.BeautifulSoup(text, "html5lib")
# The script body is taken from the <td class="scrtext"> element.
text = soup.find('td', class_ = "scrtext").text

with open('StarWars Scripts/StarWars_EpisodeIII_script.txt', 'w') as f:
    # Non-breaking spaces become ordinary spaces.
    f.write(text.replace("\xa0", " "))

In [283]:
# Load the saved Episode III script as a list of lines; encoding=None is the
# default of read_script and is passed through to open().
script = read_script('StarWars_EpisodeIII_script.txt', encoding=None)

In [284]:
# The with-block closes the output file even if parsing raises part-way.
with open('StarWars Dialogues/StarWars_EpisodeIII_dialogues.tsv', 'w') as f:
    for line in script:
        # Dialogue rows are unindented "NAME: text" lines.
        if ":" in line and len(line) == len(line.lstrip()):
            name, dialogue = line.split(":", 1)
            name = clean_name(name, NAME_DICT)
            # clean_name returns None when the text before the colon is not
            # an upper-case name; skip such lines instead of failing on
            # None.strip().
            if not name:
                continue
            # Drop (...) and [...] asides from the spoken text.
            f.write(name.strip() + "\t" +
                    re.sub(r"[\(\[].*?[\)\]]", "", dialogue).strip() + "\n")

## Star Wars Episode II: Attack of the Clones

In [294]:
# Download the Episode II script page and save its text to the scripts folder.
url = "https://www.imsdb.com/scripts/Star-Wars-Attack-of-the-Clones.html"
# A timeout keeps the cell from hanging forever on a stalled connection.
request = requests.get(url, timeout=30)
# Fail here with a clear HTTP error instead of parsing an error page below.
request.raise_for_status()
# NOTE(review): re-encoding the decoded text as ISO-8859-1 presumably hands
# the original bytes back to BeautifulSoup so it can detect the encoding
# itself -- confirm.
text = request.text.encode('iso-8859-1')
soup = bs4.BeautifulSoup(text, "html5lib")
# The script body is taken from the <td class="scrtext"> element.
text = soup.find('td', class_ = "scrtext").text

with open('StarWars Scripts/StarWars_EpisodeII_script.txt', 'wb') as f:
    # Tabs become four spaces so indentation can be counted in spaces later;
    # the file is stored as UTF-8 bytes.
    f.write(text.replace("\t", "    ").encode('utf8'))

In [299]:
# Load the saved Episode II script (stored as UTF-8) through the shared
# read_script helper rather than repeating its open/readlines logic here.
script = read_script('StarWars_EpisodeII_script.txt', encoding='utf-8')

In [300]:
# Leading-whitespace widths used by the Episode II parsing loop to tell
# speaker-name lines from dialogue lines.
NAME_WHITESPACE = 16      # indentation of a speaker-name line
DIALOGUE_WHITESPACE = 12  # indentation of a dialogue line

In [301]:
# The with-block closes the output file even if parsing raises part-way.
with open('StarWars Dialogues/StarWars_EpisodeII_dialogues.txt', 'w',
          encoding='utf-8') as f:
    # Starts at line 30 (presumably to skip front matter -- confirm); the
    # final line is only ever used as next_line.
    for i in range(30, len(script) - 1):
        line = script[i]
        next_line = script[i + 1]

        # Speaker-name line: expected indentation and no parenthetical.
        if line_type(line, NAME_WHITESPACE) and "(" not in line:
            name = clean_name(line, NAME_DICT)
            if not name:
                continue
            f.write(name + "\t")

        # Dialogue line: the row ends when the following line is empty,
        # otherwise the dialogue continues on the same row. Only leading
        # whitespace is removed, so trailing whitespace is written as-is.
        elif line_type(line, DIALOGUE_WHITESPACE):
            terminator = "\n" if not next_line else " "
            f.write(line.lstrip() + terminator)

## Star Wars Episode I: The Phantom Menace

In [66]:
# Download the Episode I script page and save its text to the scripts folder.
url = "https://www.imsdb.com/scripts/Star-Wars-The-Phantom-Menace.html"
# A timeout keeps the cell from hanging forever on a stalled connection.
request = requests.get(url, timeout=30)
# Fail here with a clear HTTP error instead of parsing an error page below.
request.raise_for_status()
# NOTE(review): re-encoding the decoded text as ISO-8859-1 presumably hands
# the original bytes back to BeautifulSoup so it can detect the encoding
# itself -- confirm.
text = request.text.encode('iso-8859-1')
soup = bs4.BeautifulSoup(text, "html5lib")
# The script body is taken from the <td class="scrtext"> element.
text = soup.find('td', class_ = "scrtext").text

# Written as UTF-8 explicitly because this file is read back with
# encoding='utf-8'; relying on the platform default could produce a file
# that fails to decode.
with open('StarWars Scripts/StarWars_EpisodeI_script.txt', 'w',
          encoding='utf-8') as f:
    # Non-breaking spaces become ordinary spaces.
    f.write(text.replace("\xa0", " "))

In [311]:
# Load the saved Episode I script as UTF-8 through the shared read_script
# helper rather than repeating its open/readlines logic here.
script = read_script('StarWars_EpisodeI_script.txt', encoding='utf-8')

In [2]:
# The with-block closes the output file even if parsing raises part-way.
# The script is read as UTF-8, so the dialogues are written as UTF-8 too
# instead of depending on the platform default encoding.
with open('StarWars Dialogues/StarWars_EpisodeI_dialogues.txt', 'w',
          encoding='utf-8') as f:
    # Starts at line 30 (presumably to skip front matter -- confirm). The
    # upper bound is kept as-is, so the final line of the script is not
    # examined.
    for i in range(30, len(script) - 1):
        line = script[i]

        # Dialogue rows are unindented lines that start with an upper-case
        # letter and contain a colon.
        if (":" in line and re.search(r"^[A-Z]+", line)
                and len(line) == len(line.lstrip())):
            name, dialogue = line.split(":", 1)
            name = clean_name(name, NAME_DICT)
            if not name:
                continue
            # Drop (...) and [...] asides from the spoken text.
            spoken = re.sub(r"[\(\[].*?[\)\]]", "", dialogue).strip()
            f.write(name.strip() + "\t" + spoken + "\n")

NameError: name 'script' is not defined

In [315]:
# Preview the Episode I lines that look like "NAME : dialogue" rows:
# unindented lines that contain a colon.
for i in range(30, len(script) - 1):
    line, next_line = script[i], script[i + 1]

    # A line equal to its own lstrip() has no leading whitespace.
    starts_at_margin = line == line.lstrip()
    if starts_at_margin and ":" in line:
        print(line)

QUI-GON : (off screen voice) Captain.
CAPTAIN : Yes, sir?
QUI-GON : (V.O) Tell them we wish to board at once.
CAPTAIN : Yes, sir.
CAPTAIN : (cont'd) With all due respect for the Trade Federation, the
NUTE : Yes, yes, of coarse...ahhh...as you know, our blockade is perfectly
PK-4 : They must be important if the Viceroy sent one of those useless
TC-14 : I'm TC-14 at your service. This way, please.
EG-9 : A Republic cruiser! That's trouble...don't you think?
PR-4 : I'm not made to think.
TC-14 : I hope you honoured sirs with the most comfortable here. My master
OBI-WAN : I have a bad feeling about this.
QUI-GON : I don't sense anything.
OBI-WAN : It's not about the mission, Master, it's
QUI-GON : Don't center on your anxiety, Obi-Wan. Keep your concentration
OBI-WAN : Master Yoda says I should be mindful of the future...
QUI-GON : .....but not at the expense of the moment. Be mindful of the
OBI-WAN : Yes, Master...how do you think the trade viceroy will deal with
QUI-GON : These Federatio