# Preprocessing (Scraping and fixing data)


In [1]:
import urllib2
import json
import re
import numpy as np
import csv
import os

In [2]:
def fetch_characters():
    """Scrape character names from Wikipedia's "List_of_Star_Wars_characters".

    One revision of the page is requested through the MediaWiki API and every
    ``{{visible anchor|...}}`` template in its wikitext is collected.  Each
    "|"-separated part of a template becomes one candidate name: square
    brackets are removed, spaces become underscores, a leading "Admiral_" is
    dropped and "_(Star_Wars)" is removed.  "Anakin_Skywalker" and
    "Darth_Sidious" are always added.

    Returns:
        A set of candidate character names (underscore separated).
    """
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    title = "titles=List_of_Star_Wars_characters"
    content = "prop=revisions"
    rvprop = "rvprop=timestamp|content"
    dataformat = "format=json"
    rvdir = "rvdir=older"  # sort revisions from newest to oldest
    end = "rvend=2000-01-03T00:00:00Z"  # start of my time period
    start = "rvstart=2019-01-03T00:00:00Z"  # end of my time period
    limit = "rvlimit=1"  # consider only the first revision

    query = "%s%s&%s&%s&%s&%s&%s&%s&%s&%s" % \
        (baseurl, action, title, content, rvprop, dataformat, rvdir, end, start, limit)

    response = urllib2.urlopen(query)
    try:
        wikisource = response.read()
    finally:
        # urllib2 responses are not context managers, so close explicitly.
        response.close()

    pages = json.loads(wikisource)["query"]["pages"]
    # Only one title was requested, so the single key is the page id.
    wikiid = next(iter(pages))
    text = pages[wikiid]["revisions"][-1]["*"]

    # All characters on this page match the regex below
    regex = r"\{\{visible anchor\|(.*?)\}\}"
    admiral_prefix = "Admiral_"

    characters = set()
    for anchor in re.findall(regex, text):
        for part in anchor.split("|"):
            name = part.replace("[", "").replace("]", "").replace(" ", "_")
            # some prefix/suffix fixes
            if name.startswith(admiral_prefix):
                name = name[len(admiral_prefix):]
            characters.add(name.replace("_(Star_Wars)", ""))

    # Anakin Skywalker and Darth Sidious are special cases
    characters.add("Anakin_Skywalker")
    characters.add("Darth_Sidious")
    return characters


def fetch_wiki_article(title_):
    """Fetch one article revision from starwars.wikia.com (Wookieepedia).

    The request carries the ``redirects`` flag, so the returned title may
    differ from ``title_``.  When the returned title ends with "/Legends",
    that suffix is removed and the lookup is repeated with the remaining
    title.

    Args:
        title_: Article title to look up (underscore separated).

    Returns:
        A tuple ``(wikiid, text, title)``.  ``wikiid`` is the page id reported
        by the API ("-1" when the page does not exist), ``text`` is the
        revision's wikitext (``None`` when the page does not exist) and
        ``title`` is the title reported by the API.
    """
    baseurl = "http://starwars.wikia.com/api.php?"
    action = "action=query"
    # NOTE(review): the title is not percent-encoded; names containing "&" or
    # "#" would corrupt the query string.
    title = "titles=" + title_ + "&&redirects"  # Redirects are gods gift to man
    title = title.encode("utf-8")  # This is our fix for unicode problems
    content = "prop=revisions"
    rvprop = "rvprop=timestamp|content"
    dataformat = "format=json"
    rvdir = "rvdir=older"  # sort revisions from newest to oldest
    end = "rvend=2000-01-03T00:00:00Z"  # start of my time period
    start = "rvstart=2019-01-03T00:00:00Z"  # end of my time period
    limit = "rvlimit=1"  # consider only the first revision

    query = "%s%s&%s&%s&%s&%s&%s&%s&%s&%s" % \
        (baseurl, action, title, content, rvprop, dataformat, rvdir, end, start, limit)

    response = urllib2.urlopen(query)
    try:
        wikisource = response.read()
    finally:
        # urllib2 responses are not context managers, so close explicitly.
        response.close()

    pages = json.loads(wikisource)["query"]["pages"]
    # Only one title was requested, so the single key is the page id.
    wikiid = next(iter(pages))
    page = pages[wikiid]
    title = page["title"]
    text = None
    # Below is equivalent to check if page exists
    if wikiid != "-1":
        text = page["revisions"][-1]["*"]

    # Legends is the comic books of starwars, which is sometimes redirected to.
    # we don't want the characters from that
    if title.endswith("/Legends"):
        canon_title = title.replace("/Legends", "").replace(" ", "_")
        if canon_title == title_.replace(" ", "_"):
            # Asking again for the same title would return the same Legends
            # page and recurse forever; report the article as not found.
            return "-1", None, title
        return fetch_wiki_article(canon_title)
    return wikiid, text, title

def addToDict(character):
    """Look up ``character`` and store its article in ``charDict``.

    The article is stored under the title returned by ``fetch_wiki_article``
    (not under ``character``), and only if its page id has not been recorded
    in ``wiki_ids`` yet.

    Returns:
        False when the lookup reports page id "-1" (no such page), True
        otherwise -- including when the page had already been stored.
    """
    page_id, article, canonical_title = fetch_wiki_article(character)
    found = page_id != "-1"
    if found and page_id not in wiki_ids:
        wiki_ids.add(page_id)
        # Add the wookiepedia title as key and not the character name from wiki
        charDict[canonical_title] = article
    return found
    

In [3]:
# Module-level state that addToDict() fills in:
#   charDict -- article title -> article text
#   wiki_ids -- page ids that have already been stored
wiki_ids = set()
charDict = {}
characters = fetch_characters()
# Names for which no article was found; they are retried in the next cell.
leftovers = [candidate for candidate in characters if not addToDict(candidate)]

## Adding missing characters, deleting some that shouldn't be there

In [4]:
# Cleaning up leftover characters

# Retry every leftover with only the part after its last "_", since a lot of
# them start with a prefix that is not part of the article title.
fixed_leftovers = []
for character in leftovers:
    char = character.split("_")[-1]
    if not addToDict(char):
        fixed_leftovers.append(character)

print("LIST OF CHARACTERS NOT FOUND. HAS LENGTH: %s" % len(fixed_leftovers))
print(fixed_leftovers)

# Article titles looked up by hand for names that still could not be resolved.
manual_fixes = ["Tallissan_Lintra", "Aiolin_Astarte", "Morit_Astarte", "CC-2224",
                "Garazeb_Orrelios", "Kaydel_Ko_Connix", "Dooku", "Weequay", "Orrimaarko", "Rinnrivin_Di",
                "CC-3714", "Temmin_Wexley", "RA-7_protocol_droid", "Breha_Organa", "Saelt-Marae",
                "The_Grand_Inquisitor", "Kaplan_(colonel)"]
for character in manual_fixes:
    addToDict(character)

# Stored article titles that should not be part of the data set.
to_delete = ["Hammerhead", "Velus", "Star Wars: Doctor Aphra", "Kaplan", "Hutt", "Bail Prestor Organa",
             "Teedo", "Karina the Great", "Grand Inquisitor", "Fulcrum", "Weequay", "Senator Organa", "Tup",
             "Rogue Squadron", "Emperor's Royal Guard"]

# Tolerate titles that are not present: a bare `del` raises KeyError when this
# cell is run a second time or when a page was never fetched.
not_in_dict = []
for char in to_delete:
    if char in charDict:
        del charDict[char]
    else:
        not_in_dict.append(char)
if not_in_dict:
    print("TITLES TO DELETE THAT WERE NOT IN charDict: %s" % not_in_dict)


LIST OF CHARACTERS NOT FOUND. HAS LENGTH: 18
[u'Inquisitors', u'Tallissan_"Tallie"_Lintra', u'Aiolin_and_Morit_Astarte', u'Commander_Cody_(CC-2224)', u'Garazeb_"Zeb"_Orrelios', u'Lieutenant_Kaydel_Ko_Connix', u'Count_Dooku<br>{{small', u'Pagetti_Rook_("Weequay")', u'Orrimaarko_("Prune_Face")', u'Has_Obbit', u'Rinnriyin_Di', u'Commander_Fil_(CC-3714)', u'Lieutenant_Connix', u'Orrimarko', u'Temmin_"Snap"_Wexley', u'RA-7_("Death_Star_droid")', u'Breha_Antilles-Organa', u'Saelt-Marae_("Yak_Face")']


## Finding affiliations and goodness score

In [5]:
# Score contributed by each recognised affiliation: +1 or -1.
affil_dict = {
    "Galactic Empire": -1,
    "Galactic Republic": 1,
    "Alliance to Restore the Republic": 1,
    "Confederacy of Independent Systems": -1,
    "First Order": -1,
    "Resistance": 1,
    "New Republic": 1,
    "Trade Federation": -1,
    "Sith": -1,
    "Jedi Order": 1,
    "Crimson Dawn": -1,
    "Cloud-Riders": 1,
}

# Article title -> list of recognised affiliations linked in the article's
# "affiliation=" field.  Characters without any recognised affiliation get no
# entry at all.
char_affil_dict = {}
for char, text in charDict.items():
    # The field is taken to run from "affiliation=" to the first "}}" that is
    # followed by a newline.
    section_match = re.search(r"affiliation=[\S\s]*?\}\}\n", text)
    if section_match is None:
        # No affiliation field in this article; previously an IndexError.
        continue
    for affiliation in re.findall(r'\[\[(.*?)\]\]', section_match.group(0)):
        # A link may be piped ("target|label"); either part may name a faction.
        for affil in affiliation.split("|"):
            if affil in affil_dict:
                char_affil_dict.setdefault(char, []).append(affil)

# Goodness score: sum of the scores of all affiliations found for a character
# (0 when none were found).
char_goodness_dict = {}
for char in charDict:
    char_goodness_dict[char] = sum(
        affil_dict[affil] for affil in char_affil_dict.get(char, []))

## Finding species

In [6]:
# Article title -> species, or "robot: <class>" for articles that only have a
# linked "class=" field.
char_species_dict = {}
# Titles for which neither a species nor a class link was found.
remaining = []
for char, text in charDict.items():
    species_match = re.search(r"species=\*?\[\[(.*?)\]\]", text)
    if species_match:
        # For a piped link ("target|label") keep the link target, the same way
        # the class field is handled below.  The previous code discarded the
        # split result and labelled every piped link "yoda's species".
        char_species_dict[char] = species_match.group(1).lower().split("|")[0]
        continue
    class_match = re.search(r"class=\*?\[\[(.*?)\]\]", text)
    if class_match:
        robot_class = class_match.group(1).lower().split("|")[0]
        char_species_dict[char] = "robot: %s" % robot_class
    else:
        remaining.append(char)

# Manual corrections:
char_species_dict.update({
    "Maz Kanata": "humanoid",
    "Tasu Leech": "human",
    "Bendu": "unknown",
    "Gallius Rax": "human",
    "L3-37": "pilot droid",
    "WAC-47": "pit droid",
    "Sixth Brother": "human",
    "Cylo": "humanoid",
    "Jaxxon": "rabbit",
    "Fifth Brother": "humanoid",
    "Yaddle": "yoda's species",
})

## Saving characters and their wiki pages to disk

In [7]:
# Write one CSV row per stored article: display name (title up to the first
# "/"), title with underscores, species and goodness score.
with open('starwarscharacters.csv', 'w+') as csvfile:
    fieldnames = ['name', 'wookieepedia_name', 'species', 'goodness']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for name in charDict:
        wookieename = name.replace(" ", "_")
        newName = name.split("/")[0]
        writer.writerow({
            'name': newName.encode("utf-8"),
            'wookieepedia_name': wookieename.encode("utf-8"),
            # Species comes from the wiki text as well, so encode it like the
            # other text columns instead of relying on it being pure ASCII.
            'species': char_species_dict[name].encode("utf-8"),
            'goodness': char_goodness_dict[name],
        })

# Dump every article's wikitext to its own file.
article_dir = "./Wookiepediafiles/"
if not os.path.isdir(article_dir):
    os.makedirs(article_dir)
for name, text in charDict.items():
    fileName = name.replace(" ", "_").split("/")[0]
    # `with` closes the file even if encoding or writing fails.
    with open(article_dir + fileName + ".txt", "w+") as f:
        f.write(text.encode("utf-8"))


## Movie transcripts

In [8]:
# The script is expected inside the first <pre>...</pre> element of the page.
regex = r"\<pre\>((.|\s)*?)\<\/pre\>"
cwd = os.getcwd()
# Build absolute output paths instead of os.chdir(): a failure half-way
# through then cannot leave the process in the wrong working directory.
scripts_dir = os.path.join(cwd, "Scripts")
movietitles = ["A-New-Hope", "Attack-of-the-Clones", "Return-of-the-Jedi", "Revenge-of-the-Sith", "The-Empire-Strikes-Back", "The-Force-Awakens", "The-Phantom-Menace"]
for title in movietitles:
    response = urllib2.urlopen("https://www.imsdb.com/scripts/Star-Wars-" + title + ".html")
    try:
        source = response.read()
    finally:
        # urllib2 responses are not context managers, so close explicitly.
        response.close()

    # Only the first match is used, so search() avoids scanning the whole page
    # for further matches.
    match = re.search(regex, source)
    if match is None:
        # Fallback for pages without a <pre> element: take everything from
        # "STAR WARS EPISODE" up to " END TITLES".
        reg = r"(STAR WARS EPISODE ((.|\s)*?) END TITLES)"
        match = re.search(reg, source)
    if match is None:
        raise ValueError("No script text found on the IMSDb page for %s" % title)

    script = match.group(1)
    for tag in ("<b>", "<br>", "</b>", "</br>"):
        script = script.replace(tag, "")

    with open(os.path.join(scripts_dir, title + ".txt"), "w+") as f:
        # Bytes that cannot be decoded with the default codec are dropped.
        f.write(unicode(script, errors='ignore'))


## The Last Jedi is not available on IMSDb

In [9]:
# Fetch one revision of the "Star_Wars_Episode_VIII:_The_Last_Jedi" page from
# transcripts.wikia.com and save the text before its first "[" as
# Scripts/The-Last-Jedi.txt.
baseurl = "http://transcripts.wikia.com/api.php?"
action = "action=query"
title = "titles=Star_Wars_Episode_VIII:_The_Last_Jedi"
content = "prop=revisions"
rvprop = "rvprop=timestamp|content"
dataformat = "format=json"
rvdir = "rvdir=older"  # sort revisions from newest to oldest
end = "rvend=2000-01-03T00:00:00Z"  # start of my time period
start = "rvstart=2019-01-03T00:00:00Z"  # end of my time period
limit = "rvlimit=1"  # consider only the first revision
query = "%s%s&%s&%s&%s&%s&%s&%s&%s&%s" % \
    (baseurl, action, title, content, rvprop, dataformat, rvdir, end, start, limit)

response = urllib2.urlopen(query)
try:
    wikisource = response.read()
finally:
    # urllib2 responses are not context managers, so close explicitly.
    response.close()

wikijson = json.loads(wikisource)
# Only one title was requested, so the single key is the page id.
wikiid = next(iter(wikijson["query"]["pages"]))
text = wikijson["query"]["pages"][wikiid]["revisions"][-1]["*"]
text = text.replace("<p style=\"text-align:center;\">", "")
text = text.replace("</p>", "")
# Keep only what precedes the first "[" in the page text.
texts = text.split("[")
text = texts[0]

# `cwd` is the notebook's working directory captured in the previous cell.
# Writing to an explicit path avoids os.chdir(), so a failure cannot leave the
# process in the wrong working directory.
with open(os.path.join(cwd, "Scripts", "The-Last-Jedi.txt"), "w+") as f:
    f.write(text.encode("utf-8"))
