In [None]:
import os
import requests
import json
import re
import pandas as pd
import matplotlib.pyplot as plt
from os.path import join
from slugify import slugify
from bs4 import BeautifulSoup
from bs4.element import NavigableString

# Output directory for the files this notebook writes (scraped JSON and CSV exports).
base_dir = "tloz-scrape"
# exist_ok=True makes the cell idempotent on re-run and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(base_dir, exist_ok=True)

In [None]:
# Root of the wiki; page paths and relative hrefs are appended to it.
base_url = "http://zelda.wikia.com"
# URL of the wiki's "recurring characters" page.
# NOTE(review): the name `characters` is rebound to a list of dicts further down.
characters = base_url + "/wiki/The_Legend_of_Zelda_recurring_characters"
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails right here on an HTTP error instead of handing an
# error page to the parser.
index_response = requests.get(characters, timeout=30)
index_response.raise_for_status()
recurring_characters = BeautifulSoup(index_response.text, "lxml")

In [None]:
# Follow the first link of every <li> in the page body, read the <aside> infobox
# of the linked page, and store one dict per character. The collected list is
# written as JSON to <base_dir>/data.txt.
mw_content_text = recurring_characters.find('div', {"id": "mw-content-text"})
if mw_content_text is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError('No <div id="mw-content-text"> found in the index page')

attributes = set()  # raw (un-slugified) infobox labels seen so far
characters = []     # one dict per character page that has an infobox

# Labels whose value is a list of links, optionally grouped under <u> game headings.
per_game_labels = {"Homeland", "Race", "Hometown", "Location(s)"}
request_timeout = 30  # seconds; without it a stalled connection hangs the cell

for character_li in mw_content_text.find_all('li'):
    a = character_li.find('a')
    # A list item with no link, or a link with no href, cannot be followed.
    if a is None or not a.get('href'):
        continue
    character_response = requests.get(base_url + a['href'], timeout=request_timeout)
    character_page = BeautifulSoup(character_response.text, "lxml")
    aside = character_page.find('aside')
    if aside is None:
        # No infobox on the linked page: nothing to extract.
        continue
    character = {
        'name': a.text.strip()
    }
    # Only direct children of the infobox, so nested groups are not visited twice.
    for pi_data in aside.find_all('div', {'class': 'pi-item'}, recursive=False):
        heading = pi_data.find('h3')
        value = pi_data.find('div', {"class": "pi-data-value"})
        # Rows missing either the label or the value carry no usable attribute.
        if heading is None or value is None:
            continue
        label = heading.text.strip()
        key = slugify(label, separator="_")
        attributes.add(label)
        if label == "Appears in":
            # Each <i> element is taken as one entry.
            character[key] = [ap.text.strip() for ap in value.find_all('i')]
        elif label == "Title(s)":
            # Keep only text nodes (name is None), dropping the markup between them.
            character[key] = [ap.string.strip()
                              for ap in value.descendants
                              if ap.name is None]
        elif label in per_game_labels:
            entries = []
            game_name = ''
            for element in value.descendants:
                if element.name == "u":
                    # A <u> element names the game the following links belong to.
                    game_name = element.text.strip()
                elif element.name == "a":
                    if game_name == '':  # Only has one appearance
                        game_name = 'ORIGINAL'
                    entries.append({'game': game_name,
                                    'value': element.text.strip()})
            character[key] = entries
        else:
            character[key] = value.text.strip()
    characters.append(character)

with open(join(base_dir, 'data.txt'), 'w') as outfile:
    json.dump(characters, outfile, indent=4)

In [None]:
# Reload the scraped characters from disk so the cells below can run
# without repeating the scrape.
with open(join(base_dir, 'data.txt')) as json_data:
    characters = json.load(json_data)

# Union of the keys used by any character, minus the 'name' key.
attributes = set()
for c in characters:
    attributes.update(c.keys())
# discard() rather than remove(): an empty character list must not raise KeyError.
attributes.discard('name')
# Sorted so the printed listing is the same on every run (set order is not stable).
attributes = sorted(attributes)

print("Different attributes:", attributes)

In [None]:
# Matches "<game title>(<4-digit year>)": group 1 is the title, group 2 the year.
# Raw string so that \w, \s and \( reach the regex engine unchanged instead of
# triggering invalid-escape warnings; the compiled pattern is the same as before.
fa_regex = re.compile(r"([\w\s'&]+)\(([0-9]{4})\)")
different_games = set()
name, first_game, first_year, gender = [], [], [], []

for c in characters:
    # .get(): a character without a 'first_appearance' entry is skipped
    # instead of aborting the cell with KeyError.
    match = fa_regex.search(c.get('first_appearance', ''))
    if not match:
        continue
    game_title = match.group(1).strip()
    different_games.add(game_title)

    first_game.append(game_title)
    first_year.append(int(match.group(2)))
    name.append(c['name'])
    # Missing gender is recorded as an empty string so all columns stay aligned.
    gender.append(c.get('gender', ''))

# One row per character whose first appearance could be parsed.
characters_initial_df = pd.DataFrame({'name': name,
                                      'gender': gender,
                                      'first_game': first_game,
                                      'first_year': first_year
                                      })

In [None]:
# Write the table to <base_dir>/characters_initial.csv, then preview its first rows.
initial_csv_path = join(base_dir, 'characters_initial.csv')
characters_initial_df.to_csv(initial_csv_path)
characters_initial_df.head()