## Guide to Basic Scraping: Getting National Pokemon Dex Info
Based on code by Sir Briane Paul V. Samson (https://brianesamson.com/).<br>
Done as an activity for Data Science at DLSU (DATASCI).

In [None]:
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

### Get the HTML of the page you want to scrape

In [None]:
URL = 'https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number'

# A timeout keeps the cell from hanging forever if the server never answers,
# and raise_for_status() stops us from parsing an HTTP error page as if it
# were the Pokedex listing.
response = requests.get(URL, timeout=30)
response.raise_for_status()
html = response.content

# Preview only the beginning of the raw bytes instead of dumping the whole page.
html[:500]

In [None]:
# Parse the downloaded bytes using the 'html.parser' backend.
soup = BeautifulSoup(markup=html, features='html.parser')
soup

### Find the content you want to get
Looking at the actual page, we see that the Pokemon data is stored in tables, so we find all the `table` tags (BeautifulSoup returns them as a list).

In [None]:
# Look up the element whose id is 'mw-content-text', then collect every
# <table> tag found inside it.
content = soup.find(attrs={'id': 'mw-content-text'})
poke_tables = content.find_all(name='table')
poke_tables

If we pretend we can't actually look at the page, we can check the contents of the tables in the list of tables we got.

In [None]:
def combine_whitespace(string):
    """Collapse every run of whitespace in ``string`` into a single space.

    Leading and trailing whitespace is removed as well, because
    ``str.split()`` with no arguments discards empty fields.

    Parameters
    ----------
    string : str or None
        Text to normalise.

    Returns
    -------
    str or None
        The normalised text, or ``None`` when ``string`` is ``None``.
    """
    # The previous guard was `if not None`, which is always true, so a None
    # argument crashed on `.split()` instead of reaching the None branch.
    if string is None:
        return None
    return ' '.join(string.split())
    
# Print the first child node of every table, squeezed onto a single line,
# so the tables can be told apart at a glance.
for table_no, table in enumerate(poke_tables):
    first_child = combine_whitespace(str(table.contents[0]))
    print(f'Table {table_no:}:\n {first_child}')

In [None]:
chars = 500
# Peek at the first `chars` characters of text in each table's second child node.
for table_no, table in enumerate(poke_tables):
    text = table.contents[1].get_text()[:chars]
    print(f'Table {table_no:}:\n {combine_whitespace(text)}')

We find that we only really care about tables 1 to 9, and all the other tables we can just ignore.

### Getting the Actual Data
Let's test on the first table (Generation 1) so we can use that for all the generations later.

In [None]:
info = poke_tables[1].contents
# Dump every child node of the table together with its position in the list.
for idx, node in enumerate(info):
    flattened = combine_whitespace(str(node))
    print(f'Index {idx}:\n{flattened}\n')

The output of the previous cell shows that we can ignore the even indices, so let's filter them out and look only at the odd ones.

In [None]:
# Visit only the odd positions (1, 3, 5, ...) and print each node unmodified.
for idx, node in enumerate(info):
    if idx % 2 == 0:
        continue
    print(f'Index {idx}:\n{node}\n')

Aside from index 1, all the remaining odd indices contain the data we actually need, so we skip index 1 and start from index 3. Let's now test, on just Bulbasaur, how to get the information for each Pokemon.

In [None]:
bulb = info[3].contents
# List each child node of the row at index 3 with its position.
for idx, cell in enumerate(bulb):
    one_line = combine_whitespace(str(cell))
    print(f'Index {idx}:\n {one_line}\n')

#### Data to be collected:
gen dex (kdex) = index 1's text<br>
national dex (ndex) = index 3's text<br>
name = index 7's text<br>
types = index 9 and optionally 11's text<br>
URL = index 7's href attribute<br>

In [None]:
# Grab the first <a> tag inside cell 7 and read its href attribute.
name_link = bulb[7].find('a')
name_link['href']

In [None]:
# Cells 1, 3, 7, 9 and 11 are read, in that order, as the kdex, ndex, name,
# first type and second type; each cell's text is stripped of outer whitespace.
bulb_kdex, bulb_ndex, bulb_name, bulb_type1, bulb_type2 = (
    bulb[pos].text.strip() for pos in (1, 3, 7, 9, 11)
)
# The URL comes from the href of the first link in the name cell.
bulb_URL = bulb[7].find('a')['href']

bulb_json = dict(
    kdex=bulb_kdex,
    ndex=bulb_ndex,
    name=bulb_name,
    type1=bulb_type1,
    type2=bulb_type2,
    URL=bulb_URL,
)

Getting the info for each Pokemon in Gen 1:

In [None]:
# Keep the site root under its own name so the page address stored in `URL`
# is not overwritten with something that means a different thing.
BASE_URL = 'https://bulbapedia.bulbagarden.net/'

gen1_json = []
# Index 1 is skipped and even indices are ignored, so the rows that are
# parsed sit at positions 3, 5, 7, ...
for row in info[3::2]:
    pokemon = row.contents

    # Cell 9 is read unconditionally as the first type; cell 11 is read as a
    # second type only when the row is long enough to contain it.
    types = [pokemon[9].text.strip()]
    if len(pokemon) > 11:
        types.append(pokemon[11].text.strip())

    gen1_json.append({
        'kdex': pokemon[1].text.strip(),
        'ndex': pokemon[3].text.strip(),
        'name': pokemon[7].text.strip(),
        'types': types,
        # urljoin avoids the doubled '/' that plain string concatenation
        # produces when the href already starts with a slash.
        'URL': urljoin(BASE_URL, pokemon[7].find('a')['href']),
        'gen': 1,
    })

json.dumps(gen1_json)

Finally, we build the JSON for all the generations and write it out to a JSON file:

In [None]:
BASE_URL = 'https://bulbapedia.bulbagarden.net/'

# Indices of the tables to scrape; the table index doubles as the generation
# number stored on each record.
# NOTE(review): this covers tables 1-8, while the prose earlier in the
# notebook mentions tables 1 to 9 -- confirm the intended upper bound.
GENERATION_TABLES = range(1, 9)


def parse_pokemon_row(cells, gen, base_url=BASE_URL):
    """Build one Pokemon record from the child nodes of a table row.

    Parameters
    ----------
    cells : list
        Child nodes of the row. Positions 1, 3, 7 and 9 are read as the
        kdex, ndex, name and first type; position 11, when present, is read
        as the second type.
    gen : int
        Generation number to store on the record.
    base_url : str
        Site root that the row's link is resolved against.

    Returns
    -------
    dict
        Record with the keys 'kdex', 'ndex', 'name', 'types', 'URL', 'gen'.
    """
    types = [cells[9].text.strip()]
    if len(cells) > 11:
        types.append(cells[11].text.strip())

    return {
        'kdex': cells[1].text.strip(),
        'ndex': cells[3].text.strip(),
        'name': cells[7].text.strip(),
        'types': types,
        # urljoin avoids the doubled '/' that plain string concatenation
        # produces when the href already starts with a slash.
        'URL': urljoin(base_url, cells[7].find('a')['href']),
        'gen': gen,
    }


pokemon_json = []
for gen in GENERATION_TABLES:
    # A separate name keeps `info` (the Gen 1 rows that earlier cells rely on)
    # from being overwritten, so those cells still behave the same on re-run.
    table_rows = poke_tables[gen].contents
    # Index 1 is skipped and even indices are ignored, exactly as in the
    # Gen 1 walkthrough.
    for row in table_rows[3::2]:
        pokemon_json.append(parse_pokemon_row(row.contents, gen))

pokemon_json

In [None]:
# Write the scraped records to disk. The encoding is given explicitly so the
# file does not depend on the platform's default text encoding.
with open("pokemon.json", "w", encoding="utf-8") as outfile:
    json.dump(pokemon_json, outfile)