In [None]:
#| default_exp scrape.sprites

# Getting sprites

> Let's scrape some pokemon sprites and their shiny counterparts.

I found *pokemondb* to be an interesting source of information. Our first step is to scrape the table containing all the pokemon and their respective stats: https://pokemondb.net/pokedex/all

In [None]:
#| export
import re
import requests
from bs4 import BeautifulSoup

import pandas as pd
from fastcore.foundation import L
from fastcore.xtras import Path
from fastcore.parallel import parallel
from fastprogress.fastprogress import progress_bar

First we'll obtain the webpage with `requests`. To access the sprites page, we will have to add `#dex-sprites` to the specific pokemon url.

In [None]:
# Full pokedex table: one row per pokemon (and per alternate form).
pokedex_url = "https://pokemondb.net/pokedex/all"
r = requests.get(pokedex_url)
r

<Response [200]>

Then, we can parse it with `BeautifulSoup`:

In [None]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=r.text, features="html.parser")

First we will try to get the header of the table. It's inside a `<thead>`, so it should be easy to get:

In [None]:
# Collect the text of every <th> found in the first <thead> of the page.
header = L([cell.text for cell in soup.find("thead").find_all("th")])
header

(#10) ['#','Name','Type','Total','HP','Attack','Defense','Sp. Atk','Sp. Def','Speed']

The information we want is inside a `<table>` with `id=pokedex`, so we can use `find()` to try and find it:

In [None]:
# The stats table is the <table> element whose id is "pokedex".
pokedex = soup.find("table", attrs={"id": "pokedex"})

The links are located inside the `<td>` of class `cell-name`:

In [None]:
# Every name cell wraps an <a> pointing at the pokemon's page; the fragment
# targets the sprites section of that page.
name_cells = pokedex.find_all("td", class_="cell-name")
links = L([f'{cell.a["href"]}#dex-sprites' for cell in name_cells])
links

(#1190) ['/pokedex/bulbasaur#dex-sprites','/pokedex/ivysaur#dex-sprites','/pokedex/venusaur#dex-sprites','/pokedex/venusaur#dex-sprites','/pokedex/charmander#dex-sprites','/pokedex/charmeleon#dex-sprites','/pokedex/charizard#dex-sprites','/pokedex/charizard#dex-sprites','/pokedex/charizard#dex-sprites','/pokedex/squirtle#dex-sprites'...]

Because we don't want to have repeated data, we can keep only the unique elements:

In [None]:
# Some pages are linked from several table rows (e.g. venusaur and charizard
# above), so drop the repeated links before requesting each page.
links = links.unique()
links

(#1008) ['/pokedex/bulbasaur#dex-sprites','/pokedex/ivysaur#dex-sprites','/pokedex/venusaur#dex-sprites','/pokedex/charmander#dex-sprites','/pokedex/charmeleon#dex-sprites','/pokedex/charizard#dex-sprites','/pokedex/squirtle#dex-sprites','/pokedex/wartortle#dex-sprites','/pokedex/blastoise#dex-sprites','/pokedex/caterpie#dex-sprites'...]

Now we can iterate over the links to get all the images:

In [None]:
# Site root; the scraped links are site-relative and are appended to it.
home = "https://pokemondb.net"
# Root directory for the downloaded sprites; images are written into one
# sub-folder per sprites-table column.
path_data = Path("../../Data/Images/Sprites")

Inside this page, we are interested in the sprites per generation table, which can be found by the `<table class="data-table sprites-table sprites-history-table">` tag.

The **header** of the table (`<thead>`) tells us which generation we are looking at, and the column `Type` tells us whether we're looking at the normal or the shiny version. Keep in mind that not every generation's sprites are present.

Meanwhile, the **body** of the table contains two rows (`<tr>`) with the sprites. The first element of the row tells us if we're in the *Normal* or *Shiny* row.

In [None]:
#| exporti
def get_sprites(row, # Row of a sprites table.
                ):
    "Return the row's label (text of its first cell) followed by the link of every other cell, `None` where a cell has no link."
    cells = row.find_all("td")
    hrefs = []
    for cell in cells[1:]:
        anchor = cell.find("a")
        hrefs.append(None if anchor is None else anchor["href"])
    # The first cell carries the row label rather than a sprite.
    return L([cells[0].text] + hrefs)

We'll put everything inside a function so that we can use `parallel` to speed up the process:

In [None]:
#| export
def build_df_sprites(link, # Site-relative pokemon URL, e.g. `/pokedex/bulbasaur#dex-sprites`.
                     ):
    """Scrape one pokemon page and return its sprite links as a two-row DataFrame.

    The rows are the two rows of the page's sprites history table, indexed by a
    `(Pokemon, Type)` MultiIndex. The columns are the table's header labels with
    the leading "Type" column dropped. Each cell holds a sprite link, or `None`
    where the table has no link for that column.

    Raises `requests.HTTPError` if the page request returns an error status and
    `ValueError` if the page has no sprites history table.
    """
    pokemon_name = link.split("/")[-1].split("#")[0]
    # Scraped links already start with "/", so normalise the separator instead
    # of requesting "<home>//pokedex/...".
    route = f"{home.rstrip('/')}/{link.lstrip('/')}"
    r_ = requests.get(route)
    r_.raise_for_status()
    soup_ = BeautifulSoup(r_.text, "html.parser")
    sprites_table = soup_.find("table", attrs={"class": "data-table sprites-table sprites-history-table"})
    if sprites_table is None:
        raise ValueError(f"No sprites history table found at {route}")
    # Header labels are used verbatim as column names; the download step names
    # its output folders after them.
    sprites_header = [th.text for th in sprites_table.find("thead").find_all("th")]
    # The table body is expected to hold exactly two rows: normal, then shiny.
    normal_row, shiny_row = sprites_table.find("tbody").find_all("tr")
    sprites_normal, sprites_shiny = get_sprites(normal_row), get_sprites(shiny_row)
    # The first element of each row is its label, which becomes the second index level.
    index = pd.MultiIndex.from_tuples(
        [(pokemon_name, sprites_normal[0]), (pokemon_name, sprites_shiny[0])],
        names=["Pokemon", "Type"],
    )
    df = pd.DataFrame([sprites_normal, sprites_shiny], columns=sprites_header, index=index)
    # The label already lives in the index, so the "Type" column is redundant.
    return df.drop(columns="Type")

In [None]:
# Scrape every pokemon page in parallel and stack the per-pokemon frames row-wise.
dfs = pd.concat(parallel(build_df_sprites, links, progress=True), axis=0)
dfs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Generation 1,Generation 2,Generation 3,Generation 4,Generation 5,Generation 6,Generation 7,Generation 8,Generation 9
Pokemon,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
bulbasaur,Normal,https://img.pokemondb.net/sprites/red-blue/nor...,https://img.pokemondb.net/sprites/silver/norma...,https://img.pokemondb.net/sprites/ruby-sapphir...,https://img.pokemondb.net/sprites/diamond-pear...,https://img.pokemondb.net/sprites/black-white/...,https://img.pokemondb.net/sprites/x-y/normal/b...,,https://img.pokemondb.net/sprites/sword-shield...,
bulbasaur,Shiny,,https://img.pokemondb.net/sprites/silver/shiny...,https://img.pokemondb.net/sprites/ruby-sapphir...,https://img.pokemondb.net/sprites/diamond-pear...,https://img.pokemondb.net/sprites/black-white/...,https://img.pokemondb.net/sprites/x-y/shiny/bu...,,,
ivysaur,Normal,https://img.pokemondb.net/sprites/red-blue/nor...,https://img.pokemondb.net/sprites/silver/norma...,https://img.pokemondb.net/sprites/ruby-sapphir...,https://img.pokemondb.net/sprites/diamond-pear...,https://img.pokemondb.net/sprites/black-white/...,https://img.pokemondb.net/sprites/x-y/normal/i...,,https://img.pokemondb.net/sprites/sword-shield...,
ivysaur,Shiny,,https://img.pokemondb.net/sprites/silver/shiny...,https://img.pokemondb.net/sprites/ruby-sapphir...,https://img.pokemondb.net/sprites/diamond-pear...,https://img.pokemondb.net/sprites/black-white/...,https://img.pokemondb.net/sprites/x-y/shiny/iv...,,,
venusaur,Normal,https://img.pokemondb.net/sprites/red-blue/nor...,https://img.pokemondb.net/sprites/silver/norma...,https://img.pokemondb.net/sprites/ruby-sapphir...,https://img.pokemondb.net/sprites/diamond-pear...,https://img.pokemondb.net/sprites/black-white/...,https://img.pokemondb.net/sprites/x-y/normal/v...,,https://img.pokemondb.net/sprites/sword-shield...,


In [None]:
# Number of cells that actually hold a sprite link.
dfs.notna().sum().sum()

7228

Now what's left is downloading the data:

Now that we've figured almost everything out, we have to think about how we want to store the data.

There are two main possibilities that come to mind:

1. A folder per pokemon.
2. A folder per generation.

We're going to go with option 2 as of now.

In [None]:
def download_series(row):
    """Download every sprite referenced by one row of the sprites DataFrame.

    `row` is an `(index, Series)` pair as yielded by `DataFrame.iterrows()`.
    The index is a tuple of strings that is joined with "_" to form the file
    name, and the Series maps each column label to a sprite link. Missing
    values are skipped. Each image is saved as
    `path_data/<column label>/<joined index>.<extension of the link>`.

    Raises `requests.HTTPError` if a download returns an error status, so that
    an error page is never written to disk as if it were an image.
    """
    idx, sprites = row
    sprites = sprites.dropna()
    file_name_base = "_".join(idx)
    for folder, link in progress_bar(sprites.items(), total=len(sprites)):
        extension = link.split(".")[-1]
        img_path = path_data / folder / f"{file_name_base}.{extension}"
        # Download the image, failing loudly on 4xx/5xx responses.
        response = requests.get(link)
        response.raise_for_status()
        # Save the image, creating the column's folder on first use.
        img_path.parent.mkdir(exist_ok=True, parents=True)
        img_path.write_bytes(response.content)

In [None]:
# Each work item is an (index, Series) pair from `iterrows()`, which is a plain
# iterator without a length, so the row count is passed explicitly as `total`.
results = parallel(download_series, dfs.iterrows(), progress=True, total=len(dfs))

Let's see how many sprites we downloaded:

In [None]:
# Count the entries exactly one level below the per-column folders.
sum(1 for _ in path_data.glob("*/*"))

7228