In [None]:
#| default_exp scrape.images

# Getting images

> Our next step will be downloading an image of each Pokémon.

I found *pokemondb* to be an interesting source of information. We start by scraping the table that lists every Pokémon together with its stats, because each row also links to that Pokémon's own page: https://pokemondb.net/pokedex/all

In [None]:
#| export
import re
import requests
from bs4 import BeautifulSoup

import pandas as pd
from fastcore.foundation import L
from fastcore.xtras import Path
from fastcore.parallel import parallel
from fastprogress.fastprogress import progress_bar

First we'll obtain the webpage with `requests`:

In [None]:
# Fetch the full pokedex listing. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of letting the
# following cells parse an error page.
r = requests.get("https://pokemondb.net/pokedex/all", timeout=30)
r.raise_for_status()
r

<Response [200]>

Then, we can parse it with `BeautifulSoup`:

In [None]:
# Build the parse tree from the response body using Python's built-in HTML parser.
soup = BeautifulSoup(markup=r.text, features="html.parser")

First we will try to get the header of the table. It's inside a `<thead>`, so it should be easy to get:

In [None]:
# Column titles are the text of the <th> cells inside the page's first <thead>.
header = L([cell.text for cell in soup.find("thead").find_all("th")])
header

(#10) ['#','Name','Type','Total','HP','Attack','Defense','Sp. Atk','Sp. Def','Speed']

The information we want is inside a `<table>` with `id=pokedex`, so we can use `find()` to try and find it:

In [None]:
# Grab the <table id="pokedex"> element that holds one row per listed Pokémon.
pokedex = soup.find(name="table", attrs={"id": "pokedex"})

The links are located inside the `<td>` of class `cell-name`:

In [None]:
# Every name cell (<td class="cell-name">) wraps an <a> whose href is that Pokémon's page path.
links = L([td.a["href"] for td in pokedex.find_all("td", class_="cell-name")])
links

(#1190) ['/pokedex/bulbasaur','/pokedex/ivysaur','/pokedex/venusaur','/pokedex/venusaur','/pokedex/charmander','/pokedex/charmeleon','/pokedex/charizard','/pokedex/charizard','/pokedex/charizard','/pokedex/squirtle'...]

Because we don't want to have repeated data, we can keep only the unique elements:

In [None]:
# The table repeats some page links (the output above shows '/pokedex/venusaur' twice),
# so drop the duplicates to avoid fetching the same page more than once.
links = links.unique()
links

(#1008) ['/pokedex/bulbasaur','/pokedex/ivysaur','/pokedex/venusaur','/pokedex/charmander','/pokedex/charmeleon','/pokedex/charizard','/pokedex/squirtle','/pokedex/wartortle','/pokedex/blastoise','/pokedex/caterpie'...]

Now we can iterate over the links to get all the images:

In [None]:
#| export
# Site root that the relative page links from the pokedex table are appended to.
home = "https://pokemondb.net"
# Folder the downloaded images are written to (relative to the working directory).
path_data = Path("../../Data/Images/Base")

In [None]:
#| export
def download_img(link):
    """Download the image linked from one Pokémon's pokemondb page.

    Parameters
    ----------
    link : str
        Site-relative page path as found in the pokedex table,
        e.g. ``"/pokedex/bulbasaur"``.

    Returns
    -------
    bool
        ``True`` if the image was written into ``path_data``; ``False`` if the
        page or the image could not be fetched, or the page has no
        ``<a rel="lightbox">`` link to take the image URL from.
    """
    # `link` already starts with "/", so trim the separators to avoid "//" in the URL.
    route = f"{home.rstrip('/')}/{link.lstrip('/')}"
    try:
        page = requests.get(route, timeout=30)
        page.raise_for_status()
    except requests.RequestException:
        return False

    anchor = BeautifulSoup(page.text, "html.parser").find("a", attrs={"rel": "lightbox"})
    img_link = anchor.get("href") if anchor is not None else None
    if not img_link:
        return False

    # Fetch the image before touching the disk so a failed request never leaves
    # an empty or error-page file behind.
    try:
        img = requests.get(img_link, timeout=30)
        img.raise_for_status()
    except requests.RequestException:
        return False

    file_name = img_link.split("/")[-1]
    path_data.mkdir(parents=True, exist_ok=True)
    with open(path_data / file_name, "wb") as f:
        f.write(img.content)
    return True

We can parallelize the process of downloading the images:

In [None]:
# Run download_img over every link through fastcore's `parallel`, showing a progress bar.
# The result holds one True/False entry per link, as returned by download_img.
downloads = parallel(download_img, links, progress=True)
downloads

                                                                           

(#1008) [True,True,True,True,True,True,True,True,True,True...]

In [None]:
# Count how many links reported a successful download.
sum(1 for ok in downloads if ok)

922

We see that not all the images could be downloaded, but we got a good bunch of them. This could require some further investigation but, as of now, we're happy with what we have.