In [None]:
#| default_exp scrape

# Getting stats

> Our first step will be obtaining the stats of the Pokémon.

I found *pokemondb* to be an interesting source of information. Our first step is going to be trying to scrape this table containing all the pokemons and their respective stats: https://pokemondb.net/pokedex/all

In [None]:
#| export
import re
import requests
from bs4 import BeautifulSoup

import pandas as pd
from fastcore.foundation import L

First we'll obtain the webpage with `requests`:

In [None]:
# Download the full Pokédex page. The timeout keeps a stalled connection from
# hanging the kernel, and `raise_for_status()` stops the notebook on an HTTP
# error instead of letting later cells parse an error page.
r = requests.get("https://pokemondb.net/pokedex/all", timeout=30)
r.raise_for_status()
r

<Response [200]>

Then, we can parse it with `BeautifulSoup`:

In [None]:
# Build the parse tree from the response body using Python's built-in HTML parser.
soup = BeautifulSoup(r.text, features="html.parser")

First we will try to get the header of the table. It's inside a `<thead>`, so it should be easy to get:

In [None]:
# Grab the first <thead> on the page and collect the text of each <th> cell
# as the column names.
thead = soup.find("thead")
column_names = [th.text for th in thead.find_all("th")]
header = L(column_names)
header

(#10) ['#','Name','Type','Total','HP','Attack','Defense','Sp. Atk','Sp. Def','Speed']

The information we want is inside a `<table>` with `id=pokedex`, so we can use `find()` to try and find it:

In [None]:
# Locate the <table> element whose id attribute is "pokedex".
pokedex = soup.find("table", attrs={"id": "pokedex"})

The table's data lives in `<tr>` elements, one per row, each containing several `<td>` elements that hold the individual cells. For now we want to keep all the columns, so we will extract the text inside each of these elements (it would be interesting to obtain their icons as well, but we will try that later if we feel like it).

In [None]:
# Flatten the table: collect the text of every <td>, in document order,
# into a single list, then peek at the first four values.
td_tags = pokedex.find_all("td")
cells = L([td.text for td in td_tags])
cells[:4]

(#4) ['001','Bulbasaur','Grass Poison','318']

As of now, we have a *loong* list with all the data in the table, but if we want to transform it into a list of lists maintaining the table structure, we can build a new list of lists with a list comprehension:

In [None]:
# Rebuild the rows: slice the flat cell list into consecutive chunks,
# each as long as the header.
n_cols = len(header)
row_starts = range(0, len(cells), n_cols)
table = L([cells[start:start + n_cols] for start in row_starts])
table

(#1190) [['001', 'Bulbasaur', 'Grass Poison', '318', '45', '49', '49', '65', '65', '45'],['002', 'Ivysaur', 'Grass Poison', '405', '60', '62', '63', '80', '80', '60'],['003', 'Venusaur', 'Grass Poison', '525', '80', '82', '83', '100', '100', '80'],['003', 'Venusaur Mega Venusaur', 'Grass Poison', '625', '80', '100', '123', '122', '120', '80'],['004', 'Charmander', 'Fire ', '309', '39', '52', '43', '60', '50', '65'],['005', 'Charmeleon', 'Fire ', '405', '58', '64', '58', '80', '65', '80'],['006', 'Charizard', 'Fire Flying', '534', '78', '84', '78', '109', '85', '100'],['006', 'Charizard Mega Charizard X', 'Fire Dragon', '634', '78', '130', '111', '130', '85', '100'],['006', 'Charizard Mega Charizard Y', 'Fire Flying', '634', '78', '104', '78', '159', '115', '100'],['007', 'Squirtle', 'Water ', '314', '44', '48', '65', '50', '64', '43']...]

Finally we only have to turn this into a `DataFrame`:

In [None]:
# Wrap the list of rows in a DataFrame, labelling the columns with the scraped
# header. Note that this rebinds `pokedex` from the HTML table to the frame.
pokedex = pd.DataFrame(data=table, columns=header)
pokedex.head(5)

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Bulbasaur,Grass Poison,318,45,49,49,65,65,45
1,2,Ivysaur,Grass Poison,405,60,62,63,80,80,60
2,3,Venusaur,Grass Poison,525,80,82,83,100,100,80
3,3,Venusaur Mega Venusaur,Grass Poison,625,80,100,123,122,120,80
4,4,Charmander,Fire,309,39,52,43,60,50,65


Everything in our `DataFrame` is a string (because we scraped them just as plain text), so we need to turn the numbers into `int`. We will be removing the trailing spaces when only one `Type` is present with `.strip()`:

In [None]:
# Convert the id column and every column from `Total` onwards to integers.
# The cast goes through `DataFrame.astype` with a column -> dtype mapping because
# assigning through `.iloc[:, 3:] = ...` writes into the existing object columns
# in place on pandas >= 2.0, which would leave them with dtype `object`.
numeric_cols = ["#", *pokedex.columns[3:]]
pokedex = pokedex.astype({col: int for col in numeric_cols})

# Single-type Pokémon come with a trailing space (e.g. 'Fire '); remove it.
pokedex["Type"] = pokedex["Type"].str.strip()

In [None]:
# Check the dtypes after conversion: every column except `Name` and `Type`
# should now be reported as int64.
pokedex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   #        1190 non-null   int64 
 1   Name     1190 non-null   object
 2   Type     1190 non-null   object
 3   Total    1190 non-null   int64 
 4   HP       1190 non-null   int64 
 5   Attack   1190 non-null   int64 
 6   Defense  1190 non-null   int64 
 7   Sp. Atk  1190 non-null   int64 
 8   Sp. Def  1190 non-null   int64 
 9   Speed    1190 non-null   int64 
dtypes: int64(8), object(2)
memory usage: 93.1+ KB


Finally, to ensure that everything has been done correctly, we can perform some checks on the data:

In [None]:
# Sanity check: `Total` must equal the sum of the six individual stats
# (all columns from `HP` onwards) on every row.
stat_sum = pokedex.iloc[:, 4:].sum(axis=1)
assert pokedex.Total.eq(stat_sum).all()

In [None]:
# Sanity check: each Pokémon has at least one and at most two types,
# counted as space-separated tokens in `Type`.
types_per_pokemon = pokedex.Type.apply(lambda kinds: len(kinds.split(" ")))
assert types_per_pokemon.max() == 2
assert types_per_pokemon.min() == 1

Now that everything is checked and looks correct, we can save the file in `.csv` format:

In [None]:
#| notest
# Persist the cleaned table without the index column.
output_csv = "../../Data/pokedex.csv"
pokedex.to_csv(output_csv, index=False)