# PRELIMINARIES

In [1]:
import bs4 as bs
import urllib.request
import pickle
import pandas as pd
import numpy as np
# Website to scrape: Serebii's National Pokedex listing.
NATIONAL_DEX_URL = 'https://www.serebii.net/pokemon/nationalpokedex.shtml'

# Read the raw page bytes inside a context manager so the HTTP response
# (and its underlying socket) is closed as soon as the download finishes.
with urllib.request.urlopen(NATIONAL_DEX_URL) as response:
    sauce = response.read()

We use the Beautiful Soup package (bs4) for our web scraping.

In [2]:
# Parse the downloaded page bytes into a navigable tree using the lxml parser.
soup = bs.BeautifulSoup(markup=sauce, features='lxml')

# SCRAPE NATIONALDEX FROM SEREBII.NET

By examining the source webpage we can see that there is just one table on the whole page, which makes our life easier (note that for scraping moves we will need to deal with multiple tables). So we can skip collecting every table on the page and instead directly collect the table data stored in \<td\> tags in the HTML. Using len() we can see how many table cells we will be scraping.

In [3]:
# Grab the first <table> element on the page, then every <td> cell under it.
table = soup.find(name='table')
table_data = table.find_all(name='td')

# Number of cells collected (shown as the cell output).
len(table_data)

10872

Again, from examining the page we will scrape, we know the table has 12 columns and we want to scrape 10 of them. The pokemon image and a deceiving but empty column are ignored. We can loop through the list of table_data and separate the data into different lists.

In [4]:
# One output list per scraped column.
num, name, ability = [], [], []
hp, atk, spatk = [], [], []
defence, spdef, spd = [], [], []
typ = []

# table_data is a flat sequence of cells, read here as twelve cells per row.
# Map each in-row position to the list that collects it; positions 1 and 2
# are not collected.
text_columns = {
    0: num,
    3: name,
    6: hp,
    7: atk,
    8: defence,
    9: spatk,
    10: spdef,
    11: spd,
}
# For these positions the <a> elements inside the cell are kept rather than
# the cell text.
link_columns = {
    4: typ,
    5: ability,
}

for position, cell in enumerate(table_data):
    slot = position % 12
    if slot in text_columns:
        text_columns[slot].append(cell.text)
    elif slot in link_columns:
        link_columns[slot].append(cell.find_all('a'))

We now have several lists of data, and we create our dataframe using these lists as columns. As the data has been scraped directly from HTML, it may contain stray characters that we don't need, such as '\n', '\t', '\r' and '#', and we can remove these. We also remove the first row of the dataframe, as we want to be able to use the dataframe for numerical calculations and the first row would add string values to these columns.

In [5]:
# Assemble the frame in one go; dict order fixes the column order.
df = pd.DataFrame({
    'ID': num,
    'NAME': name,
    'HP': hp,
    'ATK': atk,
    'DEF': defence,
    'SPATK': spatk,
    'SPDEF': spdef,
    'SPD': spd,
    'TYPE': typ,
    'ABILITY': ability,
})

# Strip leftover whitespace control characters and the '#' prefix from the
# scraped strings, one pattern at a time.
for junk in ('\n', '\t', '\r', '#'):
    df = df.replace(junk, '', regex=True)

# Discard the first scraped row so only pokemon rows remain.
df = df.iloc[1:]
df

Unnamed: 0,ID,NAME,HP,ATK,DEF,SPATK,SPDEF,SPD,TYPE,ABILITY
1,001,Bulbasaur,45,49,49,65,65,45,"[[[]], [[]]]","[[Overgrow], [Chlorophyll]]"
2,002,Ivysaur,60,62,63,80,80,60,"[[[]], [[]]]","[[Overgrow], [Chlorophyll]]"
3,003,Venusaur,80,82,83,100,100,80,"[[[]], [[]]]","[[Overgrow], [Chlorophyll]]"
4,004,Charmander,39,52,43,60,50,65,[[[]]],"[[Blaze], [Solar Power]]"
5,005,Charmeleon,58,64,58,80,65,80,[[[]]],"[[Blaze], [Solar Power]]"
...,...,...,...,...,...,...,...,...,...,...
901,901,Ursaluna,130,140,105,45,80,50,"[[[]], [[]]]","[[Guts], [Bulletproof], [Unnerve]]"
902,902,Basculegion,120,112,65,80,75,78,"[[[]], [[]]]","[[Rattled], [Adaptability], [Mold Breaker]]"
903,903,Sneasler,80,130,60,40,80,120,"[[[]], [[]]]","[[Pressure], [Poison Touch]]"
904,904,Overqwil,85,115,95,65,65,85,"[[[]], [[]]]","[[Poison Point], [Swift Swim], [Intimidate]]"


# Separating Merged Columns

Currently the type and ability columns of our dataframe contain multiple values. We want to separate these into their own columns, and we can do that by using the .tolist() function and assigning several columns at once. We then drop the original merged columns of our dataframe, 'TYPE' and 'ABILITY'.

In [6]:
# Expand the per-row lists into one column per entry: types into two columns,
# abilities into three. Rows with fewer entries are padded with missing values.
type_columns = pd.DataFrame(df['TYPE'].tolist(), index=df.index)
ability_columns = pd.DataFrame(df['ABILITY'].tolist(), index=df.index)

df[['TYPE1', 'TYPE2']] = type_columns
df[['ABILITY1', 'ABILITY2', 'ABILITY3']] = ability_columns

# The merged source columns are no longer needed.
df = df.drop(columns=['ABILITY', 'TYPE'])
df.head(5)

Unnamed: 0,ID,NAME,HP,ATK,DEF,SPATK,SPDEF,SPD,TYPE1,TYPE2,ABILITY1,ABILITY2,ABILITY3
1,1,Bulbasaur,45,49,49,65,65,45,[[]],[[]],[Overgrow],[Chlorophyll],
2,2,Ivysaur,60,62,63,80,80,60,[[]],[[]],[Overgrow],[Chlorophyll],
3,3,Venusaur,80,82,83,100,100,80,[[]],[[]],[Overgrow],[Chlorophyll],
4,4,Charmander,39,52,43,60,50,65,[[]],,[Blaze],[Solar Power],
5,5,Charmeleon,58,64,58,80,65,80,[[]],,[Blaze],[Solar Power],


# Convert column types (bool, int, str, etc).

In [7]:
# Inspect the dtype of every column before any conversion is applied.
df.dtypes

ID          object
NAME        object
HP          object
ATK         object
DEF         object
SPATK       object
SPDEF       object
SPD         object
TYPE1       object
TYPE2       object
ABILITY1    object
ABILITY2    object
ABILITY3    object
dtype: object

Now that we have our dataframe, we probably want to convert its columns out of the object type and into types such as int64 so that we can use comparison operators (>, <, >=, <=, ==, etc.) on them. We also replace the remaining HTML tags still hanging around using regular expressions.

In [8]:
# Convert columns to different types.

# Base stats: scraped text -> str -> int.
for stat in ('HP', 'ATK', 'DEF', 'SPATK', 'SPDEF', 'SPD'):
    df[stat] = df[stat].values.astype(str).astype(int)

df['NAME'] = df['NAME'].values.astype(str)
df['ID'] = df['ID'].values.astype(str).astype(int)

# Abilities: stringify the scraped element and remove html tags
# (assuming no '<' or '>' characters inside the text itself).
# NOTE(review): rows without a value in a slot appear to end up holding the
# literal string 'None' after astype(str) -- confirm this is intended.
for slot in ('ABILITY1', 'ABILITY2', 'ABILITY3'):
    df[slot] = df[slot].astype(str).replace('<[^>]*>','', regex=True)

# Types: strip the link prefix, then drop everything from the first remaining
# double quote onwards, which leaves the text that followed the prefix.
for slot in ('TYPE1', 'TYPE2'):
    df[slot] = (
        df[slot]
        .astype(str)
        .replace('<a href="/pokemon/type/','',regex=True)
        .replace('".*','',regex=True)
    )

In [9]:
# Re-check the column dtypes now that the casts have been applied.
df.dtypes

ID           int64
NAME        object
HP           int64
ATK          int64
DEF          int64
SPATK        int64
SPDEF        int64
SPD          int64
TYPE1       object
TYPE2       object
ABILITY1    object
ABILITY2    object
ABILITY3    object
dtype: object

In [10]:
# Display the dataframe after type conversion and tag removal.
df

Unnamed: 0,ID,NAME,HP,ATK,DEF,SPATK,SPDEF,SPD,TYPE1,TYPE2,ABILITY1,ABILITY2,ABILITY3
1,1,Bulbasaur,45,49,49,65,65,45,grass,poison,Overgrow,Chlorophyll,
2,2,Ivysaur,60,62,63,80,80,60,grass,poison,Overgrow,Chlorophyll,
3,3,Venusaur,80,82,83,100,100,80,grass,poison,Overgrow,Chlorophyll,
4,4,Charmander,39,52,43,60,50,65,fire,,Blaze,Solar Power,
5,5,Charmeleon,58,64,58,80,65,80,fire,,Blaze,Solar Power,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
901,901,Ursaluna,130,140,105,45,80,50,ground,normal,Guts,Bulletproof,Unnerve
902,902,Basculegion,120,112,65,80,75,78,water,ghost,Rattled,Adaptability,Mold Breaker
903,903,Sneasler,80,130,60,40,80,120,fighting,poison,Pressure,Poison Touch,
904,904,Overqwil,85,115,95,65,65,85,dark,poison,Poison Point,Swift Swim,Intimidate


# Additional Columns

Now that our dataframe is mostly complete, we can add some useful columns of our own. One of these is 'TOTAL', the sum of all of a pokemon's base stats.

In [11]:
# Base stat total: row-wise sum of the six stat columns.
df['TOTAL'] = df[['HP', 'ATK', 'DEF', 'SPATK', 'SPDEF', 'SPD']].sum(axis=1)

Another useful column is the generation of the pokemon. Using external sources we can see how many pokemon there are in each generation and create a loop to assign pokemon to the correct generation.

In [12]:
# Highest National Dex number in each generation (taken from external sources).
gen1 = 151
gen2 = 251
gen3 = 386
gen4 = 493
gen5 = 649
gen6 = 721
gen7 = 809
gen8 = 905
generation_last_id = [gen1, gen2, gen3, gen4, gen5, gen6, gen7, gen8]

# Derive each row's generation from its own ID instead of from a counter that
# assumes exactly 905 rows in dex order. This keeps the cell working if the
# scraped table gains or loses rows.
# searchsorted(..., side='left') returns the index of the first bound >= ID,
# so IDs 1-151 -> 0, 152-251 -> 1, and so on; adding 1 gives the generation.
# IDs above the last bound (905) fall into the next generation number, 9.
generation = (
    np.searchsorted(generation_last_id, df['ID'].to_numpy(), side='left') + 1
).tolist()
df['GENERATION'] = generation

Finally, it may be useful to consider legendary pokemon separately from normal pokemon, since they often dominate when it comes to base stats: they are usually the strongest pokemon in the game, as well as harder to find, with usually just one available per game.

In [13]:
# Inclusive National Dex ID ranges that are flagged as legendary.
legendary_id_ranges = [
    (144, 146),
    (150, 151),
    (243, 245),
    (249, 251),
    (377, 386),
    (480, 494),
    (638, 649),
    (716, 721),
    (772, 773),
    (785, 809),
    (888, 898),
    (905, 905),
]

# Flag each row from its own ID rather than from a counter that assumes
# exactly 905 rows in dex order, so the cell still runs if the row count
# changes. IDs outside every range are marked False.
legendary = [
    any(first <= dex_id <= last for first, last in legendary_id_ranges)
    for dex_id in df['ID']
]
df['LEGENDARY'] = legendary

We can store this dataframe using pickle. This downloaded version can then be kept if the website updates in the future.

In [14]:
# Persist the finished dataframe to disk so this snapshot survives later
# changes to the website.
with open('uptogen8pokemon.pickle', 'wb') as pickle_file:
    pickle.dump(df, pickle_file)

In [15]:
# Display the final dataframe, including the TOTAL, GENERATION and LEGENDARY columns.
df

Unnamed: 0,ID,NAME,HP,ATK,DEF,SPATK,SPDEF,SPD,TYPE1,TYPE2,ABILITY1,ABILITY2,ABILITY3,TOTAL,GENERATION,LEGENDARY
1,1,Bulbasaur,45,49,49,65,65,45,grass,poison,Overgrow,Chlorophyll,,318,1,False
2,2,Ivysaur,60,62,63,80,80,60,grass,poison,Overgrow,Chlorophyll,,405,1,False
3,3,Venusaur,80,82,83,100,100,80,grass,poison,Overgrow,Chlorophyll,,525,1,False
4,4,Charmander,39,52,43,60,50,65,fire,,Blaze,Solar Power,,309,1,False
5,5,Charmeleon,58,64,58,80,65,80,fire,,Blaze,Solar Power,,405,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
901,901,Ursaluna,130,140,105,45,80,50,ground,normal,Guts,Bulletproof,Unnerve,550,8,False
902,902,Basculegion,120,112,65,80,75,78,water,ghost,Rattled,Adaptability,Mold Breaker,530,8,False
903,903,Sneasler,80,130,60,40,80,120,fighting,poison,Pressure,Poison Touch,,510,8,False
904,904,Overqwil,85,115,95,65,65,85,dark,poison,Poison Point,Swift Swim,Intimidate,510,8,False
