# **Creating Pokemon Data via Web Scraping**

by Lena Horsley

In [None]:
!conda install -c anaconda beautifulsoup4

/bin/bash: conda: command not found


In [None]:
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup

In [None]:
# Get the page
# To scrape another region, comment out the active URL and un-comment one of the alternatives.
#URL = "https://serebii.net/pokemongo/gen1pokemon.shtml"
#URL = "https://serebii.net/pokemongo/gen2pokemon.shtml"
#URL = "https://serebii.net/pokemongo/gen3pokemon.shtml"
#URL = "https://serebii.net/pokemongo/gen4pokemon.shtml"
#URL = "https://serebii.net/pokemongo/gen5pokemon.shtml"
#URL = "https://serebii.net/pokemongo/gen6pokemon.shtml"
#URL = "https://serebii.net/pokemongo/gen7pokemon.shtml"
#URL = "https://serebii.net/pokemongo/gen8pokemon.shtml"
#URL = "https://serebii.net/pokemongo/unknownpokemon.shtml"
#URL = "https://serebii.net/pokemongo/megaevolution.shtml"
URL = "https://serebii.net/pokemongo/hisuipokemon.shtml"

# requests applies no timeout by default, so an unresponsive server would hang the cell forever.
page = requests.get(URL, timeout=30)

# Stop on a 4xx/5xx answer so an error page is never parsed as if it held the pokemon tables.
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")


In [None]:
# The page source shows that the tables of interest carry the class "tab".
# Print the class attribute of every match to see how many such tables the page has.
print('Classes of each table:')
tab_tables = soup.find_all(class_="tab")
for tab_table in tab_tables:
    print(tab_table.get('class'))

Classes of each table:
['tab']
['tab']


In [None]:
# The pokemon data sits in the second "tab" table on the page, i.e. index 1.
# Un-comment the two prints below to inspect the table while debugging.
my_table = soup.find_all(attrs={"class": "tab"})[1]
#print("My table...")
#print(my_table)

In [None]:
# Collect everything that follows the first <tr> of the table of interest:
# each of its following siblings becomes one entry of the list.
raw_data_list = list(soup.find_all(class_="tab")[1].tr.next_siblings)

print("My list: ", len(raw_data_list))

My list:  18


In [None]:
# Inspecting the list shows entries that are not table rows (bare "\n" strings between the rows); these need to be removed.
#raw_data_list[4]

In [None]:
# Drop the bare newline characters. What remains is the list that contains our data!
clean_data_list = [entry for entry in raw_data_list if entry != "\n"]
len(clean_data_list)

9

In [None]:
def _href_token(link, position=3):
    """Return one token of a link's href split on non-word characters.

    Returns None when the link has no href or the href has too few tokens,
    so callers can tell "missing" apart from an empty token.
    """
    # Raw string: "\W" must reach the regex engine as a character class, not be
    # read as an (invalid) string escape.
    tokens = re.split(r'\W', link.get("href") or "")
    return tokens[position] if len(tokens) > position else None


def organize_pokemon_info(my_link_urls):
    """Organize the pokedex number, name, types and moves of one pokemon.

    Parameters
    ----------
    my_link_urls : sequence
        The link elements of one pokemon, in page order. Each element must
        offer ``.get("href")`` and ``.text``. The first link supplies the
        pokedex number (token 3 of its href), the second link supplies the
        name (its text). Every later link is a move when its href contains
        "moves.shtml" and a type (token 3 of its href) otherwise.

    Returns
    -------
    list
        ``[pokedex, pokemon_name, pokemon_type, pokemon_moves]``. Each type is
        added as ``" <type> "`` and each move as ``" <move>,"``. Values that
        cannot be read (fewer than two links, a link without href, an href
        with too few tokens) are left as empty strings instead of raising.
    """
    # Defaults keep the function total: without them a row with fewer than two
    # links failed with UnboundLocalError.
    pokedex = ""
    pokemon_name = ""
    type_parts = []
    move_parts = []

    for position, link in enumerate(my_link_urls):
        if position == 0:
            pokedex = _href_token(link) or ""
        elif position == 1:
            pokemon_name = link.text
        elif "moves.shtml" in (link.get("href") or ""):
            move_parts.append(" " + link.text + ",")
        else:
            my_type = _href_token(link)
            # Skip links whose href carries no type token at all.
            if my_type is not None:
                type_parts.append(" " + my_type + " ")

    return [pokedex, pokemon_name, "".join(type_parts), "".join(move_parts)]

In [None]:
def get_pokemon_region(my_pokemon_region):
    """Return the region name that belongs to a page name taken from the web link.

    This version of Python doesn't have a switch statement, so a dictionary
    does the dispatching. A page name that is not listed yields "unknown".
    """
    region_by_page = {
        "gen1pokemon": "Kanto",
        "gen2pokemon": "Johto",
        "gen3pokemon": "Hoenn",
        "gen4pokemon": "Sinnoh",
        "gen5pokemon": "Unova",
        "gen6pokemon": "Kalos",
        "gen7pokemon": "Alola",
        "gen8pokemon": "Galar",
        "unknownpokemon": "unknown",
        "megaevolution": "Mega-Evolution",
        "hisuipokemon": "Hisui",
    }

    # Known page: hand back its region. Anything else falls through to the default.
    if my_pokemon_region in region_by_page:
        return region_by_page[my_pokemon_region]
    return "unknown"

In [None]:
pokedex_list = []

# Split the URL on non-word characters. The raw string keeps "\W" a regex character
# class instead of an invalid string escape.
url_tokens = re.split(r'\W', URL)

# The page name is the token right before the trailing "shtml" extension. Counting from
# the end keeps this correct when the host part has a different number of tokens
# (e.g. "www.serebii.net"); a fixed index from the front would then give "unknown".
pokemon_region = get_pokemon_region(url_tokens[-2])

for pokemon_row in clean_data_list:
    # Get the links for the pokemon...the links contain the moves and types data
    link_urls = pokemon_row.find_all("a")

    # Now get the stats (columns in the web page)
    check_pokemon_stats = pokemon_row.find_all("tr")

    pokemon_info = organize_pokemon_info(link_urls)

    # Add the stats to the pokemon: skip the first nested row, then take the
    # text of the second cell of every remaining row
    for stat_row in check_pokemon_stats[1:]:
        pokemon_info.append(stat_row.find_all("td")[1].text)

    # Add the region
    pokemon_info.append(pokemon_region)

    # Add the pokemon to the pokedex
    pokedex_list.append(pokemon_info)

# Now, add the column names
pokedex_dataframe = pd.DataFrame(pokedex_list, columns =['num','name','poke_type','attacks','hp','attack','defense','max_cp','max_buddy_cp','region'])

In [None]:
# Display the assembled pokedex DataFrame.
pokedex_dataframe

Unnamed: 0,num,name,poke_type,attacks,hp,attack,defense,max_cp,max_buddy_cp,region
0,899,Wyrdeer,normal psychic,,230,206,145,3089,3126,Hisui
1,900,Kleavor,bug rock,,172,253,174,3557,3600,Hisui
2,901,Ursaluna,ground normal,,277,243,181,4358,4410,Hisui
3,902,Basculegion,water ghost,,260,217,144,3425,3466,Hisui
4,902,Basculegion,water ghost,,260,199,144,3159,3197,Hisui
5,903,Sneasler,fighting poison,,190,259,158,3643,3687,Hisui
6,904,Overqwil,dark poison,,198,222,171,3330,3370,Hisui
7,905,Enamorus,fairy flying,,179,281,162,3872,3919,Hisui
8,905,Enamorus,fairy flying,,179,250,201,3830,3876,Hisui


In [None]:
# Add type1, type2, boost1, boost2, and boost columns

# This works like a switch statement: each regex that matches a pokemon type
# is replaced by the weather boost listed for it.
replacements = {
  r'.*grass.*': 'sunny/clear',
  r'.*ground.*': 'sunny/clear',
  r'.*fire.*': 'sunny/clear',
  r'.*water.*': 'rain',
  r'.*electric.*': 'rain',
  r'.*bug.*': 'rain',
  r'.*normal.*': 'partly cloudy',
  r'.*rock.*': 'partly cloudy',
  r'.*fairy.*': 'cloudy',
  r'.*fight.*': 'cloudy',
  r'.*poison.*': 'cloudy',
  r'.*dragon.*': 'windy',
  r'.*flying.*': 'windy',
  r'.*psychic.*': 'windy',
  r'.*ice.*': 'snow',
  r'.*steel.*': 'snow',
  r'.*dark.*': 'fog',
  r'.*ghost.*': 'fog'
}


def _split_poke_type(poke_type):
    """Return ``(type1, type2)`` for a whitespace-separated type string.

    Two tokens give both types, one token gives ``(token, "")``, and any
    other number of tokens gives ``("", "")``.
    """
    type_tokens = poke_type.split()
    if len(type_tokens) == 2:
        return type_tokens[0], type_tokens[1]
    if len(type_tokens) == 1:
        return type_tokens[0], ""
    return "", ""


# Split poke_type into type1 and type2.
# Whole columns are assigned at once: writing single cells through chained indexing
# (df['type1'][index] = ...) raises SettingWithCopyWarning and does not reach the
# DataFrame when pandas Copy-on-Write is active.
split_types = [_split_poke_type(poke_type) for poke_type in pokedex_dataframe['poke_type']]
pokedex_dataframe['type1'] = [type1 for type1, _ in split_types]
pokedex_dataframe['type2'] = [type2 for _, type2 in split_types]

# Create boost1 from type1. Create boost2 from type2
pokedex_dataframe['boost1'] = pokedex_dataframe['type1'].replace(replacements, regex=True)
pokedex_dataframe['boost2'] = pokedex_dataframe['type2'].replace(replacements, regex=True)

# Create boost from boost1 and boost2
pokedex_dataframe['boost'] = pokedex_dataframe['boost1'] + " " + pokedex_dataframe['boost2']

# Save the result as "<region>.csv" without the row index
pokedex_dataframe.to_csv(pokemon_region + ".csv", index=False)

## References
* [Beautiful Soup: Build a Web Scraper With Python](https://realpython.com/beautiful-soup-web-scraper-python/)
* [A Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
* [Jupiter notebook and BeautifulSoup4 installation](https://stackoverflow.com/questions/54781462/jupiter-notebook-and-beautifulsoup4-installation)
* [List of Pokémon by National Pokédex number](https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number)
* [A Guide to Scraping HTML Tables with Pandas and BeautifulSoup](https://towardsdatascience.com/a-guide-to-scraping-html-tables-with-pandas-and-beautifulsoup-7fc24c331cf7)
* [Is there a simple way to delete a list element by value?](https://stackoverflow.com/questions/2793324/is-there-a-simple-way-to-delete-a-list-element-by-value)
* [Split string with multiple delimiters in Python [duplicate]](https://stackoverflow.com/questions/4998629/split-string-with-multiple-delimiters-in-python)
* [How to use multiple cases in Match (switch in other languages) cases in Python 3.10](https://stackoverflow.com/questions/69642889/how-to-use-multiple-cases-in-match-switch-in-other-languages-cases-in-python-3)
* [What is the syntactical equivalent to switch/case in Python? [duplicate]](https://stackoverflow.com/questions/66877130/what-is-the-syntactical-equivalent-to-switch-case-in-python)
* [Python: Check if String Contains Substring](https://stackabuse.com/python-check-if-string-contains-substring/)
* [How to add one row in an existing Pandas DataFrame?](https://www.geeksforgeeks.org/how-to-add-one-row-in-an-existing-pandas-dataframe/)
* [How to append a list as a row to a Pandas DataFrame in Python?](https://www.geeksforgeeks.org/how-to-append-a-list-as-a-row-to-a-pandas-dataframe-in-python/)
* [Create a Pandas DataFrame from Lists](https://www.geeksforgeeks.org/create-a-pandas-dataframe-from-lists/)
* [Assign value to a pandas dataframe column based on string condition](https://stackoverflow.com/questions/36701689/assign-value-to-a-pandas-dataframe-column-based-on-string-condition)