# Natalia Palej A00279259
Software Design with Artificial Intelligence for Cloud Computing 
Year 4

# Part 1: Data Collection Through Web Scraping

## Make Necessary Imports

In [27]:
# Needed to make HTTP request
import requests
# Expression for text manipultion
import re
# Needed to add delay between requests 
import time
import json
# BeautifulSoup needed for parsing HTML content
from bs4 import BeautifulSoup

## Function to Extract Specific HTML Element

That function takes CSS selector as input and returns the text of the first element that matches it.
If no matching found, it returns 'N/A'.

In [28]:
def get_text(selector):
    element = pokemon_soup.select_one(selector)
    return element.text.strip() if element else 'N/A'

## Make Request To Pokemon Page

In [29]:
url = 'https://pokemondb.net/pokedex/all'

In [30]:
print("Starting Data Scraping for...", url)

Starting Data Scraping for... https://pokemondb.net/pokedex/all


In [31]:
# Request to fetch the webpage
response = requests.get(url)
# Raise error if server responds with error (eg 500)
response.raise_for_status()

# Parse the content with BeautifulSoup
pokemon_all = BeautifulSoup(response.content, 'html.parser')

## Data Extraction

Find all Pokemon links by looking for elements within table rows 

In [32]:
pokemon_links = []
for i in pokemon_all.select('table#pokedex tbody tr a[href^="/pokedex/"]'):
    pokemon_links.append(i['href'])

## Verify Links Were Extracted Properly

In [33]:
print("\nPrinting first 10 pokemon_links...")
index = 0
for link in pokemon_links[:10]:
    index += 1
    print(index, ":", link)


Printing first 10 pokemon_links...
1 : /pokedex/bulbasaur
2 : /pokedex/ivysaur
3 : /pokedex/venusaur
4 : /pokedex/venusaur
5 : /pokedex/charmander
6 : /pokedex/charmeleon
7 : /pokedex/charizard
8 : /pokedex/charizard
9 : /pokedex/charizard
10 : /pokedex/squirtle


## Individual Pokemon Scraping

Stored data for each Pokemon in dictionary 

In [39]:
pokemon_data = {}

### Limit Scraping to 500 Pokemons

Set the limit to scrape max 500 Pokemons (it was taking too long to scrape all)

In [40]:
max_pokemons = 500
print()




### Loop Through Each Link To Scrape Individual Pokemon

Comments added within the below code block. There is NOTE for attribute "move"

In [35]:
for link in pokemon_links[:max_pokemons]:
    # Wait for 5 seconds between requests
    time.sleep(5)
    pokemon_link = 'https://pokemondb.net' + link
    response = requests.get(pokemon_link)
    response.raise_for_status()
    pokemon_soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the Data
    # Pokedex Data
    name = pokemon_soup.find('h1').text
    national_no = get_text('table.vitals-table tr:-soup-contains("National №") td')
    
    # Scraping types
    type_elements = pokemon_soup.select('table.vitals-table tr:-soup-contains("Type") td a')
    types = [type_element.get_text(strip=True).lower() for type_element in type_elements]
    species = get_text('table.vitals-table tr:-soup-contains("Species") td')
    height = get_text('table.vitals-table tr:-soup-contains("Height") td').replace('\xa0', ' ').strip()
    weight = get_text('table.vitals-table tr:-soup-contains("Weight") td').replace('\xa0', ' ').strip()

    # Modify the extraction for abilities
    abilities = get_text('table.vitals-table tr:-soup-contains("Abilities") td')
    # Remove numbers and hidden text
    abilities_cleaned = re.sub(r'\d+\.\s*|\s*\(hidden ability\)', '', abilities)
    # Add comma before capital letter
    abilities_cleaned = re.sub(r'(?<=\w)(?=[A-Z])', ', ', abilities_cleaned).strip()
    
    # Training
    ev_yield = get_text('table.vitals-table tr:-soup-contains("EV yield") td')
    catch_rate = get_text('table.vitals-table tr:-soup-contains("Catch rate") td')
    base_friendship = get_text('table.vitals-table tr:-soup-contains("Base") td')
    base_exp = get_text('table.vitals-table tr:-soup-contains("Base Exp.") td')
    growth_rate = get_text('table.vitals-table tr:-soup-contains("Growth Rate") td')
    
    # Breeding
    egg_groups = get_text('table.vitals-table tr:-soup-contains("Egg Groups") td')
    gender = get_text('table.vitals-table tr:-soup-contains("Gender") td')
    egg_cycles = get_text('table.vitals-table tr:-soup-contains("Egg cycles") td').strip()
    
    # Base Stats
    hp = get_text('table.vitals-table tr:-soup-contains("HP") td.cell-num:nth-of-type(1)')
    min_hp = get_text('table.vitals-table tr:-soup-contains("HP") td.cell-num:nth-of-type(3)')
    max_hp = get_text('table.vitals-table tr:-soup-contains("HP") td.cell-num:nth-of-type(4)')
    
    attack = get_text('table.vitals-table tr:-soup-contains("Attack") td.cell-num:nth-of-type(1)')
    min_attack = get_text('table.vitals-table tr:-soup-contains("Attack") td.cell-num:nth-of-type(3)')
    max_attack = get_text('table.vitals-table tr:-soup-contains("Attack") td.cell-num:nth-of-type(4)')

    defense = get_text('table.vitals-table tr:-soup-contains("Defense") td.cell-num:nth-of-type(1)')
    min_defense = get_text('table.vitals-table tr:-soup-contains("Defense") td.cell-num:nth-of-type(3)')
    max_defense = get_text('table.vitals-table tr:-soup-contains("Defense") td.cell-num:nth-of-type(4)')
    
    sp_atk = get_text('table.vitals-table tr:-soup-contains("Sp. Atk") td.cell-num:nth-of-type(1)')
    min_sp_atk = get_text('table.vitals-table tr:-soup-contains("Sp. Atk") td.cell-num:nth-of-type(3)')
    max_sp_atk = get_text('table.vitals-table tr:-soup-contains("Sp. Atk") td.cell-num:nth-of-type(4)')
    
    sp_def = get_text('table.vitals-table tr:-soup-contains("Sp. Def") td.cell-num:nth-of-type(1)')
    min_sp_def = get_text('table.vitals-table tr:-soup-contains("Sp. Def") td.cell-num:nth-of-type(3)')
    max_sp_def = get_text('table.vitals-table tr:-soup-contains("Sp. Def") td.cell-num:nth-of-type(4)')
    
    speed = get_text('table.vitals-table tr:-soup-contains("Speed") td.cell-num:nth-of-type(1)')
    min_speed = get_text('table.vitals-table tr:-soup-contains("Speed") td.cell-num:nth-of-type(3)')
    max_speed = get_text('table.vitals-table tr:-soup-contains("Speed") td.cell-num:nth-of-type(4)')
    
    total = get_text('table.vitals-table tfoot tr th:-soup-contains("Total") + td.cell-num.cell-total')
    
    # Evo Chart
    evo_elements = pokemon_soup.select('div.infocard-list-evo div.infocard a.ent-name')
    evo_path = [evo_element.get_text(strip=True) for evo_element in evo_elements]

    ######## NOTE ########
    ### Im removing "moves" from scraped data purposely, as this field is quite excessive. It makes the rest of my assignment messy. ###
    ### To see that scraping for "moves" works, please uncomment the lines "move_elements" and "moves" 
    ### "moves" also needs to be uncommented in pokemon_data
    ######## NOTE ########
    # Moves
    # move_elements = pokemon_soup.select('table.data-table tbody tr td.cell-name a.ent-name')
    # moves = [move_element.get_text(strip=True) for move_element in move_elements]
    
    # Pokemon Image
    img_tag = pokemon_soup.select_one('div.grid-col.span-md-6.span-lg-4.text-center img')
    if img_tag:
        pokemon_img_url = img_tag['src']
    else:
        pokemon_img_url = None

    # Store collected data in dic 
    pokemon_data[name.lower()] = {
        "name": name.lower(),
        "national_no": national_no,
        "types": types,
        "species": species,
        "height": height,
        "weight": weight,
        "abilities": abilities_cleaned,
        "ev_yield": ev_yield,
        "catch_rate": catch_rate,
        "base_friendship": base_friendship,
        "base_exp": base_exp,
        "growth_rate": growth_rate,
        "egg_groups": egg_groups,
        "gender": gender,
        "egg_cycles": egg_cycles.replace("\t", ""),
        "hp": hp,
        "min_hp": min_hp,
        "max_hp": max_hp,
        "attack": attack,
        "min_attack": min_attack,
        "max_attack": max_attack,
        "defense": defense,
        "min_defense": min_defense,
        "max_defense": max_defense,
        "sp_atk": sp_atk,
        "min_sp_atk": min_sp_atk,
        "max_sp_atk": max_sp_atk,
        "sp_def": sp_def,
        "min_sp_def": min_sp_def,
        "max_sp_def": max_sp_def,
        "speed": speed,
        "min_speed": min_speed,
        "max_speed": max_speed,
        "total": total,
        "evo_path": evo_path,
        #"moves": moves,
        "pokemon_img_url": pokemon_img_url
    }
    print(f"Scraped data for: {name}")

print("\nData scraping completed successfully.")

Scraped data for: Bulbasaur
Scraped data for: Ivysaur
Scraped data for: Venusaur
Scraped data for: Venusaur
Scraped data for: Charmander
Scraped data for: Charmeleon
Scraped data for: Charizard
Scraped data for: Charizard
Scraped data for: Charizard
Scraped data for: Squirtle
Scraped data for: Wartortle
Scraped data for: Blastoise
Scraped data for: Blastoise
Scraped data for: Caterpie
Scraped data for: Metapod
Scraped data for: Butterfree
Scraped data for: Weedle
Scraped data for: Kakuna
Scraped data for: Beedrill
Scraped data for: Beedrill
Scraped data for: Pidgey
Scraped data for: Pidgeotto
Scraped data for: Pidgeot
Scraped data for: Pidgeot
Scraped data for: Rattata
Scraped data for: Rattata
Scraped data for: Raticate
Scraped data for: Raticate
Scraped data for: Spearow
Scraped data for: Fearow
Scraped data for: Ekans
Scraped data for: Arbok
Scraped data for: Pikachu
Scraped data for: Pikachu
Scraped data for: Raichu
Scraped data for: Raichu
Scraped data for: Sandshrew
Scraped data 

## Print First 3 Pokémon Data to Verify

In [38]:
print("\nVerifying first 3 Pokemons in JSON file...\n")
index = 0
for pokemon in list(pokemon_data.keys())[:3]:
    index += 1
    print(pokemon_data[pokemon]['name'], ":", pokemon_data[pokemon], "\n")


Verifying first 3 Pokemons in JSON file...

bulbasaur : {'name': 'bulbasaur', 'national_no': '0001', 'types': ['grass', 'poison'], 'species': 'Seed Pokémon', 'height': '0.7 m (2′04″)', 'weight': '6.9 kg (15.2 lbs)', 'abilities': 'Overgrow, Chlorophyll', 'ev_yield': '1 Sp. Atk', 'catch_rate': '45 (5.9% with PokéBall, full HP)', 'base_friendship': '50 (normal)', 'base_exp': '64', 'growth_rate': 'Medium Slow', 'egg_groups': 'Grass, Monster', 'gender': '87.5% male, 12.5% female', 'egg_cycles': '20(4,884–5,140 steps)', 'hp': '45', 'min_hp': '200', 'max_hp': '294', 'attack': '49', 'min_attack': '92', 'max_attack': '216', 'defense': '49', 'min_defense': '92', 'max_defense': '216', 'sp_atk': '65', 'min_sp_atk': '121', 'max_sp_atk': '251', 'sp_def': '65', 'min_sp_def': '121', 'max_sp_def': '251', 'speed': '45', 'min_speed': '85', 'max_speed': '207', 'total': '318', 'evo_path': ['Bulbasaur', 'Ivysaur', 'Venusaur'], 'pokemon_img_url': 'https://img.pokemondb.net/artwork/bulbasaur.jpg'} 

ivysaur 

# Save Data in JSON Format

In [37]:
file_name = "pokemon_data.json"
with open(file_name, 'w') as json_file:
    json.dump(pokemon_data, json_file, indent=4)

print('\n', file_name, 'file was successfully saved')


 pokemon_data.json file was successfully saved
