In [1]:
from urllib.parse import unquote

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Base URL of the wiki; the scraping functions in this notebook append a page name to it.
url = 'https://warcraft.wiki.gg/wiki/'

In [None]:
def get_all_subzones(url_zone, timeout=30):
    """Scrape the subzone names listed in a zone page's navbox.

    Fetches ``url + url_zone``, takes the first ``<table>`` whose class list
    contains ``navbox`` and, if that table belongs to the requested zone and
    is not a table of zones, collects the text of its list links.

    Parameters
    ----------
    url_zone : str
        Page name of the zone as used in the wiki URL; it is appended to the
        module-level ``url``.
    timeout : float, optional
        Seconds ``requests`` waits for the server before raising.

    Returns
    -------
    list
        The ``.string`` of every accepted link, in document order (an entry
        is ``None`` when a link does not wrap exactly one string). The list is
        empty when the page has no navbox table, the table has no link, the
        table's first link does not name ``url_zone`` (or ``"The_" +
        url_zone``), no Zone/Subzone title marker is present, or the marker
        is exactly ``'Zone'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(url + url_zone, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')

    table = next((tag for tag in page.find_all('table')
                  if 'navbox' in tag.get('class', [])), None)
    if table is None:
        return []

    # Check if the table's title corresponds with the expected pattern of 'Subzones of $zone_name'.
    # find_all(True) yields tags only, so text nodes (which carry no attributes) are never inspected.
    first_link = next((tag for tag in table.find_all(True) if 'href' in tag.attrs), None)
    if first_link is None:
        return []
    # The first 15 characters of the href are dropped before comparing.
    # NOTE(review): 15 is the length of a prefix such as '/wiki/Template:' -- confirm against the live markup.
    title_prefix_length = 15
    linked_page = first_link.attrs['href'][title_prefix_length:]
    if linked_page not in (url_zone, "The_" + url_zone):  # $zone_name doesn't correspond with the actual zone's name
        return []

    # First title attribute that mentions 'Subzone' or 'Zone'.
    zone_or_subzone = next(
        (tag.attrs['title'] for tag in table.find_all(True)
         if 'title' in tag.attrs
         and ('Subzone' in tag.attrs['title'] or 'Zone' in tag.attrs['title'])),
        None)
    if zone_or_subzone is None:  # Unrecognised table layout: nothing to classify
        return []
    if zone_or_subzone == 'Zone':  # The table consists of Zones rather than Subzones
        return []

    def first_grandchild(link):
        # First child of the first entry of the list that contains the link.
        # Assumes both that entry and its first child are tags -- TODO confirm.
        return link.parent.parent.contents[0].contents[0]

    def is_undisplayed(tag):
        return 'title' in tag.attrs and 'Undisplayed' in tag.attrs['title']

    # Keep links that sit directly in an <li>, whose element after the link's
    # first child is a bare newline, and whose list does not start with an
    # element titled 'Undisplayed'.
    result = [link.string
              for link in table.find_all('a')
              if link.parent.name == 'li'
              and link.next_element.next_element == '\n'
              and not is_undisplayed(first_grandchild(link))]

    return result

In [4]:
def get_all_zones(url_continent, timeout=30):
    """Scrape the zone page names listed in a continent page's navbox.

    Fetches ``url + url_continent``, takes the first ``<table>`` whose class
    list contains ``navbox`` and returns the page name of every list link in
    it that passes the filters below.

    Parameters
    ----------
    url_continent : str
        Page name of the continent as used in the wiki URL; it is appended to
        the module-level ``url``.
    timeout : float, optional
        Seconds ``requests`` waits for the server before raising.

    Returns
    -------
    list of str
        The hrefs of the accepted links with the leading ``'/wiki/'`` removed,
        in document order. Empty when the page has no navbox table.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(url + url_continent, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')

    table = next((tag for tag in page.find_all('table')
                  if 'navbox' in tag.get('class', [])), None)
    if table is None:
        return []

    wiki_prefix = '/wiki/'
    # Links whose href contains any of these substrings are skipped.
    # NOTE(review): '=' presumably weeds out query-string links (e.g. edit links) -- confirm.
    excluded_fragments = ('Classic', 'Template', '=', '.png')

    result = []
    for link in table.find_all('a'):
        if link.parent.name != 'li':
            continue
        href = link.get('href')
        # Anchors without an href, or pointing outside '/wiki/', cannot name a wiki page.
        if href is None or not href.startswith(wiki_prefix):
            continue
        if any(fragment in href for fragment in excluded_fragments):
            continue
        result.append(href[len(wiki_prefix):])

    return result
    
    

In [None]:
# Wiki page names of the continent pages that are passed to get_all_zones.
continents = ['Eastern_Kingdoms', 'Kalimdor', 'Outlands', 'Northrend',
              'Pandaria', 'Draenor_(alternate_universe)', 'Broken_Isles', 'Kul_Tiras', 'Zandalar',
              'Shadowlands', 'Dragon_Isles', 'Khaz_Algar']

# Zone page names processed in addition to those found on the continent pages.
additional_zones = ['Nazjatar', 'Undermine', "K%27aresh"]


def get_name(x):
    """Turn a wiki page name into a human-readable name.

    Percent-escapes are decoded (``%27`` becomes ``'``, and any other escape
    is decoded as well), underscores become spaces, and the text
    ``"ernate universe"`` is removed, so ``"(alternate universe)"`` is
    shortened to ``"(alt)"``.

    Parameters
    ----------
    x : str
        Page name as it appears in a wiki URL.

    Returns
    -------
    str
        The cleaned-up name.
    """
    return unquote(x).replace("_", " ").replace("ernate universe", "")


# Parallel accumulators: subzone_list[i] is a subzone of zone_list[i].
subzone_list = []
zone_list = []

def process_zone(zone, continent=None):
    """Scrape one zone and append its subzones to the module-level lists.

    Calls ``get_all_subzones(zone)`` and extends ``subzone_list`` with the
    result and ``zone_list`` with the zone's display name, once per subzone,
    so the two lists stay aligned. A progress line is printed when at least
    one subzone was found.

    Parameters
    ----------
    zone : str
        Wiki page name of the zone.
    continent : str, optional
        Wiki page name of the continent the zone was found on; only used in
        the progress message. Omit it for zones that were not reached through
        a continent page.
    """
    subzones = get_all_subzones(zone)
    if subzones:
        location = f" ({get_name(continent)})" if continent is not None else ""
        print(f"Processing {get_name(zone)}{location}")
    # extend() mutates the module-level lists in place; `+=` would rebind the
    # names as function locals and raise UnboundLocalError.
    subzone_list.extend(subzones)
    zone_list.extend([get_name(zone)] * len(subzones))


for continent in continents:
    for zone in get_all_zones(continent):
        process_zone(zone, continent)
for zone in additional_zones:
    process_zone(zone)

df = pd.DataFrame({'Subzone': subzone_list, 'Zone': zone_list})
df.to_csv('subzones.csv')

Processing Blasted Lands (Eastern Kingdoms)
Processing Burning Steppes (Eastern Kingdoms)
Processing Deadwind Pass (Eastern Kingdoms)
Processing Duskwood (Eastern Kingdoms)
Processing Elwynn Forest (Eastern Kingdoms)
Processing Stormwind City (Eastern Kingdoms)
Processing Redridge Mountains (Eastern Kingdoms)
Processing Cape of Stranglethorn (Eastern Kingdoms)
Processing Northern Stranglethorn (Eastern Kingdoms)
Processing Swamp of Sorrows (Eastern Kingdoms)
Processing Westfall (Eastern Kingdoms)
Processing Badlands (Eastern Kingdoms)
Processing Blackrock Mountain (Eastern Kingdoms)
Processing Dun Morogh (Eastern Kingdoms)
Processing Ironforge (Eastern Kingdoms)
Processing Loch Modan (Eastern Kingdoms)
Processing Searing Gorge (Eastern Kingdoms)
Processing Twilight Highlands (Eastern Kingdoms)
Processing Wetlands (Eastern Kingdoms)
Processing Arathi Highlands (Eastern Kingdoms)
Processing Hillsbrad Foothills (Eastern Kingdoms)
Processing Hinterlands (Eastern Kingdoms)
Processing Easter