In [4]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

### Use the following resource

https://www.zeldadungeon.net/wiki/Tears_of_the_Kingdom_Materials

----

- Get a list of all materials
    - include href links that point to additional pages

In [6]:
### each of the materials groups has a header indicated by the following: 
# -> span class="mw-headline"
# 
# Under this header, there are lists of multiple 
# 
totk_materials_url = 'https://www.zeldadungeon.net/wiki/Tears_of_the_Kingdom_Materials'
r = requests.get(totk_materials_url)
r.status_code

200

In [176]:
# find html class mw-headline in html document soup
soup = BeautifulSoup(r.text)

In [203]:
# find Contents Index
toc_list = soup.find_all('div', {"class":"toc", "id":"toc"})
# find all material headers
hdr_list = soup.find_all('h2')
# find all materials under each header
span_list = soup.find_all('span', {"class":"mw-headline"})
# get all links for each material
dl_list = soup.find_all('dl')

In [227]:
def extract_material_hierarchy(toc_list):
    ''' return material group dictionary containing materials that fall under each group '''
    toc_ul_list = toc_list[0].find_all('ul')
    full_toc = toc_ul_list[0]
    group_hdr_list = full_toc.find_all('li', {"class":"toclevel-1"})
    len(group_hdr_list)


    ### for each materials group, find all materials that align with this group
    toclevel1_pat = re.compile("toclevel-1*")
    toclevel2_pat = re.compile("toclevel-2*")
    material_group_dict = {}
    for g in group_hdr_list:
        material_group = g.contents[0].text
        print(material_group)
        group_vals_list = g.find_all('li', {"class":toclevel2_pat})
        material_list = []
        for gv in group_vals_list:
            try:
                g_a_val = gv.find("a")
                material_name = g_a_val.find_all('span')[1].text
                material_list.append(material_name)
                # print('\t'+material_name)
            except:
                print('Could not parse the above')
        if pd.isnull(material_group)==False:
            material_group_dict.update({material_group:material_list})
    return material_group_dict

In [232]:
def get_href_by_material(dl_list):
        # get title for each material
    dl_raw_title = [d.find('a').get('title') for d in dl_list if d.find('a') is not None]
        # get URL for each material
    dl_raw_links = [d.find('a').get('href') for d in dl_list if d.find('a') is not None]
        # create absolute URL reference
    base_url = 'https://www.zeldadungeon.net'
    material_proper_link = [base_url+dl for dl in dl_raw_links]
        # 
    link_zip = list(zip(dl_raw_title, material_proper_link))
    link_df = pd.DataFrame(link_zip, columns=['Material Name','URL'])
    return link_df 

In [253]:
### retrieve material groups and names
material_group_dict = extract_material_hierarchy(toc_list)
    # turn into dataframe
mat_final_list = []
_ = [mat_final_list.extend(list(zip([k]*len(v), v))) for k, v in material_group_dict.items()]
material_df = pd.DataFrame(mat_final_list, columns=['Material Group','Material Name'])
    # adjust group name | remove rank/order prefix
rem_order_str = lambda s: ' '.join(s.split()[1:])
material_df['Material Group'] = material_df['Material Group'].apply(rem_order_str)

### retrieve href for each material to gather more info
material_link_df = get_href_by_material(dl_list)
rem_shard = lambda s: s.replace('Shard of','').strip()
    # remove "Shard of" for the following materials
special_horn_list = ['Shard of Dinraal\'s Horn',\
                    'Shard of Naydra\'s Horn',\
                    'Shard of Farosh\'s Horn']
special_horn_inds = material_link_df['Material Name'].isin(special_horn_list)
horn_adj = material_link_df.loc[special_horn_inds, 'Material Name'].apply(rem_shard)
material_link_df.loc[special_horn_inds, 'Material Name'] = horn_adj

### JOIN tables together for full list
material_df_aug = material_df.merge(material_link_df, how='left', on='Material Name')
material_df_aug.info()
try:
    assert material_df_aug[material_df_aug['URL'].isnull()].shape[0]
except:
    print('At least one of the materials does not have a proper URL')

1 Fruits
2 Mushrooms
3 Plants
4 Meats
5 Other Ingredients
6 Fishes
7 Snails & Crabs
8 Fairy
9 Insects
10 Frogs & Lizards
11 Ores
12 Guts
13 Chuchu Jellies & Octo Balloon
14 Eyeballs
15 Wings
16 Horns
17 Blunt Enemy Drops
18 Tails & Tentacles
19 Teeth, Claws, & Fangs
20 Dragon Parts
21 Miscellaneous
<class 'pandas.core.frame.DataFrame'>
Int64Index: 251 entries, 0 to 250
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Material Group  251 non-null    object
 1   Material Name   251 non-null    object
 2   URL             251 non-null    object
dtypes: object(3)
memory usage: 7.8+ KB
At least one of the materials does not have a proper URL


In [255]:
material_df_aug.to_csv('Material Names and URLs.csv', index=False)