In [1]:
# Standard library
import warnings
from pathlib import Path  # filesystem paths

# Standard website libraries
import advertools as a    # parse sitemaps
import bs4                # html parsing
import pandas as pd       # dataframe library
import requests           # python http requests

# Ignore warnings
warnings.filterwarnings('ignore')

# Generate List of URLs to scrape
The website's sitemap files list every URL on the site. Each URL is checked to see whether its path contains "/trail/", and if so, it is added to the URL list to be scraped later.

In [2]:
# Empty containers for the scrape: the list of trail page URLs and the
# dataframe that will hold one row of stats per trail.
urls = []
df = pd.DataFrame(
    columns=[
        'url',
        'Name',
        'Difficulty',
        'Distance',
        'Type',
        'High Elev',
        'Low Elev',
        'Elev Gained',
        'Elev Lost',
        'Avg Grade',
        'Max Grade',
    ]
)

In [27]:
# Start from an empty list so that re-running this cell does not append
# duplicate URLs to the ones collected by a previous run.
urls = []

# Iterate through each zipped sitemap (sitemap0.xml.gz ... sitemap99.xml.gz)
for i in range(100):
    site_url = f'https://www.hikingproject.com/sitemap{i}.xml.gz'
    sitemap_df = a.sitemap_to_df(site_url)

    # Plain substring match: '/' needs no escaping, and regex=False avoids
    # the invalid '\/' escape sequence. na=False treats a missing "loc" as
    # "not a trail" instead of producing a NaN mask.
    is_trail = sitemap_df["loc"].str.contains('/trail/', regex=False, na=False)

    # only add the url to the list, not the index number from the dataframe
    urls.extend(sitemap_df.loc[is_trail, "loc"].tolist())

2021-07-25 03:03:46,320 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap0.xml.gz
2021-07-25 03:03:46,471 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap1.xml.gz
2021-07-25 03:03:46,619 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap2.xml.gz
2021-07-25 03:03:46,804 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap3.xml.gz
2021-07-25 03:03:46,960 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap4.xml.gz
2021-07-25 03:03:47,116 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap5.xml.gz
2021-07-25 03:03:47,295 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap6.xml.gz
2021-07-25 03:03:47,443 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap7.xml.gz
2021-07-25 03:03:47,592 | INFO |

2021-07-25 03:03:57,369 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap68.xml.gz
2021-07-25 03:03:57,520 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap69.xml.gz
2021-07-25 03:03:57,670 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap70.xml.gz
2021-07-25 03:03:57,852 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap71.xml.gz
2021-07-25 03:03:58,001 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap72.xml.gz
2021-07-25 03:03:58,155 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap73.xml.gz
2021-07-25 03:03:58,333 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap74.xml.gz
2021-07-25 03:03:58,485 | INFO | sitemaps.py:419 | sitemap_to_df | Getting https://www.hikingproject.com/sitemap75.xml.gz
2021-07-25 03:03:58,635 

In [3]:
# Display a small sample of the urls plus the total count to verify;
# echoing the full list would flood the notebook output.
urls[:5], len(urls)

([], 0)

# Scrape each URL for trail stats
Iterate through the URL list, scraping the trail stats and storing them in a dictionary, before appending them to the trail stats dataframe.

In [30]:
def _parse_trail_stats(trail_url, soup):
    """Extract the headline stats for one trail from its parsed page.

    Args:
        trail_url: Address of the trail page; stored under the 'url' key.
        soup: BeautifulSoup tree built from that page's HTML.

    Returns:
        dict whose keys match the column names of the trail dataframe.

    Raises:
        AttributeError / IndexError: if an expected element or text piece
            is missing from the page.
    """
    trail_name = soup.find(id='trail-title').text.strip()
    difficulty = soup.find(class_='difficulty-text').text.strip()

    stats = soup.find(id='trail-stats-bar')

    # Distance / trail type block
    stat_block = stats.find('div', class_='stat-block ml-2 mr-1 mt-1').text.strip().split('\n\n\n')
    dist = stat_block[0]
    trail_type = stat_block[2]

    # High / low elevation block: pieces 0 and 3 are joined with pieces 2
    # and 5 (presumably value + trailing unit/label text - confirm against
    # the page markup).
    elev_div = stats.find('div', class_='stat-block mx-1 mt-1')
    stat_block2 = elev_div.text.strip().split('\n\n')
    elev_high = (stat_block2[0] + stat_block2[2]).strip()
    elev_low = (stat_block2[3] + stat_block2[5]).strip()

    # Gain / loss block, two siblings after the elevation block. The
    # suffixes are taken from this block's own text; the previous code
    # borrowed them from the high/low block.
    # NOTE(review): assumes this block splits into the same 6-piece layout
    # as the high/low block - confirm.
    stat_block3 = elev_div.next_sibling.next_sibling.text.strip().split('\n\n')
    elev_gain = (stat_block3[0] + stat_block3[2]).strip()
    elev_lost = (stat_block3[3] + stat_block3[5]).strip()

    # Grade block: strip the surrounding parentheses from both values
    stat_block4 = stats.find('div', class_='stat-block ml-1 mt-1').text.strip().split('\n')
    avg_grade = stat_block4[1].strip().replace("(", "").replace(")", "")
    max_grade = stat_block4[4].strip().replace("(", "").replace(")", "")

    return {'url': trail_url, 'Name': trail_name, 'Difficulty': difficulty, 'Distance': dist,
            'Type': trail_type, 'High Elev': elev_high, 'Low Elev': elev_low,
            'Elev Gained': elev_gain, 'Elev Lost': elev_lost,
            'Avg Grade': avg_grade, 'Max Grade': max_grade}


# Collect one dict per trail and build the dataframe once at the end;
# DataFrame.append was removed in pandas 2.0 and row-by-row growth is quadratic.
trail_records = []
for url in urls:
    # Fetch the trail page itself (previously this requested site_url, the
    # last sitemap file, on every iteration).
    page = requests.get(url)

    # checks to make sure page exists before spending time parsing it
    if page.status_code != 200:
        continue

    soup = bs4.BeautifulSoup(page.content, 'html.parser')
    trail_records.append(_parse_trail_stats(url, soup))

# Rebuild df from this run's records, keeping its existing column order, so
# re-running the cell does not duplicate rows.
df = pd.DataFrame(trail_records, columns=df.columns)

In [4]:
# Display the first rows of the scraped trail data (not the whole frame)
df.head()

Unnamed: 0,url,Name,Difficulty,Distance,Type,High Elev,Low Elev,Elev Gained,Elev Lost,Avg Grade,Max Grade


# Export trail data to a CSV file
Export the raw trail data to a local CSV file.

In [33]:
# Make sure the output folder exists first: to_csv raises OSError when the
# parent directory of the target file is missing.
output_path = Path('data/raw_trail_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)