In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import urllib2
from operator import itemgetter

# Configure the root logger so that records at DEBUG level and above are processed.
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Emit a single DEBUG record as a smoke test; the captured cell output shows it
# as "DEBUG:root:info".
logging.debug("info")

def get_beautiful_soup_of_url(url):
    """Fetch ``url`` and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        Whatever ``opener.open`` raises when the request fails; nothing is
        caught here.
    """
    opener = urllib2.build_opener()
    # Send a browser-like User-Agent instead of the library default.
    # NOTE(review): presumably the site rejects the default agent -- confirm.
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
    loc_doc = opener.open(url)
    try:
        # The markup is consumed while the soup is constructed, so the
        # response can be released as soon as the constructor returns.
        soup = BeautifulSoup(loc_doc, 'html.parser')
    finally:
        # Always release the underlying connection, even if parsing fails.
        loc_doc.close()
    return soup

def get_area_link(main_area):
    """Pull the href value out of the string form of ``main_area``.

    The lookup is purely positional: the fifth line of ``str(main_area)`` is
    split on single spaces and the fourth token is taken as the ``href="..."``
    attribute. The ``href="`` prefix and every double quote are then removed.

    Raises:
        IndexError: if the text has fewer than five lines or the fifth line
            has fewer than four space-separated tokens.
    """
    markup_lines = str(main_area).split("\n")
    anchor_tokens = markup_lines[4].split(" ")
    raw_href = anchor_tokens[3]
    without_prefix = raw_href.replace('href="', '')
    return without_prefix.replace('"', '')

def get_area_name(main_area):
    """Return the last ``/``-separated segment of the area link, with any
    double quotes removed."""
    area_link = get_area_link(main_area)
    last_segment = area_link.rsplit("/", 1)[-1]
    return last_segment.replace('"', '')

def get_area_dict(main_area):
    """Describe an area as a dict with its ``"name"`` and ``"link"``."""
    area_name = get_area_name(main_area)
    area_link = get_area_link(main_area)
    return {"name": area_name, "link": area_link}

def return_area_routes(area_list):
    """Download every URL in ``area_list`` and collect its route-guide element.

    Returns:
        A list with one entry per input URL, in input order: the result of
        looking up the ``div`` whose id is ``route-guide`` on that page.
    """
    return [
        get_beautiful_soup_of_url(area_url).find("div", {"id": "route-guide"})
        for area_url in area_list
    ]

DEBUG:root:info


In [26]:
def get_sub_area_link(left_nav_data):
    """Extract the first href value found on the second line of the string
    form of ``left_nav_data``.

    Raises:
        IndexError: if the text has fewer than two lines or its second line
            contains no ``href="`` attribute.
    """
    second_line = str(left_nav_data).split("\n")[1]
    after_href = second_line.split('href="')[1]
    # Everything up to the closing quote is the link itself.
    return after_href.partition('"')[0]

def get_sub_area_name(left_nav_data):
    """Return the final ``/``-separated segment of the sub-area link."""
    sub_area_link = get_sub_area_link(left_nav_data)
    return sub_area_link.rsplit("/", 1)[-1]

def get_sub_area_dict(left_nav_data):
    """Return the sub-area link for ``left_nav_data``.

    Despite the name, no dict is built: the value is exactly what
    ``get_sub_area_link`` returns.
    """
    sub_area_link = get_sub_area_link(left_nav_data)
    return sub_area_link

def get_sub_area_data(sub_area_data):
    """Download the page at ``sub_area_data`` and list its left-nav links.

    Args:
        sub_area_data: URL of the area page to fetch.

    Returns:
        One ``get_sub_area_dict`` result for every ``div`` with class
        ``lef-nav-row`` on the page, in document order.
    """
    page_soup = get_beautiful_soup_of_url(sub_area_data)
    nav_rows = page_soup.find_all("div", {"class": "lef-nav-row"})
    nav_links = []
    for nav_row in nav_rows:
        nav_links.append(get_sub_area_dict(nav_row))
    return nav_links

def return_sub_area_links(area_data):
    """Recursively gather the sub-area links reachable from ``area_data``.

    Returns:
        A list holding the links found beneath each direct child (children
        visited in page order), followed by the direct children themselves.

    NOTE(review): there is no visited-set, so the recursion only ends when
    pages with no left-nav rows are reached -- confirm the site cannot link
    in a cycle.
    """
    direct_children = get_sub_area_data(area_data)
    descendants = []
    for child_url in direct_children:
        descendants.extend(return_sub_area_links(child_url))
    return descendants + direct_children

In [36]:
# Top-level area pages to export, grouped by region.
austin_urls = [
    "https://www.mountainproject.com/area/105837312/reimers-ranch",
    "https://www.mountainproject.com/area/107788017/reimers-ranch-north-shore",
    "https://www.mountainproject.com/area/105905087/barton-creek-greenbelt",
]
frankenjura_urls = [
    "https://www.mountainproject.com/area/106172652/frankenjura",
]

In [37]:
def build_export_url(rock_type, area_id):
    """Assemble the route-finder export URL for one area and climb type.

    Args:
        rock_type: Value placed in the ``type`` query parameter.
        area_id: Value placed in the ``selectedIds`` query parameter.

    Returns:
        The export URL: base address, the two variable parameters, then a
        fixed block of difficulty, filter and sort parameters.
    """
    endpoint = 'https://www.mountainproject.com/route-finder-export'
    # Fixed query tail reproduced verbatim: difficulty bounds, filters, sort order.
    fixed_filters = '&diffMinrock=800&diffMinboulder=20000&diffMinaid=70000&diffMinice=30000&diffMinmixed=50000&diffMaxrock=12400&diffMaxboulder=21400&diffMaxaid=75260&diffMaxice=38500&diffMaxmixed=60000&is_sport_climb=1&stars=0&pitches=0&sort1=popularity+desc&sort2=rating'
    pieces = [
        endpoint,
        '?selectedIds=', str(area_id),
        '&type=', str(rock_type),
        fixed_filters,
    ]
    return ''.join(pieces)

def get_export_url_from_route_finder(area_id):
    """Download the route-finder export for ``area_id`` and return its lines.

    Args:
        area_id: Area identifier passed through to ``build_export_url``.

    Returns:
        The list of lines read from the export response, or ``[]`` when the
        download fails. Only the result for the last entry of ``rock_type``
        is kept; the list currently holds the single type ``'rock'``.
    """
    rock_type = ['rock']
    # Defined up front so the function still returns a list if the type
    # list is ever emptied.
    route_data = []
    for rock in rock_type:
        try:
            export_url = build_export_url(rock, area_id)
            url_request = urllib2.urlopen(export_url)
            try:
                route_data = url_request.readlines()
            finally:
                # Release the connection whether or not the read succeeded.
                url_request.close()
        except Exception as exc:
            # Best-effort download: keep going with an empty result, but say
            # why. ``Exception`` (not a bare ``except``) lets Ctrl-C and
            # SystemExit stop a long scraping run.
            logging.warning("Route export failed for area %r (%s): %s", area_id, rock, exc)
            route_data = []
    return route_data

def extract_area_id(sub_area_data):
    """Return the fifth ``/``-separated piece of ``sub_area_data``.

    For the area URLs used in this notebook (``https://host/area/<id>/<name>``)
    that piece is the numeric area id.

    Raises:
        IndexError: if the string has fewer than five ``/``-separated pieces.
    """
    url_pieces = sub_area_data.split('/')
    return url_pieces[4]

def extract_dataframe_from_area(area_list):
    """Build one DataFrame of routes from the exports of several areas.

    Args:
        area_list: Iterable of area URLs; the area id is taken from each with
            ``extract_area_id``.

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns ``Route``,
        ``Location``, ``URL``, ``"Avg Stars"``, ``"Your Stars"``, ``Rating``,
        ``Pitches`` and ``Length``. It is empty, but has the same columns,
        when no rows were downloaded.
    """
    columns = ['Route','Location','URL','"Avg Stars"','"Your Stars"','Rating','Pitches','Length']
    # Resolve every id first so a malformed URL fails before any download starts.
    area_ids = [extract_area_id(area) for area in area_list]

    rows = []
    for area_id in area_ids:
        export_lines = get_export_url_from_route_finder(area_id)
        # The first line is skipped. NOTE(review): presumably the export's
        # header row -- confirm.
        for line in export_lines[1:]:
            # Drop the line terminator so it does not end up inside the last
            # field ('Length'), and ignore blank lines.
            line = line.rstrip('\r\n')
            if not line:
                continue
            fields = line.split(',')
            # Keep the first five and last three comma-separated fields.
            # NOTE(review): this is a plain split, not CSV parsing; a comma
            # inside one of the first five fields will shift columns.
            rows.append(fields[0:5] + fields[-3:])

    # Build the frame once instead of appending per area: repeated
    # DataFrame.append copies the data each time and was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=columns)

In [40]:
# Download the route-finder export for each region and build one frame per region.
# Both calls perform HTTP requests.
austin_route_df = extract_dataframe_from_area(austin_urls)
frankenjura_route_df = extract_dataframe_from_area(frankenjura_urls)

In [42]:
#austin_route_df.to_csv('austin_rd.csv')

In [43]:
#frankenjura_route_df.to_csv('frankenjura_rd.csv')

In [None]:
# Follow-up steps
# 1) Scrape text data from the available route URLs
# 2) Extract text features from each page
# 3) Extract general weather data for each area
# 4) Collect geographical data for each area
# 5) Merge everything into a master DataFrame
# 6) Outcome variable: star rating of the route
# 7) Move on to modeling

In [54]:
def return_route_soup(route_url):
    """Fetch and parse a route page, returning ``''`` when that fails.

    Args:
        route_url: URL of the route page.

    Returns:
        The parsed page from ``get_beautiful_soup_of_url``, or the empty
        string if fetching or parsing raised an ordinary exception.
    """
    try:
        return get_beautiful_soup_of_url(route_url)
    except Exception as exc:
        # Best-effort scrape: record the failure and keep going.
        # ``Exception`` (not a bare ``except``) still lets Ctrl-C and
        # SystemExit abort a long ``apply`` over many URLs.
        logging.warning("Could not fetch route page %r: %s", route_url, exc)
        return ''

def return_page_views(route_soup):
    """Extract the total page-view figure from a parsed route page.

    The text is cut out of the first ``table`` with class
    ``description-details``: everything between the cell after the
    ``Page Views:`` label and the word ``total``, with whitespace stripped.

    Args:
        route_soup: Parsed page exposing ``find_all``; ``''`` (a failed
            fetch) is tolerated.

    Returns:
        The page-view text as a string, or ``''`` if it cannot be located.
    """
    try:
        route_details = route_soup.find_all("table", {"class": "description-details"})
        after_label = str(route_details[0]).split("Page Views:")[1]
        value_cell = after_label.split("</td>\n<td>\n")[1]
        return value_cell.split("total")[0].strip()
    except Exception:
        # Missing table/label or a non-soup input: treat as "no data".
        # ``Exception`` (not a bare ``except``) lets Ctrl-C and SystemExit through.
        return ''

def return_text(route_soup):
    """Concatenate the description blocks of a parsed route page.

    Every ``div`` with class ``fr-view`` is converted to a string, the
    wrapper ``div`` tags and line-break tags are removed, and the pieces are
    joined without a separator.

    Args:
        route_soup: Parsed page exposing ``find_all``; ``''`` (a failed
            fetch) is tolerated.

    Returns:
        The combined text, or ``''`` if extraction fails.
    """
    # "<br/>" is listed because BeautifulSoup serialises void tags with a
    # closing slash by default, so removing "<br>" alone leaves them behind.
    fragments_to_drop = ("<br/>", "<br>", "</br>", '<div class="fr-view">', "</div>")
    try:
        route_details = route_soup.find_all("div", {"class": "fr-view"})
        cleaned = []
        for details in route_details:
            text = str(details)
            for fragment in fragments_to_drop:
                text = text.replace(fragment, "")
            cleaned.append(text)
        return "".join(cleaned)
    except Exception:
        # Non-soup input or unexpected markup: treat as "no text".
        # ``Exception`` (not a bare ``except``) lets Ctrl-C and SystemExit through.
        return ''
    
def number_of_photos(route_soup):
    """Count the photo cards on a parsed route page.

    Args:
        route_soup: Parsed page exposing ``find_all``; ``''`` (a failed
            fetch) is tolerated.

    Returns:
        The number of ``div`` elements with class
        ``col-xs-4 col-lg-3 card-with-photo``, or ``''`` if counting fails.
    """
    try:
        photo_cards = route_soup.find_all("div", {"class": "col-xs-4 col-lg-3 card-with-photo"})
        return len(photo_cards)
    except Exception:
        # Non-soup input: report "no data" with the same '' sentinel as the
        # other extractors. ``Exception`` (not a bare ``except``) lets Ctrl-C
        # and SystemExit through.
        return ''

def number_of_comments(route_soup):
    """Count the comment tables on a parsed route page.

    Args:
        route_soup: Parsed page exposing ``find_all``; ``''`` (a failed
            fetch) is tolerated.

    Returns:
        The number of ``table`` elements with class
        ``main-comment width100``, or ``''`` if counting fails.
    """
    try:
        comment_tables = route_soup.find_all("table", {"class": "main-comment width100"})
        return len(comment_tables)
    except Exception:
        # Non-soup input: report "no data" with the same '' sentinel as the
        # other extractors. ``Exception`` (not a bare ``except``) lets Ctrl-C
        # and SystemExit through.
        return ''

def return_comment_text(route_soup):
    """Concatenate the comment bodies of a parsed route page.

    Every ``div`` with class
    ``comment-body max-height max-height-md-300 max-height-xs-150`` is
    converted to a string, stripped of its wrapper markup, line-break tags
    and comment-time span tags, trimmed, and joined without a separator.

    Args:
        route_soup: Parsed page exposing ``find_all``; ``''`` (a failed
            fetch) is tolerated.

    Returns:
        The combined comment text, or ``''`` if extraction fails.
    """
    try:
        route_details = route_soup.find_all("div", {"class": "comment-body max-height max-height-md-300 max-height-xs-150"})
        # "<br/>" is removed as well as "<br>": BeautifulSoup serialises void
        # tags with a closing slash by default.
        return "".join([str(details) \
             .replace('<div class="comment-body max-height max-height-md-300 max-height-xs-150">\n',"")\
             .replace("<br/>","") \
             .replace("<br>","") \
             .replace("</br>","") \
             .replace('<span class="comment-time">\n',"") \
             .replace('</span>\n</div>',"") \
             .replace('\n                                    ',' ') \
             .strip() for details in route_details])
    except Exception:
        # Non-soup input or unexpected markup: treat as "no text".
        # ``Exception`` (not a bare ``except``) lets Ctrl-C and SystemExit through.
        return ''

In [49]:
# Row count of the Frankenjura frame.
# NOTE(review): the recorded output is exactly 1000, which may mean the export
# is capped rather than complete -- confirm.
len(frankenjura_route_df)

1000

In [55]:
# Fetch and parse the page behind every value in the URL column (one HTTP
# request per row). Rows whose fetch fails hold '' instead of a parsed page.
austin_route_df['route_soup'] = austin_route_df['URL'].apply(return_route_soup)

frankenjura_route_df['route_soup'] = frankenjura_route_df['URL'].apply(return_route_soup)

In [57]:
# Derive the description text and the comment text from each parsed page.
# Both extractors return '' for rows whose page could not be processed.
austin_route_df['page_text'] = austin_route_df['route_soup'].apply(return_text)
austin_route_df['comment_text'] = austin_route_df['route_soup'].apply(return_comment_text)

frankenjura_route_df['page_text'] = frankenjura_route_df['route_soup'].apply(return_text)
frankenjura_route_df['comment_text'] = frankenjura_route_df['route_soup'].apply(return_comment_text)

In [59]:
# Persist both frames as CSV in the current working directory.
# NOTE(review): the 'route_soup' column added earlier in the notebook is never
# dropped in the cells shown here, so each page's full string form would be
# written too -- consider dropping it before export.
austin_route_df.to_csv('austin_route_text.csv')
frankenjura_route_df.to_csv('frankenjura_route_text.csv')