# AleTrail BeerAdvocate Scraper

## Cache pages from www.beeradvocate.com locally for further scraping

By: Mike Beaumier -- Fellow, Insight Data Science

 - Email: [michael.beaumier@gmail.com](mailto:michael.beaumier@gmail.com)
 - Twitter: [@jollyhrothgar](https://twitter.com/jollyhrothgar)
 - **LinkedIn**: [Add me w/ 1 line message about our connection](https://www.linkedin.com/in/michaelbeaumier)

Repositories on [github](https://www.github.com/Jollyhrothgar)

## 1. Import Libraries

In [15]:
import os
import re 
import sys
import urllib2
import pandas
import csv
import numpy as np
import matplotlib.pyplot as plt
from time import sleep
from bs4 import BeautifulSoup

In [16]:
def scrape_brewery(url,dump_directory = '.'):
    '''
    takes a beer-advocate URL formatted like so: www.beeradvocate.com/pages/<numeric_id>/" 
    and dumps the page to a utf-8 text file. Uses <numeric_id> to uniquely identify the file.
    stores these html dumps in directory argument (defualt is '.') as: 
    '<dump_directory>/index_<numeric_id>.html'
    '''
    
    # Check if data has been downloaded
    re_check = re.search('profile/(\d+)',url)
    print re_check.group(1)
    if os.path.isdir(dump_directory+'/'+re_check.group(1)):
        print "data exists"
        return 1
    else:
        sleep(0.25)
    print "Scraping URL: ",url
    headers = { 'User-Agent' : 'Mozilla/5.0'  }
    req = urllib2.Request(url, None, headers)
    html = urllib2.urlopen(req).read()
    soup = BeautifulSoup(html,'lxml')

    # Get an appropriate output name from the url
    m = re.search(r'http://www.beeradvocate.com/beer/profile/(\d+)/',url)
    page_id = 0
    if not m:
        print 'Bad match for url:',url,'\nBailing out!'
        return
    else:
        page_id = m.group(1)
    brewery_page = soup.encode('utf-8')
    matches = re.findall('/\d+/\d+',brewery_page)
    brew_dict = {}
    for match in matches:
        id_match = re.search('/(\d+)/(\d+)',match)
        if id_match.group(1) not in brew_dict:
            brew_dict[id_match.group(1)] = []
            brew_dict[id_match.group(1)].append(id_match.group(2))
        else:
            brew_dict[id_match.group(1)].append(id_match.group(2))
    # okay, now we have a list of beer pages from which to scrape reviews, but we only
    # want the 'real' beers, not suggestions from beer-advocate, so we keep the 'larger list' of beers.
    best_key = -1
    key_length = -1
    for key in brew_dict:
        if len(brew_dict[key]) > key_length:
            best_key = key
            key_length = len(brew_dict[key])
    
    # "http://www.beeradvocate.com/beer/profile/<brewery_id>/<beer_id>/view=beer&sort=topr&start=<page_num>"
    beer_url_dict = {}
    for beer_id in brew_dict[best_key]:
        beer_url = (
            "http://www.beeradvocate.com/beer/profile/"
            +str(best_key)
            +"/"
            +str(beer_id)
            )
        if beer_url not in beer_url_dict:
            beer_url_dict[beer_url] = brew_dict[best_key]
    # Create beer review list whether or not reviews exist. If they exist, we get the top 50
    beer_review_list = []
    for k,v in beer_url_dict.iteritems():
        beer_review_list.append(k + '/view=beer&sort=top&start=0')
        beer_review_list.append(k + '/view=beer&sort=top&start=25')
    for url in beer_review_list:
        #print url
    print dump_directory+'/'+str(best_key)
    try:
        os.mkdir(dump_directory+'/'+str(best_key))
    except OSError:
        print "data exists for brewery, skipping.\n"
        return
    # dump the file
    out_file = open(dump_directory+'/'+str(best_key)+'/brewery_'+str(page_id)+'.html','w')
    out_file.write(brewery_page)
    out_file.close()
    
    for beer_url in beer_review_list:
        print "Scraping beer: ",beer_url
        matches = re.search('(\d+)/(\d+)',beer_url)
        out_beer_file = open(dump_directory+'/'+str(best_key)+'/beer_'+matches.group(2)+'.html','w')
        sleep(0.25)
        beer_req = urllib2.Request(beer_url,None,headers)
        beer_html = urllib2.urlopen(beer_req).read()
        beer_soup = BeautifulSoup(beer_html,'lxml')
        out_beer_file.write(beer_soup.encode('utf-8'))
        out_beer_file.close()
    print "Done scraping",best_key,"!"
    return

In [17]:
scrape_brewery('http://www.beeradvocate.com/beer/profile/4410/','retry')

4410
Scraping URL:  http://www.beeradvocate.com/beer/profile/4410/
http://www.beeradvocate.com/beer/profile/4410/43424/view=beer&sort=top&start=0
http://www.beeradvocate.com/beer/profile/4410/43424/view=beer&sort=top&start=25
http://www.beeradvocate.com/beer/profile/4410/43423/view=beer&sort=top&start=0
http://www.beeradvocate.com/beer/profile/4410/43423/view=beer&sort=top&start=25
http://www.beeradvocate.com/beer/profile/4410/51957/view=beer&sort=top&start=0
http://www.beeradvocate.com/beer/profile/4410/51957/view=beer&sort=top&start=25
http://www.beeradvocate.com/beer/profile/4410/62401/view=beer&sort=top&start=0
http://www.beeradvocate.com/beer/profile/4410/62401/view=beer&sort=top&start=25
http://www.beeradvocate.com/beer/profile/4410/61919/view=beer&sort=top&start=0
http://www.beeradvocate.com/beer/profile/4410/61919/view=beer&sort=top&start=25
http://www.beeradvocate.com/beer/profile/4410/61918/view=beer&sort=top&start=0
http://www.beeradvocate.com/beer/profile/4410/61918/view=be

IOError: [Errno 2] No such file or directory: 'retry/4410/beer_45099.html'

In [4]:
print(os.path.isdir("breweries/3120"))

True


In [5]:
def load_url_list(filename):
    '''
    takes the url list in filename, loads into an array, and then returns the array of URLS
    '''
    in_file = open(filename,'rU')
    brewery_list = in_file.read().splitlines()
    print 'Loaded',len(brewery_list),"breweries."
    return brewery_list

In [18]:
def main():
    urls = load_url_list('brewery_urls.txt')
    master_list = []
    for url in urls:
        scrape_brewery(url,'./retry')
    print "Finished scraping!"

In [None]:
# Standard boilerplate to call the main() function to begin
# the program. Execute this to run everything
if __name__ == '__main__':
    main()

Loaded 645 breweries.
31027
Scraping URL:  http://www.beeradvocate.com/beer/profile/31027/
http://www.beeradvocate.com/beer/profile/31027/90624/view=beer&sort=top&start=0
http://www.beeradvocate.com/beer/profile/31027/90624/view=beer&sort=top&start=25
http://www.beeradvocate.com/beer/profile/31027/174882/view=beer&sort=top&start=0
http://www.beeradvocate.com/beer/profile/31027/174882/view=beer&sort=top&start=25
http://www.beeradvocate.com/beer/profile/31027/107941/view=beer&sort=top&start=0
http://www.beeradvocate.com/beer/profile/31027/107941/view=beer&sort=top&start=25
http://www.beeradvocate.com/beer/profile/31027/93412/view=beer&sort=top&start=0
http://www.beeradvocate.com/beer/profile/31027/93412/view=beer&sort=top&start=25
http://www.beeradvocate.com/beer/profile/31027/93908/view=beer&sort=top&start=0
http://www.beeradvocate.com/beer/profile/31027/93908/view=beer&sort=top&start=25
./retry/31027
Scraping beer:  http://www.beeradvocate.com/beer/profile/31027/90624/view=beer&sort=to