# Clean Downloaded BeerAdvocate Pages

## Parse the pages cached locally from www.beeradvocate.com into clean, tabular data

By: Mike Beaumier -- Fellow, Insight Data Science

 - Email: [michael.beaumier@gmail.com](mailto:michael.beaumier@gmail.com)
 - Twitter: [@jollyhrothgar](https://twitter.com/jollyhrothgar)
 - **LinkedIn**: [Add me w/ 1 line message about our connection](https://www.linkedin.com/in/michaelbeaumier)

Repositories on [github](https://www.github.com/Jollyhrothgar)

This notebook cleans the downloaded beer data and generates a database and a CSV file.


In [7]:
import os
import re 
import sys
import urllib2
import pandas
import csv
import numpy as np
import matplotlib.pyplot as plt
from time import sleep
from bs4 import BeautifulSoup

In [8]:
def mine_file(file_name):
    '''
    Takes file_name and mines the cached brewery page for its table of beers.
    Note that the unique brewery ID must be part of the brewery's file name
    (brewery_<id>.html). This step is accomplished during scraping.

    Parameters
    ----------
    file_name : str
        Path to a cached brewery page. The content is decoded as UTF-8.

    Returns
    -------
    list of dict or None
        One record per beer row. Every record carries the brewery fields
        (brewery_key, brewery_name, city, state and, only when a phone number
        was found, phone and phone_key) plus the beer fields (beer_name,
        style_name, style_key, beer_name_key, abv, avg_score, ratings_count,
        bros_score). style_key / beer_name_key are -1 when the matching link
        could not be parsed.
        None is returned, after printing a message, when the brewery key
        cannot be read from file_name, when the page title or tables are
        missing or malformed, or when a beer link carries a brewery key that
        differs from the one in file_name.

    Notes
    -----
    This notebook targets Python 2: the 'rU' open mode and the byte-string
    .decode()/.encode() calls below are Python 2 idioms.
    '''
    ## Extract brewery ID from the file name
    matches = re.search(r'brewery_(\d+)\.html', file_name)
    if not matches:
        print("Problem extracting brewery key. Bailing out!")
        return
    brewery_key = matches.group(1)

    # The with-block closes the handle on every exit path, including the
    # early returns further down.
    with open(file_name, 'rU') as f:
        soup = BeautifulSoup(f.read().decode('utf-8'), 'lxml')

    ### Overview Information ###
    # The parsing below expects a title shaped like "<name> | <city>, <state> ..."
    title = soup.title.string if soup.title else None
    title_list = title.split('|') if title else []
    city_state_list = title_list[1].split(',') if len(title_list) > 1 else []
    if len(city_state_list) < 2:
        print("Problem parsing page title. Bailing out! Brewery key: %s" % brewery_key)
        return

    brewery_dict = {
        "brewery_key": brewery_key,
        "brewery_name": title_list[0].strip().encode('utf_8'),
        "city": city_state_list[0].strip().encode('utf_8'),
        "state": city_state_list[1].strip().encode('utf_8'),
    }

    tables = soup.findAll('table')
    if not tables:
        print("No tables found in page. Bailing out! Brewery key: %s" % brewery_key)
        return
    # First table is searched for the phone number, last table for the beers.
    overview_table = tables[0]
    beer_table = tables[-1]

    ### Get Phone Number ###
    # Keep the first match only. Nested rows repeat their text inside the
    # enclosing row, so later rows can match the same number again.
    for row in overview_table.findAll('tr'):
        phone_match = re.search(r'phone: (\((\d{3})\) (\d{3})-(\d{4}))', row.text)
        if phone_match:
            brewery_dict['phone'] = phone_match.group(1).encode('utf_8')
            # Digits only: area code + exchange + line number.
            brewery_dict['phone_key'] = ''.join(phone_match.group(2, 3, 4)).encode('utf_8')
            break

    ### Getting the beer ratings, styles, etc ###
    beer_dicts = []
    for row in beer_table.findAll('tr'):
        cells = [col.text for col in row.findAll('td')]
        # Only rows with exactly six cells are beer rows; a row with a cell
        # whose text is exactly 'Style' is skipped (presumably the header row).
        if len(cells) != 6 or 'Style' in cells:
            continue

        # The row's links encode the keys: the first link is matched against
        # /beer/profile/<brewery>/<beer>, the second against /beer/style/<style>.
        links = row.findAll('a')
        beer_key = -1
        beer_style_key = -1
        if len(links) > 0:
            key_matches = re.search(r'/beer/profile/(\d+)/(\d+)', links[0].get('href', ''))
            if key_matches:
                if brewery_key != key_matches.group(1):
                    print("Brewery key mismatch, bailing out!")
                    print("Good Key 1: %s" % brewery_key)
                    print(" Bad Key 2: %s" % key_matches.group(1))
                    return
                beer_key = key_matches.group(2)
        if len(links) > 1:
            style_matches = re.search(r'/beer/style/(\d+)', links[1].get('href', ''))
            if style_matches:
                beer_style_key = style_matches.group(1)

        ## Now we can fill our beer info!
        rating = cells[4].encode('utf_8')
        beer_dicts.append({
            "beer_name": cells[0].encode('utf_8'),
            "style_name": cells[1].encode('utf_8'),
            "style_key": beer_style_key,
            "beer_name_key": beer_key,
            "abv": cells[2].encode('utf_8'),
            "avg_score": cells[3].encode('utf_8'),
            # Keep digits and '.' only (drops e.g. thousands separators).
            "ratings_count": ''.join(e for e in rating if e.isdigit() or e == '.'),
            "bros_score": cells[5].encode('utf_8'),
        })

    # Flatten: every beer record also carries the brewery fields. Brewery
    # fields win if a key were ever present in both dicts.
    return_dict_list = []
    for beer in beer_dicts:
        one_beer_dict = dict(beer)
        one_beer_dict.update(brewery_dict)
        return_dict_list.append(one_beer_dict)

    print("Processed: %d beers. Brewery key: %s" % (len(return_dict_list), brewery_key))
    return return_dict_list

In [10]:
# Smoke test: mine a single cached brewery page and display the records it returns.
mine_file('./current_data/11580/brewery_11580.html')

Processed: 19 beers. Brewery key: 11580


[{'abv': '5.40',
  'avg_score': '2.78',
  'beer_name': 'Amber Ale',
  'beer_name_key': '27980',
  'brewery_key': '11580',
  'brewery_name': 'Santa Cruz Mountain Brewing',
  'bros_score': '-',
  'city': 'Santa Cruz',
  'phone': '(831) 425-4900',
  'phone_key': '8314254900',
  'ratings_count': '21',
  'state': 'CA',
  'style_key': '128',
  'style_name': 'American Amber / Red Ale'},
 {'abv': '7.10',
  'avg_score': '3.88',
  'beer_name': 'Black IPA',
  'beer_name_key': '61868',
  'brewery_key': '11580',
  'brewery_name': 'Santa Cruz Mountain Brewing',
  'bros_score': '-',
  'city': 'Santa Cruz',
  'phone': '(831) 425-4900',
  'phone_key': '8314254900',
  'ratings_count': '4',
  'state': 'CA',
  'style_key': '175',
  'style_name': 'American Black Ale'},
 {'abv': '6.00',
  'avg_score': '3.46',
  'beer_name': 'Breakfast Stout',
  'beer_name_key': '95870',
  'brewery_key': '11580',
  'brewery_name': 'Santa Cruz Mountain Brewing',
  'bros_score': '-',
  'city': 'Santa Cruz',
  'phone': '(831) 4

In [11]:
def load_brewery_file_list(filelist):
    '''
    Given a file of newline separated file names, read each file name and
    store it in a list. Return this list.

    Parameters
    ----------
    filelist : str
        Path to a text file holding one file name per line.

    Returns
    -------
    list of str
        The file names in file order, with line endings removed. Blank lines
        are skipped so that they are not passed on as empty file names.
    '''
    # The with-block closes the handle even if reading fails.
    with open(filelist, 'r') as list_file:
        # splitlines() accepts \n, \r\n and \r line endings alike, so no
        # universal-newline open mode is needed.
        lines = list_file.read().splitlines()
    files = [line for line in lines if line]
    print("loaded %d breweries" % len(files))
    return files

In [16]:
def main():
    '''
    Mine every brewery page named in ./lists/local_brewery_list.txt and write
    all beer records to ./clean_data_csv/brewery_information.csv.

    Pages for which mine_file returns nothing (None or an empty list) are
    skipped. If no beer records are collected at all, no CSV file is written.
    '''
    print("called main!")
    file_list = load_brewery_file_list('./lists/local_brewery_list.txt')
    beer_data_list = []
    for beer_file in file_list:
        beers = mine_file(beer_file)
        if beers:
            beer_data_list.extend(beers)

    if not beer_data_list:
        print("no beer records found, nothing written!")
        return

    # Records do not all share the same keys ('phone' and 'phone_key' are only
    # present when a phone number was found), so the header is the union of
    # all keys. Taking it from the first record alone makes DictWriter raise
    # ValueError on any later record with extra keys. Sorting gives a stable
    # column order.
    key_set = set()
    for beer in beer_data_list:
        key_set.update(beer)
    keys = sorted(key_set)

    # 'wb' is the mode the Python 2 csv module expects; the with-block closes
    # the file on exit.
    with open('./clean_data_csv/brewery_information.csv','wb') as output_file:
        # restval='' leaves optional fields empty in records that lack them.
        dict_writer = csv.DictWriter(output_file, keys, restval='')
        dict_writer.writeheader()
        dict_writer.writerows(beer_data_list)

    print("all finished!")

In [17]:
# Entry point: run the whole cleaning pipeline when executed as the main module.
if __name__ == '__main__':
    print("Boilerplate call to main")
    main()

Boilerplate call to main
called main!
loaded 595 breweries
Processed: 0 beers. Brewery key: 42860
Processed: 37 beers. Brewery key: 10046
Processed: 13 beers. Brewery key: 10087
Processed: 35 beers. Brewery key: 1011
Processed: 5 beers. Brewery key: 1012
Processed: 7 beers. Brewery key: 1015
Processed: 12 beers. Brewery key: 1016
Processed: 11 beers. Brewery key: 1018
Processed: 37 beers. Brewery key: 10583
Processed: 4 beers. Brewery key: 10626
Processed: 21 beers. Brewery key: 10707
Processed: 3 beers. Brewery key: 11017
Processed: 3 beers. Brewery key: 11084
Processed: 25 beers. Brewery key: 112
Processed: 11 beers. Brewery key: 11342
Processed: 20 beers. Brewery key: 11466
Processed: 5 beers. Brewery key: 11474
Processed: 19 beers. Brewery key: 11580
Processed: 2 beers. Brewery key: 1196
Processed: 16 beers. Brewery key: 12164
Processed: 17 beers. Brewery key: 12351
Processed: 19 beers. Brewery key: 12402
Processed: 19 beers. Brewery key: 12788
Processed: 36 beers. Brewery key: 130