This notebook gets the Untappd URL for each brewery in brews/breweries_final.csv.

In [270]:
import pandas as pd
from time import sleep

import requests
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

### Import brewery database

In [198]:
dfBrewery = pd.read_csv('brews/breweries_final.csv')

In [199]:
dfBrewery.head()

Unnamed: 0,brewID,brewery,address,city,state,neighborhood,website,features,coord,coord2,inChicago,latitude,longitude,hasTapRoom,hasTour,hasKitchen
0,1hd3,18th Street Brewery – Hammond,5417 N. Oakley Ave.,Hammond,IN,,http://www.18thstreetbrewery.com/,kitchen & beer,"-87.517525,41.6156",98416802.0,no,41.6156,-87.517525,no,no,yes
1,1hy5,18th Street Brewery – Gary,5725 Miller Ave.,Gary,IN,,http://www.18thstreetbrewery.com/,kitchen & beer,"-87.26886,41.599365",98428122.0,no,41.599365,-87.26886,no,no,yes
2,2W.3,25 West Brewing Co.,2 Stratford Dr.,Bloomingdale,IL,,https://www.facebook.com/25WestBrew/,,"-88.11923,41.950455",112710728.0,no,41.950455,-88.11923,no,no,no
3,3 .2,350 Brewing Co.,7144 W. 183rd St.,Tinley Park,IL,,https://350brewing.com,kitchen & beer,"-87.791595,41.55857",638598086.0,no,41.55857,-87.791595,no,no,yes
4,3l.6,3 Floyds Brewing Co.,9750 Indiana Pkwy.,Munster,IN,,http://www.3floyds.com/,"kitchen & beer, tour","-87.515755,41.536514",98443594.0,no,41.536514,-87.515755,no,yes,yes


In [200]:
dfBrewery['brewery']

0           18th Street Brewery – Hammond
1              18th Street Brewery – Gary
2                     25 West Brewing Co.
3                         350 Brewing Co.
4                    3 Floyds Brewing Co.
5                     5 Rabbit Cervecería
6                       51st Ward Brewing
7                    9th Hour Brewing Co.
8                    All Rise Brewing Co.
9                Afterthought Brewing Co.
10                      Alter Brewing Co.
11                  Andersonville Brewing
12                    Ancient Owl Brewing
13                          ALULU Brewpub
14                                 Aleman
15             Arrowhead Ales Brewing Co.
16                       Alarmist Brewing
17               Around the Bend Beer Co.
18                          Argus Brewery
19                      Byway Brewing Co.
20                    Bosacki's Home Brew
21                BBGB Brewery & Hop Farm
22              Black Horizon Brewing Co.
23                        Band of 

### Scraping Untappd

In [7]:
def simpleGet(url):
    """Attempts to get the content at 'url' by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None"""
    try:
        with closing(requests.get(url, stream=True)) as resp:
            if isGoodResponse(resp):
                return resp.content
            else:
                return None
    
    except RequestException as e:
        logError('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
    
def isGoodResponse(resp):
    """Returns True if the response seems to be HTML"""
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)

def logError(e):
    """Prints error"""
    print(e)

In [8]:
def makeSoup(url):
    """Use simple_get function to make soup from url"""
    try:
        return BeautifulSoup(simpleGet(url), 'html.parser')
    except TypeError as e:
        logError('Non-HTML content at {0} : {1}'.format(url, str(e)))
        return None

Make functions that takes us from brewery name to brewery search page to brewery page.

In [251]:
#Make a dict to treat exceptional cases.
#Breweries not yet on Untappd are mapped to '',
#and others are breweries that need alternative search terms.
nameDict = {'Aleman': 'Aleman Brewing',
            'Bixi Brewery': '',
            'Crown Brewing': 'Crown Brewing Company',
            'Englewood Brews': '',
            'FIBS Brewing': '',
            'Lake Effect Brewing Co.': 'Lake Effect Brewing',
            'McHenry Brewing Company': "Chain O'Lakes Brewing Company",
            'Oak Park Brewing Co.': 'Oak Park Brewing Company (IL)',
            'Strike Ten Brewing Co.': '',
            'Scallywag Brewing': '',
            'Whiner Brewery': ''
            } 

In [278]:
test = '–'

In [280]:
test.encode()

b'\xe2\x80\x93'

In [290]:
str(b'\xe2\x80\x93')

"b'\\xe2\\x80\\x93'"

In [192]:
def searchURL(name):
    """Returns the URL to search Untappd for brewery"""
    #remove parts after '–', replace ' ' by '+'
    nameParsed = name.split(sep='–')[0].strip().replace(' ', '+')
    return 'https://untappd.com/search?q=' + nameParsed + '&type=brewery'

def getBreweryURL(name):
    """Returns the Untappd brewery page URL from name"""
    if name == '': #recursion did not work
        return None
    elif name in nameDict: #convert exceptional cases
        name = nameDict[name]
        
    try:
        url = searchURL(name)
        soup = makeSoup(url)
        #get the result part of the HTML
        result = soup.find("div", {"class": "results-container"})
        #go to the top brewery in the search
        nameTag = result.find("p", {"class": "name"})
        #get the href
        href = name.find("a")['href']
        return 'https://untappd.com' + href
    except AttributeError: #no search result
        #try without the last word (recursion)
        shorterName = ' '.join(name.split()[:-1])
        return getBreweryURL(shorterName)

I need to log in. Define logged in versions of the above functions.

In [230]:
def getSessionKey(session):
    """Get session key from the Untappd login page."""
    login = session.get('http://untappd.com/login')
    soup  = BeautifulSoup(login.content, 'html.parser')
    form  = soup.find('form')
    return form.find('input', attrs={'name': 'session_key'})['value']

In [193]:
def simpleGetL(url, session):
    """Attempts to get the content at 'url' by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None"""
    try:
        with closing(session.get(url, stream=True)) as resp:
            if isGoodResponse(resp):
                return resp.content
            else:
                return None
    
    except RequestException as e:
        logError('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
    
def makeSoupL(url, session):
    """Use simple_get function to make soup from url"""
    try:
        return BeautifulSoup(simpleGetL(url, session), 'html.parser')
    except TypeError as e:
        logError('Non-HTML content at {0} : {1}'.format(url, str(e)))
        return None

def getBreweryURLL(name, session):
    """Returns the Untappd brewery page URL from name"""
    if name == '': #recursion did not work
        return None
    elif name in nameDict: #convert exceptional cases
        name = nameDict[name]
        
    try:
        url = searchURL(name)
        soup = makeSoupL(url, session)
        #get the result part of the HTML
        result = soup.find("div", {"class": "results-container"})
        #go to the top brewery in the search
        nameTag = result.find("p", {"class": "name"})
        #get the href
        href = nameTag.find("a")['href']
        return 'https://untappd.com' + href
    except AttributeError: #no search result
        #try without the last word (recursion)
        shorterName = ' '.join(name.split()[:-1])
        return getBreweryURLL(shorterName, session)

In [82]:
import ast

with open('/Users/satoru/untappdLogin.txt', 'r') as f:
    untappdDict = ast.literal_eval(f.read())

In [248]:
untappdURLDict = {}

with requests.session() as s:
    untappdDict['session_key'] = getSessionKey(s)
    p = s.post('https://untappd.com/login/', data=untappdDict)
    for name in dfBrewery['brewery']:
        sleep(5.)
        untappdURL = getBreweryURLL(name, s)
        print(name, untappdURL)
        untappdURLDict[name] = untappdURL

18th Street Brewery – Hammond https://untappd.com/18thStreetBrewery
18th Street Brewery – Gary https://untappd.com/18thStreetBrewery
25 West Brewing Co. https://untappd.com/w/25-west-brewing-co/391402
350 Brewing Co. https://untappd.com/350Brewing
3 Floyds Brewing Co. https://untappd.com/3floyds
5 Rabbit Cervecería https://untappd.com/5Rabbit
51st Ward Brewing https://untappd.com/51stWardBeer
9th Hour Brewing Co. https://untappd.com/9thHourBrewing
All Rise Brewing Co. https://untappd.com/AllRiseBrewing
Afterthought Brewing Co. https://untappd.com/AfterthoughtBrewing
Alter Brewing Co. https://untappd.com/AlterBrewing_Company
Andersonville Brewing https://untappd.com/AndersonvilleBrewing
Ancient Owl Brewing https://untappd.com/AncientOwlBrewing
ALULU Brewpub https://untappd.com/ALULUBrewPub
Aleman https://untappd.com/Aleman
Arrowhead Ales Brewing Co. https://untappd.com/ArrowheadAles
Alarmist Brewing https://untappd.com/AlarmistBrewing
Around the Bend Beer Co. https://untappd.com/ATB
Arg

Rock Bottom – Lombard https://untappd.com/RockBottom
Rock Bottom – Warrenville https://untappd.com/RockBottom
Rock Bottom – Bolingbrook https://untappd.com/RockBottom
Rock Bottom – Orland Park https://untappd.com/RockBottom
Rock Bottom Restaurant & Brewery https://untappd.com/RockBottom
Revolution Brewing – Brewpub https://untappd.com/RevolutionBrewingChicago
Revolution Brewing – Taproom https://untappd.com/RevolutionBrewingChicago
Roaring Table Brewing https://untappd.com/RoaringTableBrewing
Stockholm's Brewpub https://untappd.com/w/stockholms/12082
Side Lot Brewery https://untappd.com/SideLotBrewery
Solemn Oath Brewery https://untappd.com/solemnoathbeer
Strike Ten Brewing Co. None
Scallywag Brewing None
Sullers Past Brewing https://untappd.com/SullersPast
Smylie Bros. Brewing https://untappd.com/SmylieBrothersBrewingCompany
Small Town Brewery https://untappd.com/SmallTownBrewery
Skeleton Key Brewery https://untappd.com/SkeletonKeyBrewery
Saint John Malt Brothers Brewing https://untap

In [265]:
dfBrewery['untappdURL'] = dfBrewery['brewery'].apply(lambda x: untappdURLDict[x])

In [272]:
dfBrewery.loc[dfBrewery['untappdURL'].isna(), 'untappdURL'] = ''

In [273]:
dfBrewery.to_csv('brews/breweries_final.csv')