In [1]:
# imports
import requests
import re, time
import urllib.request
import urllib.parse
from difflib import SequenceMatcher
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

In [2]:
# links used
# RateBeer brewery listing for Ireland. complete_data() hands this URL to
# parse_table(), which reads the first <table> found on the page.
ratebeer_Ireland_URL = "https://www.ratebeer.com/breweries/ireland/0/100/#closed"

In [38]:
#########################
## scraping tool
# get data from links
def make_soup(url_linker, timeout=30):
    """Download ``url_linker`` and return its body parsed by BeautifulSoup.

    A fixed 0.5 s pause precedes every request so the scraped sites are not
    hit back-to-back.

    Args:
        url_linker: Absolute URL to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely, so a single stalled connection
            would hang the whole scrape.

    Returns:
        The response text parsed with the "html.parser" backend.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            timeout.
    """
    # impose scrape padding
    time.sleep(0.5)
    response = requests.get(url_linker, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")

########################
## parsing & cleaning
# gets table data
def parse_table(URLi, size):
    """Scrape the brewery listing table at ``URLi`` into a list of dicts.

    Only the first ``<table>`` on the page is read. Row 0 is treated as the
    header and skipped; rows 1 through ``size`` are parsed. For every parsed
    row the brewery's own page is fetched through ``get_brewery_url``.

    Args:
        URLi: URL of the listing page.
        size: Index of the last table row to read, i.e. the maximum number
            of data rows returned.

    Returns:
        A list of dicts with the keys 'name', 'region', 'type', 'url',
        'twitter' and 'facebook'. Rows that carry no name cell or no link
        are left out.

    Raises:
        IndexError: if the page contains no ``<table>``.
    """
    print("Starting...")
    page = make_soup(URLi)
    page_table = page.find_all('table')[0]

    data = []
    for i, row in enumerate(page_table.select('tr')):
        if i > size:
            break
        # skip the header row before doing any work (or any HTTP request)
        if i == 0:
            continue

        # Reset per row: values must never carry over from the previous row.
        brewery_name = None
        brewery_region = None
        # The row index is kept as the first field, as the type lookup
        # below falls back to field 0 when nothing matches.
        cell_texts = [str(i)]
        for cell in row.select('td'):
            markup = str(cell)
            if "<br/>" in markup:
                # name cell: "<name><br/><region> - <...>"
                pieces = markup.split("<br/>")
                brewery_name = BeautifulSoup(pieces[0], "html.parser").text
                region_text = BeautifulSoup(pieces[1], "html.parser").text
                brewery_region = re.split(" - ", region_text)[0]
            else:
                cell_texts.append(re.sub(r"\s\s+", " ", cell.text))

        # the first link in the row points at the brewery's detail page
        anchor = row.find('a', href=True)
        if brewery_name is None or anchor is None:
            # not a brewery row; nothing reliable to record
            continue
        url_data = get_brewery_url(anchor['href'], brewery_name)

        # Joining and re-splitting on ", " means a cell that itself contains
        # ", " contributes several fields.
        fields = ", ".join(cell_texts).split(", ")
        # Field 0 is always the numeric row index, so the type is always
        # located by searching for "rew" (Brewery, Brewpub, Microbrewery...).
        brewery_type = fields[find_brew_type(fields)]

        data.append({
            'name': brewery_name,
            'region': brewery_region,
            'type': brewery_type,
            'url': url_data['url'],
            'twitter': url_data['twitter'],
            'facebook': url_data['facebook'],
        })

    print("table data: done...")
    return data

# gets brewery url
def get_brewery_url(url_ending, brew_name):
    """Guess a brewery's website, Twitter and Facebook links from its page.

    Every ``<a href>`` on the brewery's RateBeer page is stripped of its
    scheme, "www." and spaces, then fuzzy-compared (``url_similarity``)
    against candidate addresses built from the cleaned brewery name. For each
    category the first link that clears the threshold wins.

    Args:
        url_ending: Path of the brewery page; appended to
            "https://www.ratebeer.com".
        brew_name: Brewery name as shown in the listing. Spaces, hyphens and
            the word "Brewery" are removed before matching.

    Returns:
        A dict with the keys 'url', 'twitter' and 'facebook'. A category with
        no match keeps its placeholder ("www.google.com", "www.twitter.com",
        "www.facebook.com"). Matched values are the stripped link text, i.e.
        without scheme or "www.".
    """
    time.sleep(0.5)
    work_url = "https://www.ratebeer.com" + url_ending

    # clean name
    brew_name = str(brew_name)
    for unwanted in (" ", "-", "Brewery"):
        brew_name = brew_name.replace(unwanted, "")

    # candidate suffixes, tried in order
    url_endings = ('brewing.ie', 'brewing.com', 'brewery.com', 'brewery.ie',
                   'beer.com', 'beer.ie', '.com', '.ie')
    social_endings = ('', 'brew', 'brewery', 'brewing', 'beer', 'craft')

    # Placeholders are assigned up front so a page without any links still
    # yields a complete dict instead of raising on an unbound name.
    url_content = {
        'url': "www.google.com",
        'twitter': "www.twitter.com",
        'facebook': "www.facebook.com",
    }

    # category -> (candidate strings, similarity threshold); entries are
    # removed as soon as the category has been matched
    pending = {
        'url': ([brew_name + end for end in url_endings], 0.75),
        'twitter': (["twitter.com/" + brew_name + end
                     for end in social_endings], 0.9),
        'facebook': (["facebook.com/" + brew_name + end
                      for end in social_endings], 0.9),
    }

    for link in make_soup(work_url).find_all('a'):
        # stop when all found
        if not pending:
            break
        href = link.get('href')
        if href is None:
            continue
        # Strip noise. Plain replace: the old regex "www." treated the dot
        # as a wildcard and could eat an unrelated character.
        lk = str(href)
        for noise in ("www.", "https://", "http://", " "):
            lk = lk.replace(noise, "")

        for category in list(pending):
            candidates, threshold = pending[category]
            if any(url_similarity(candidate, lk) > threshold
                   for candidate in candidates):
                url_content[category] = lk
                del pending[category]

    return url_content


###########################
## cleaning methods
# url similarity measure
def url_similarity(url_a, url_b):
    """Return the difflib similarity ratio of two strings.

    The result lies between 0.0 (nothing in common) and 1.0 (identical).
    """
    matcher = SequenceMatcher(None, url_a, url_b)
    return matcher.ratio()

# cleans address
def parseAddress(address):
    """Strip the ``"langaddress": `` key, quotes and padding from a line.

    Args:
        address: One line of text holding a ``"langaddress": "..."`` entry.

    Returns:
        The line with the key prefix removed, every double quote removed and
        every run of two or more whitespace characters deleted (not
        collapsed to a single space). Single spaces and a trailing comma are
        kept.
    """
    without_key = address.replace('"langaddress": ', '')
    without_quotes = without_key.replace('"', '')
    # raw string: "\s" in a normal literal is an invalid escape sequence
    return re.sub(r"\s\s+", "", without_quotes)

# converts string to float
def parseFloat(digit):
    """Return the first decimal number found in ``digit`` as a float.

    A leading minus sign is kept, so negative coordinates stay negative.
    Only numbers with a decimal point count; a bare integer is ignored so
    that stray digits in free text are not mistaken for a coordinate.

    Args:
        digit: Text to search.

    Returns:
        The first match converted to float, or 0.0 when there is none.
    """
    match = re.search(r"-?\d+\.\d+", digit)
    if match is None:
        return 0.0
    return float(match.group())

# removes unwanted data from html
def find_between(s, first, last):
    """Return the part of ``s`` between ``first`` and the next ``last``.

    The search for ``last`` begins right after the first occurrence of
    ``first``. An empty string is returned when either marker is missing.
    """
    begin = s.find(first)
    if begin == -1:
        return ""
    begin += len(first)
    stop = s.find(last, begin)
    if stop == -1:
        return ""
    return s[begin:stop]

# gets type from array
def find_brew_type(look_up):
    """Return the index of the last entry in ``look_up`` containing "rew".

    Falls back to 0 when no entry matches (or ``look_up`` is empty).
    """
    hits = [pos for pos, entry in enumerate(look_up) if "rew" in entry]
    return hits[-1] if hits else 0

#########################
# creates dictionary
def pull_data(new_str):
    """Pick the address and coordinates out of a Nominatim result snippet.

    The snippet is scanned line by line. When a key occurs on several lines
    the last occurrence wins.

    Args:
        new_str: Text with one ``"key": value`` entry per line, as produced
            by ``make_data``.

    Returns:
        A dict with 'address' ("none" if absent), 'long' and 'lati' (0.0 if
        absent).
    """
    ads = "none"
    lon = 0.0
    lat = 0.0

    for line in new_str.split("\n"):
        if "langaddress" in line:
            ads = parseAddress(line)

        # Match the quoted keys, not bare substrings: "lon" and "lat" also
        # occur inside address text (e.g. "Athlone", "Flat"), which used to
        # overwrite the coordinates.
        if '"lon"' in line:
            lon = parseFloat(line)

        if '"lat"' in line:
            lat = parseFloat(line)

    return {
        "address": ads,
        "long": lon,
        "lati": lat,
    }

def get_loc(name, area):
    """Look a brewery up through Google Places and return its location.

    Searches within 25 km of "<area>, Ireland" using ``name`` as the keyword
    and uses the first place returned.

    Args:
        name: Brewery name, used as the search keyword.
        area: Town or region the brewery is in.

    Returns:
        A dict with 'address' ("<place name>,<area>,Ireland"), 'long' and
        'lati', or None when the search returns nothing usable. Callers fall
        back to another geocoder on None.
    """
    from googleplaces import GooglePlaces

    # NOTE(review): this credential is committed in source. It should be
    # rotated and loaded from configuration instead.
    YOUR_API_KEY = "AIzaSyDFK8QRiUl8jx5YYQwDMQ31GMyXwXz-et8"
    google_places = GooglePlaces(YOUR_API_KEY)
    query_result = google_places.nearby_search(
        location=str(area) + ', Ireland',
        keyword=str(name),
        radius=25000,
    )

    places = query_result.places
    if not places:
        return None

    # Only the first hit is considered. The previous check was inverted
    # ("is None"), so a result was never returned, and the coordinates were
    # taken from the last place instead of the one supplying the name.
    place = places[0]
    if place.name is None:
        return None

    loc_data = place.geo_location
    return {
        "address": place.name + "," + str(area) + ",Ireland",
        # float() keeps these the same type as pull_data's values;
        # presumably the library hands back Decimal or float - TODO confirm
        "long": float(loc_data['lng']),
        "lati": float(loc_data['lat']),
    }
        
########################
# data entry prep
# gets long lats from nominatim
def make_data(url_link):
    """Fetch a Nominatim search page and cut out its first result snippet.

    Spaces in ``url_link`` are turned into "+" before the request. The text
    between "var nominatim_results = [" and the first '"importance' in the
    page's first <script> tag is returned with " }" appended, so the result
    is " }" when the markers are not found.
    """
    query_url = url_link.replace(" ", "+")
    first_script = make_soup(query_url).find_all('script')[0]
    snippet = find_between(str(first_script), "var nominatim_results = [", "\"importance")
    return snippet + " }"

# cleans and formats
def complete_data(size):
    """Scrape the brewery table and attach an address and coordinates.

    Each brewery from ``parse_table`` is located with ``get_loc`` first. When
    that yields nothing, Nominatim is queried for "<name>, <town>, Ireland"
    and, if that comes back empty, for "<town>, Ireland".

    Args:
        size: Row limit forwarded to ``parse_table``.

    Returns:
        A list of dicts with the keys 'name', 'region', 'address', 'type',
        'lati', 'long', 'url', 'twitter', 'facebook', 'tour' and 'merch'.
        'tour' and 'merch' are always True.
    """
    # url formatting
    site_url = "https://nominatim.openstreetmap.org/search.php?q="
    country = "Ireland"
    ending = "&polygon_geojson=1&viewbox="
    spacing = "%2C"  # URL-encoded comma

    # parse table data
    table_data = parse_table(ratebeer_Ireland_URL, size)

    brew_data = []
    for z, d in enumerate(table_data):
        # extra pause on every fifth record to go easy on the remote services
        if z % 5 == 0:
            time.sleep(1.5)

        brewery_name = d['name']
        brewery_town = re.split(" - ", d['region'])[0]

        cut_data = get_loc(brewery_name, brewery_town)
        if cut_data is None:
            # Failsafe: geocode through Nominatim. The pieces are
            # URL-encoded and comma-separated; name and town used to be
            # glued together raw, which corrupted the query for names
            # containing "&", "#" and similar.
            name_q = urllib.parse.quote_plus(brewery_name)
            town_q = urllib.parse.quote_plus(brewery_town)
            URLA = site_url + name_q + spacing + town_q + spacing + country + ending
            URLB = site_url + town_q + spacing + country + ending
            newSTR = make_data(URLA)
            # make_data returns just " }" when nothing was found
            if len(newSTR) < 5:
                newSTR = make_data(URLB)
            cut_data = pull_data(newSTR)

        address_data = cut_data['address'].split(",")
        # create dict for db entry
        brew_data.append({
            'name': brewery_name,
            'region': address_data[0],
            'address': cut_data['address'],
            'type': d['type'],
            'lati': cut_data['lati'],
            'long': cut_data['long'],
            'url': d['url'],
            # plain string: a stray trailing comma used to wrap this value
            # in a 1-tuple
            'twitter': d['twitter'],
            'facebook': d['facebook'],
            'tour': True,
            'merch': True,
        })

    print("complete data: done...")
    return brew_data

In [39]:
# Run the whole pipeline; 150 is forwarded to parse_table() as its row limit.
brew_data = complete_data(150)

Starting...
table data: done...
complete data: done...


In [14]:
def write_file(data):
    """Write every item of ``data`` to json_data_base.txt, one per line.

    Items are written as ``str(item)``, i.e. the Python repr of a dict and
    not strict JSON despite the file name. An existing file is overwritten.

    Args:
        data: Iterable of records to write.

    Returns:
        The status string "...finished writing data."
    """
    # "with" closes the file even if a write fails. The explicit encoding
    # keeps non-ASCII names from depending on the platform default.
    with open('json_data_base.txt', 'w', encoding='utf-8') as wr_file:
        wr_file.writelines(str(d) + "\n" for d in data)
    return "...finished writing data."
    
# Write each record in brew_data on its own line of json_data_base.txt.
write_file(brew_data)

'...finished writing data.'