In [16]:
import json
import os
import urllib
import urllib.parse

import requests

from mysklearn.mypytable import MyPyTable

## Getting More Brewery Data

We need more attributes to train on because our accuracy is too low. We can use the Google Places API to gather more data on the breweries. The goal is to collect each brewery's location (address), price level, and rating. We will then store the data in a CSV file named `places_data.csv` in the `Data` folder.

The first step is to build a list of the unique brewery names. This can be done with the following code.

In [40]:
# Load the cleaned review data and locate the brewery-name column.
fpath_reviews = os.path.join("Data", "beer_reviews_cleaned.csv")
table = MyPyTable().load_from_file(fpath_reviews)
col_index = table.get_column_index("brewery_name")

# Collect every distinct brewery name exactly once, in first-seen order.
# The names are the dict keys; each value is just the placeholder 0.
unique_names = dict.fromkeys((record[col_index] for record in table.data), 0)


Next, we will start querying the API. We need to be careful not to make too many requests, so we issue them in batches (about 1,000 per run of the cell below) and save the results to `Data/places_data.csv` at the end of each run. To avoid repeating requests, breweries that already appear in that file are skipped. The API key is stored in a separate file named `api_key.txt`, which will not be uploaded to GitHub. To run this cell you need your own Google Places API key; otherwise, you can see the results in the `places_data.csv` file.

In [41]:
# Resume from earlier runs: load the rows that were already fetched so those
# breweries are not requested again.
fpath_breweries = os.path.join("Data", "places_data.csv")
places_table = MyPyTable().load_from_file(fpath_breweries)

names_done = places_table.get_column("brewery_name")
names_done_dict = dict.fromkeys(names_done, 0)

# Read the key inside a context manager so the file handle is closed, and
# strip surrounding whitespace (a trailing newline would otherwise be
# appended to the request URL).
with open("api_key.txt", "r") as api_key_file:
    api_key = api_key_file.read().strip()

REQUEST_LIMIT = 1000   # maximum number of API requests issued per run
REQUEST_TIMEOUT = 10   # seconds to wait for a response before giving up
# Attributes copied from the first candidate, in the order they are appended
# to each row after the brewery name.
PLACE_FIELDS = ('formatted_address', 'price_level', 'rating')


def _place_row(name, candidates):
    """Build one places_table row from the candidate list of a response.

    Args:
        name: brewery name that was searched for.
        candidates: list of candidate dicts parsed from the response body.

    Returns:
        [name, formatted_address, price_level, rating]. Only the first
        candidate is used; attributes it lacks (or all of them, when there
        are no candidates) are stored as empty strings.
    """
    best = candidates[0] if candidates else {}
    return [name] + [best.get(field, '') for field in PLACE_FIELDS]


requests_made = 0
try:
    for name in unique_names:
        if name in names_done_dict:
            continue
        # Check the budget *before* issuing a request so that at most
        # REQUEST_LIMIT requests are made per run.
        if requests_made >= REQUEST_LIMIT:
            print("limit reached")
            break

        url = (
            'https://maps.googleapis.com/maps/api/place/findplacefromtext/json?'
            'fields=name%2Cformatted_address%2Cprice_level%2Crating'
            '&input=' + urllib.parse.quote(name)
            + '&inputtype=textquery'
            + '&key=' + api_key
        )

        # The timeout keeps one stalled connection from hanging the cell.
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        requests_made += 1

        if response.status_code != 200:
            # HTTP-level failure: leave the name unrecorded so that a later
            # run retries it.
            continue

        # parse the message body JSON
        json_obj = json.loads(response.text)

        # The Places API reports quota/authorisation problems in the body's
        # "status" field while still answering with HTTP 200. Storing an
        # empty row in that case would mark the brewery as done for good.
        status = json_obj.get('status')
        if status in ('OVER_QUERY_LIMIT', 'REQUEST_DENIED'):
            # Further requests in this run would fail the same way.
            print("stopping early, API status:", status)
            break
        if status not in ('OK', 'ZERO_RESULTS'):
            # e.g. INVALID_REQUEST / UNKNOWN_ERROR: skip now, retry next run.
            continue

        # add row to places_table (not all attributes are available for
        # every brewery; missing ones are stored as '')
        places_table.data.append(_place_row(name, json_obj.get('candidates', [])))
finally:
    # Save whatever was collected, even if a request raised part-way
    # through, so the requests already made are not wasted.
    places_table.save_to_file(fpath_breweries)