In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

In [2]:
import requests

page = 1
responses = []
url = "https://api.openbrewerydb.org/v1/breweries"
payload = {'page': page, 'per_page': 125, 'country': 'United States'}
headers = {}

# iterate over 50 pages pulling 125 records for each page
while page < 50:
    page = page + 1
    response = requests.request("GET", url, headers=headers, data=payload).json()
    responses.extend(response)
# convert the list of responses to a pandas dataframe
frames = []
for r in responses:
    frames.append(r)
breweries = pd.DataFrame(frames)
# save the dataframe to memory as a csv
gdf = gpd.GeoDataFrame(breweries,
                       geometry=gpd.points_from_xy(
                           x=breweries.longitude,
                           y=breweries.latitude,
                           crs='EPSG:4326'))
print(f"no. of rows: {len(gdf)} rows")

no. of rows: 6125 rows


In [3]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 6125 entries, 0 to 6124
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   id              6125 non-null   object  
 1   name            6125 non-null   object  
 2   brewery_type    6125 non-null   object  
 3   address_1       5733 non-null   object  
 4   address_2       49 non-null     object  
 5   address_3       0 non-null      object  
 6   city            6125 non-null   object  
 7   state_province  6125 non-null   object  
 8   postal_code     6125 non-null   object  
 9   country         6125 non-null   object  
 10  longitude       4508 non-null   object  
 11  latitude        4508 non-null   object  
 12  phone           5537 non-null   object  
 13  website_url     4998 non-null   object  
 14  state           6125 non-null   object  
 15  street          5733 non-null   object  
 16  geometry        6125 non-null   geometry
dtypes: geo

In [4]:
gdf.isnull().sum()

id                   0
name                 0
brewery_type         0
address_1          392
address_2         6076
address_3         6125
city                 0
state_province       0
postal_code          0
country              0
longitude         1617
latitude          1617
phone              588
website_url       1127
state                0
street             392
geometry             0
dtype: int64

In [5]:
def cleaner(gdf:gpd.GeoDataFrame):
    
    def drop_unwanted(gdf:gpd.GeoDataFrame, cols:list):
        return gdf.drop(columns=cols, axis=1)
    
    cols = ['address_1', 'address_2', 'address_3', 'id', 'website_url', 'phone', 'state_province']
    gdf = drop_unwanted(gdf=gdf, cols=cols)
    
    def remove_nan(gdf=gpd.GeoDataFrame):
        return gdf.dropna(axis=0)
    
    gdf = remove_nan(gdf=gdf)
    
    def tweak_dtypes(gdf:gpd.GeoDataFrame, type_dict:dict):
        return gdf.astype(type_dict)
    
    type_dictionary = {
        'name': 'string',
        'brewery_type': 'string',
        'city': 'string',
        'postal_code': 'string',
        'country': 'string',
        'longitude': 'float64',
        'latitude': 'float64',
        'state': 'string',
        'street': 'string',
        'geometry': 'geometry'
    }
    
    gdf = tweak_dtypes(gdf=gdf, type_dict=type_dictionary)
        
    print(f"rows and columns count: {gdf.shape}")
    print(f"list of columns: {list(gdf.columns)}")
    print(f"no. rows with missing values: {gdf.isnull().sum()}")
    print(gdf.dtypes)
    
    return gdf
    
gdf = cleaner(gdf=gdf)
    

rows and columns count: (4312, 10)
list of columns: ['name', 'brewery_type', 'city', 'postal_code', 'country', 'longitude', 'latitude', 'state', 'street', 'geometry']
no. rows with missing values: name            0
brewery_type    0
city            0
postal_code     0
country         0
longitude       0
latitude        0
state           0
street          0
geometry        0
dtype: int64
name            string[python]
brewery_type    string[python]
city            string[python]
postal_code     string[python]
country         string[python]
longitude              float64
latitude               float64
state           string[python]
street          string[python]
geometry              geometry
dtype: object


In [6]:
gdf

Unnamed: 0,name,brewery_type,city,postal_code,country,longitude,latitude,state,street,geometry
0,(405) Brewing Co,micro,Norman,73069-8224,United States,-97.468182,35.257389,Oklahoma,1716 Topeka St,POINT (-97.46818 35.25739)
2,10 Barrel Brewing Co,large,San Diego,92101-6618,United States,-117.129593,32.714813,California,1501 E St,POINT (-117.12959 32.71481)
3,10 Barrel Brewing Co,large,Bend,97701-9847,United States,-121.281706,44.086835,Oregon,62970 18th St,POINT (-121.28171 44.08684)
4,10 Barrel Brewing Co,large,Bend,97703-2465,United States,-121.328802,44.057565,Oregon,1135 NW Galveston Ave Ste B,POINT (-121.32880 44.05756)
5,10 Barrel Brewing Co,large,Portland,97209-2620,United States,-122.685506,45.525979,Oregon,1411 NW Flanders St,POINT (-122.68551 45.52598)
...,...,...,...,...,...,...,...,...,...,...
6119,49th State Brewing Co,micro,Healy,99743,United States,-149.017877,63.864759,Alaska,248.4 Parks Hwy 5 Mile,POINT (-149.01788 63.86476)
6120,49th State Brewing Co - Anchorage,brewpub,Anchorage,99501-2104,United States,-149.895820,61.219737,Alaska,717 W 3rd Ave,POINT (-149.89582 61.21974)
6121,4B's Brewery,brewpub,Cedaredge,81413-3339,United States,-107.925782,38.900614,Colorado,215 W Main St,POINT (-107.92578 38.90061)
6123,4kd Crick Brewery,brewpub,Defiance,43512-1718,United States,-84.341647,41.296252,Ohio,211 Carpenter Rd,POINT (-84.34165 41.29625)


In [7]:
gdf.to_file('breweries_US_clean.shp')

  gdf.to_file('breweries_US_clean.shp')


Cleaning DONE!!