# Geo and Reverse_Gecoding using Pygeocoder library

##### Pygeocoder documentation: 
+  https://chrisalbon.com/python/data_wrangling/geocoding_and_reverse_geocoding/
+  https://bitbucket.org/xster/pygeocoder/wiki/Home 

##### Data file Source:  Ny Citi Bike Trip Histories from https://www.citibikenyc.com/system-data

##### Files Extracted:

|  File Name | URL  |
|---|---|
| 202001-citibike-tripdata.csv.zip  |  https://s3.amazonaws.com/tripdata/202001-citibike-tripdata.csv.zip |




In [1]:
import pandas as pd
from pygeocoder import Geocoder
import numpy as np
from config import api_key
#

In [2]:
#google_api

## A. Process Jan 2020 bike file and add values

### A-1.  Read the Bike File(s)

In [2]:
#bike_file = "citi-bike-data/202001-citibike-tripdata.csv"
wine_file = "winerygeodata.csv"
wine_df = pd.read_csv(wine_file)
wine_df.head()

FileNotFoundError: [Errno 2] File b'winerygeodata.csv' does not exist: b'winerygeodata.csv'

In [4]:
bike_df.shape

(49, 15)

## B.  Station Table

### B-1.  Create a table of station id's and names from start and end stations

#### extract a start stations df

In [5]:
station1_df = bike_df[['start station id', 'start station name', 'start station latitude',  'start station longitude']]
station1_df.columns = ['station id', 'station name', 'latitude',  'longitude']
station1_df.head()

Unnamed: 0,station id,station name,latitude,longitude
0,504,1 Ave & E 16 St,40.732219,-73.981656
1,3423,West Drive & Prospect Park West,40.661063,-73.979453
2,3687,E 33 St & 1 Ave,40.743227,-73.974498
3,346,Bank St & Hudson St,40.736529,-74.00618
4,372,Franklin Ave & Myrtle Ave,40.694546,-73.958014


In [6]:
station1_df.shape

(49, 4)

#### extract a end stations df

In [7]:
station2_df = bike_df[['end station id', 'end station name', 'end station latitude', 'end station longitude']]
station2_df.columns = ['station id', 'station name', 'latitude',  'longitude']
station2_df.head()

Unnamed: 0,station id,station name,latitude,longitude
0,307,Canal St & Rutgers St,40.714275,-73.9899
1,3300,Prospect Park West & 8 St,40.665147,-73.976376
2,259,South St & Whitehall St,40.701221,-74.012342
3,490,8 Ave & W 33 St,40.751551,-73.993934
4,3637,Fulton St & Waverly Ave,40.683239,-73.965996


In [8]:
station2_df.shape

(49, 4)

In [9]:
# station1_df['station id'].value_counts()

#### Concat start and end station df's to form 1 list

In [10]:
station_df = pd.concat([station1_df, station2_df], ignore_index=True)

In [11]:
station_df.head()

Unnamed: 0,station id,station name,latitude,longitude
0,504,1 Ave & E 16 St,40.732219,-73.981656
1,3423,West Drive & Prospect Park West,40.661063,-73.979453
2,3687,E 33 St & 1 Ave,40.743227,-73.974498
3,346,Bank St & Hudson St,40.736529,-74.00618
4,372,Franklin Ave & Myrtle Ave,40.694546,-73.958014


In [12]:
station_df.shape

(98, 4)

#### need to reduce to 1 record per station

In [13]:
station_df['station id'].value_counts()

3349    5
513     3
514     3
157     2
3506    2
       ..
3160    1
3310    1
2012    1
3165    1
128     1
Name: station id, Length: 78, dtype: int64

In [14]:
#### check a value (there many hits)
station_df[station_df['station id'] == 519]

Unnamed: 0,station id,station name,latitude,longitude


#### drop duplictes

In [15]:
# drop duplicates 
station_df.drop_duplicates(subset=['station id', 'station name'], keep='first', inplace=True)
station_df.reset_index(drop=True, inplace=True)
station_df['station id'].value_counts()

157     1
405     1
3236    1
3494    1
3368    1
       ..
3310    1
2012    1
3165    1
478     1
128     1
Name: station id, Length: 78, dtype: int64

In [16]:
station_df.shape

(78, 4)

In [17]:
#### check a value - only 1 value
station_df[station_df['station id'] == 519]

Unnamed: 0,station id,station name,latitude,longitude


In [18]:
#station_df.tail()
station_df.loc[0:1]


Unnamed: 0,station id,station name,latitude,longitude
0,504,1 Ave & E 16 St,40.732219,-73.981656
1,3423,West Drive & Prospect Park West,40.661063,-73.979453


### B-2.  Add zip code and other loction data to the station data using reverse geocoder and lat/long data

In [19]:
def reverseGeo(lat, lon) :
    
    results = Geocoder(google_api).reverse_geocode(lat, lon)
#     geoDict = {"zipcode" : results.postal_code, "burrough" : results.administrative_area_level_2}
    
    return(results.postal_code, 
           results.sublocality, 
           results.city, 
           results.administrative_area_level_2, 
           results.administrative_area_level_1,
           results.formatted_address
          )

In [20]:
test_df = station_df[0:1]
test_df

Unnamed: 0,station id,station name,latitude,longitude
0,504,1 Ave & E 16 St,40.732219,-73.981656


In [21]:
## test the function 

test_df = pd.DataFrame(station_df[1:2])

test_df[["zipcode", "burrough", "city", "county", "state", "address"]] =test_df.apply(lambda x: pd.Series(reverseGeo(x.latitude, x.longitude)), axis=1)

test_df

Unnamed: 0,station id,station name,latitude,longitude,zipcode,burrough,city,county,state,address
1,3423,West Drive & Prospect Park West,40.661063,-73.979453,11215,Brooklyn,,Kings County,New York,"West Drive & Prospect Park West, Brooklyn, NY ..."


In [22]:
station_df[["zipcode", "burrough", "city", "county", "state", "address"]] = station_df.apply(lambda x: pd.Series(reverseGeo(x.latitude, x.longitude)), axis=1)

In [23]:
station_df

Unnamed: 0,station id,station name,latitude,longitude,zipcode,burrough,city,county,state,address
0,504,1 Ave & E 16 St,40.732219,-73.981656,10009,Manhattan,New York,New York County,New York,"1 Ave & E 16 St, New York, NY 10009, USA"
1,3423,West Drive & Prospect Park West,40.661063,-73.979453,11215,Brooklyn,,Kings County,New York,"West Drive & Prospect Park West, Brooklyn, NY ..."
2,3687,E 33 St & 1 Ave,40.743227,-73.974498,10016,Manhattan,New York,New York County,New York,"E 33 St & 1 Ave, New York, NY 10016, USA"
3,346,Bank St & Hudson St,40.736529,-74.006180,10014,Manhattan,New York,New York County,New York,"Bank St & Hudson St, New York, NY 10014, USA"
4,372,Franklin Ave & Myrtle Ave,40.694546,-73.958014,11205,Brooklyn,,Kings County,New York,"Franklin Ave & Myrtle Ave, Brooklyn, NY 11205,..."
...,...,...,...,...,...,...,...,...,...,...
73,472,E 32 St & Park Ave,40.745712,-73.981948,10016,Manhattan,New York,New York County,New York,"E 32 St & Park Ave, New York, NY 10016, USA"
74,387,Centre St & Chambers St,40.712733,-74.004607,10007,Manhattan,New York,New York County,New York,"Centre St & Chambers St, New York, NY 10007, USA"
75,362,Broadway & W 37 St,40.751726,-73.987535,10018,Manhattan,New York,New York County,New York,"Broadway & W 37 St, New York, NY 10018, USA"
76,3416,7 Ave & Park Pl,40.677615,-73.973243,11217,Brooklyn,,Kings County,New York,"7 Ave & Park Pl, Brooklyn, NY 11217, USA"


### experiment with google geocder. api

#### test the geocoder reverse_gecode API

In [24]:
lat = bike_df['start station latitude'][0]
lon = bike_df['start station longitude'][0]

print(f"Lat: {lat}, Lon: {lon}")

Lat: 40.73221853, Lon: -73.98165557


In [25]:
results = Geocoder(google_api).reverse_geocode(lat, lon)

In [26]:
results

<pygeolib.GeocoderResult at 0x20c4bca5080>

In [27]:
#results.raw

In [28]:
# results.formatted_address

In [29]:
# results.city

In [30]:
# burrough
# results.sublocality

In [31]:
# results.postal_code

In [32]:
# results.county

In [33]:
# county
# results.administrative_area_level_2

In [34]:
# state
# results.administrative_area_level_1

In [35]:
# results.country

In [36]:
# row_df = station_df.loc[0 : 0]
# row_df

In [37]:
# geoSeries = row_df.apply(lambda x: pd.Series(reverseGeo(x.latitude, x.longitude)), axis=1)

In [38]:
# geoSeries

In [39]:
# row_df[["zipcode", "burrough", "city", "county", "state"]] = row_df.apply(lambda x: pd.Series(reverseGeo(x.latitude, x.longitude)), axis=1)

In [40]:
# row_df

#### test the geopcode call  Get lat lon from address, ... 

In [41]:
### test the geocoder api to get coordinates from address 

#address = "4207 N Washington Ave, Douglas, AZ 85607"
address = '1 Ave & E 16 St, New York, NY 10009, USA'
results = Geocoder(google_api).geocode(address)

coords = results[0].coordinates
print(coords)
print(results[0])
print(type(coords))

lat = coords[0]
lon = coords[1]
print(lat, lon)
#geo_AddressToLatLon(address)
results.coordinates

(40.7326656, -73.9815974)
1st Avenue & E 16th St, New York, NY 10003, USA
<class 'tuple'>
40.7326656 -73.9815974


(40.7326656, -73.9815974)

In [42]:
## Functionto call geocode api
def geo_AddressToLatLon(address) : 
    ## geocode call to get lat, lon coordinates from address
    results = Geocoder(google_api).geocode(address)
    
    return (results.coordinates[0], results.coordinates[1])
                
    

In [43]:
## lambda function to get lat, lon coords across a dataframe
#test_df[["new_lat", "new_lon"]] = test_df.apply(lambda x: pd.Series(geo_AddressToLatLon(x.address)), axis=1)

station_df[["new_lat", "new_lon"]] = station_df.apply(lambda x: pd.Series(geo_AddressToLatLon(x.address)), axis=1)

station_df

Unnamed: 0,station id,station name,latitude,longitude,zipcode,burrough,city,county,state,address,new_lat,new_lon
0,504,1 Ave & E 16 St,40.732219,-73.981656,10009,Manhattan,New York,New York County,New York,"1 Ave & E 16 St, New York, NY 10009, USA",40.732666,-73.981597
1,3423,West Drive & Prospect Park West,40.661063,-73.979453,11215,Brooklyn,,Kings County,New York,"West Drive & Prospect Park West, Brooklyn, NY ...",40.668216,-73.973828
2,3687,E 33 St & 1 Ave,40.743227,-73.974498,10016,Manhattan,New York,New York County,New York,"E 33 St & 1 Ave, New York, NY 10016, USA",40.743095,-73.974015
3,346,Bank St & Hudson St,40.736529,-74.006180,10014,Manhattan,New York,New York County,New York,"Bank St & Hudson St, New York, NY 10014, USA",40.736556,-74.005835
4,372,Franklin Ave & Myrtle Ave,40.694546,-73.958014,11205,Brooklyn,,Kings County,New York,"Franklin Ave & Myrtle Ave, Brooklyn, NY 11205,...",40.694358,-73.957998
...,...,...,...,...,...,...,...,...,...,...,...,...
73,472,E 32 St & Park Ave,40.745712,-73.981948,10016,Manhattan,New York,New York County,New York,"E 32 St & Park Ave, New York, NY 10016, USA",40.745780,-73.982219
74,387,Centre St & Chambers St,40.712733,-74.004607,10007,Manhattan,New York,New York County,New York,"Centre St & Chambers St, New York, NY 10007, USA",40.713163,-74.004120
75,362,Broadway & W 37 St,40.751726,-73.987535,10018,Manhattan,New York,New York County,New York,"Broadway & W 37 St, New York, NY 10018, USA",40.752177,-73.987537
76,3416,7 Ave & Park Pl,40.677615,-73.973243,11217,Brooklyn,,Kings County,New York,"7 Ave & Park Pl, Brooklyn, NY 11217, USA",40.677716,-73.973249


### Write out tranformed data files to read into Tableau

In [44]:
station_filepath = "data/station_data.csv"
station_df.to_csv(station_filepath, header=True, index=False)

In [45]:
station_df

Unnamed: 0,station id,station name,latitude,longitude,zipcode,burrough,city,county,state,address,new_lat,new_lon
0,504,1 Ave & E 16 St,40.732219,-73.981656,10009,Manhattan,New York,New York County,New York,"1 Ave & E 16 St, New York, NY 10009, USA",40.732666,-73.981597
1,3423,West Drive & Prospect Park West,40.661063,-73.979453,11215,Brooklyn,,Kings County,New York,"West Drive & Prospect Park West, Brooklyn, NY ...",40.668216,-73.973828
2,3687,E 33 St & 1 Ave,40.743227,-73.974498,10016,Manhattan,New York,New York County,New York,"E 33 St & 1 Ave, New York, NY 10016, USA",40.743095,-73.974015
3,346,Bank St & Hudson St,40.736529,-74.006180,10014,Manhattan,New York,New York County,New York,"Bank St & Hudson St, New York, NY 10014, USA",40.736556,-74.005835
4,372,Franklin Ave & Myrtle Ave,40.694546,-73.958014,11205,Brooklyn,,Kings County,New York,"Franklin Ave & Myrtle Ave, Brooklyn, NY 11205,...",40.694358,-73.957998
...,...,...,...,...,...,...,...,...,...,...,...,...
73,472,E 32 St & Park Ave,40.745712,-73.981948,10016,Manhattan,New York,New York County,New York,"E 32 St & Park Ave, New York, NY 10016, USA",40.745780,-73.982219
74,387,Centre St & Chambers St,40.712733,-74.004607,10007,Manhattan,New York,New York County,New York,"Centre St & Chambers St, New York, NY 10007, USA",40.713163,-74.004120
75,362,Broadway & W 37 St,40.751726,-73.987535,10018,Manhattan,New York,New York County,New York,"Broadway & W 37 St, New York, NY 10018, USA",40.752177,-73.987537
76,3416,7 Ave & Park Pl,40.677615,-73.973243,11217,Brooklyn,,Kings County,New York,"7 Ave & Park Pl, Brooklyn, NY 11217, USA",40.677716,-73.973249
