In [13]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [14]:
## Starting points to analyze access FROM
## in this case, population weighted 
## US Census tracts
origin_points = gpd.read_file('./data/contiguousUS_PopCentroid_Tracts_2010.gpkg')

## Destinations to analyize access TO
## in this case, US pharmacies
## Read in the CSV, then parse into 
## geo via gpd
df = pd.read_csv('../../data_raw/pharmacies_2019.csv')
destination_points = gpd.GeoDataFrame(
    df.drop(['Longitude', 'Latitude'], axis=1),
    crs={'init': 'epsg:4326'},
    geometry=[Point(xy) for xy in zip(df.Longitude, df.Latitude)])

## The unit of geography for our transit
## matrix, in this case tracts
destination_geographies = gpd.read_file('./data/contiguousUS_Tracts_2010.gpkg')

## The OD matrix, with pre-computed time
## or distance (aka travel cost) to and
## from relevant destinations
transit_matrix = pd.read_parquet('F:\\CSDS\\TransitMatrix\\US-matrix-TRACT-DRIVING.parquet')

  return _prepare_from_string(" ".join(pjargs))


In [25]:
## Filter the transit matrix for only relevant geographies
transit_matrix = transit_matrix[transit_matrix.origin.isin(origin_points.GEOID)]

In [19]:
## spatially join the destinations and geographies so
## we can estimate time between
merged_destinations = gpd.sjoin(destination_points, destination_geographies[['GEOID10', 'geometry']], how='inner', op='intersects')

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: +init=epsg:4326 +type=crs
Right CRS: EPSG:4326

  merged_destinations = gpd.sjoin(destination_points, destination_geographies[['GEOID10', 'geometry']], how='inner', op='intersects')


In [38]:
## Pull out the simplified columns we need for the analysis
## Type correctly to merge
destinations_simplified = merged_destinations[['index_right','GEOID10']]
destinations_simplified['GEOID10'] = destinations_simplified['GEOID10'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  destinations_simplified['GEOID10'] = destinations_simplified['GEOID10'].astype('int64')


In [39]:
## Merge onto the transit matrix, giving us the distance from each origin
## to each destination
merge_transit_matrix = transit_matrix.merge(destinations_simplified, left_on="destination", right_on="GEOID10")
merge_transit_matrix.head()

<hr/>

In [57]:
## Analayis time
## start by declaring some variables

origin_col = 'origin'
destination_id_col = 'index_right'
travel_cost_col = 'minutes'
travel_threshold = 30

In [84]:
## clean up this weird bug
## then merge the data


travel_costs = merge_transit_matrix.sort_values(travel_cost_col, ascending=True)
travel_costs.minutes = travel_costs.minutes.replace(-1000, 999)
travel_costs.origin = travel_costs.origin.astype('int64')
travel_costs.head()

Unnamed: 0,origin,destination,minutes,index_right,GEOID10
92467623,53073011000,53073000700,999.0,71646,53073000700
92446992,53073011000,53009001300,999.0,71617,53009001300
92462833,53073011000,53073000901,999.0,71668,53073000901
92462832,53073011000,53073000901,999.0,71668,53073000901
92447969,53073011000,53009001200,999.0,71625,53009001200


In [85]:
## To get the nearest location, sort the values by lowest cost
## then filter for only the first appearance of each origin ID
## by using not `.duplicated()`
time_to_nearest = travel_costs[~travel_costs.origin.duplicated()][[origin_col, travel_cost_col]]
time_to_nearest.head()

Unnamed: 0,origin,minutes
92467623,53073011000,999.0
20461349,29145020501,0.0
43468584,48427950600,0.0
20461595,29097010500,0.0
29530659,6037104822,0.0


In [86]:
## For count, we simple filter for the cost under a given threshold
## Then groupby by and count the results
count_within_threshold = travel_costs[travel_costs[travel_cost_col] <= travel_threshold] \
  .groupby(origin_col).count() \
  .reset_index()[[origin_col, travel_cost_col]]
count_within_threshold.columns = [origin_col, f"count within {travel_threshold}"]

In [87]:
merged_metrics = count_within_threshold.merge(time_to_nearest, on=origin_col, how="outer")
merged_metrics.head()

Unnamed: 0,origin,count within 30,minutes
0,1001020100,25.0,10.94
1,1001020200,39.0,7.42
2,1001020300,44.0,0.0
3,1001020400,51.0,0.0
4,1001020500,63.0,0.0


In [88]:
## To clean up any missing data, we can check back with our origin list
analyzed_origins = list(merged_metrics[origin_col])
missing_origins = [o for o in origin_points.GEOID if o not in analyzed_origins]

## Then, fill the missing data
missing_data = []
for o in missing_origins:
    missing_entry = {}
    missing_entry[origin_col] = o
    missing_entry[f"count within {travel_threshold}"]=0
    missing_entry[travel_cost_col]=None
    missing_data.append(missing_entry)
missing_df = pd.DataFrame(missing_data)

## and concatenate results
findings = pd.concat([merged_metrics, missing_df])
findings.head()

Unnamed: 0,origin,count within 30,minutes
0,1001020100,25.0,10.94
1,1001020200,39.0,7.42
2,1001020300,44.0,0.0
3,1001020400,51.0,0.0
4,1001020500,63.0,0.0


In [89]:
findings.to_csv('tract_pharmacy_access.csv', index=False)

In [90]:
findings.dtypes

origin              object
count within 30    float64
minutes             object
dtype: object