In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geoplot as gplt
import shapefile
import osr
import dbf
import requests
import io

from urllib.request import urlopen
from zipfile import ZipFile
from shapely.geometry import shape, Point, Polygon


%matplotlib inline

### Weighted Centroids of several States

In [2]:
## Function definition: Read Blocks Shapefile within a State
def Blocks_Shapefile(doc_path):
    """Read a zipped census-block shapefile into a GeoDataFrame.

    Parameters
    ----------
    doc_path : str or file-like
        Zip archive holding exactly one member for each of the name suffixes
        ``dbf``, ``prj``, ``shp`` and ``shx``.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per shape record with the attribute fields as columns,
        ``INTPTLON10``/``INTPTLAT10`` converted to numeric, rows sorted by
        ``COUNTYFP10``, ``BLOCKCE10``, ``TRACTCE10`` and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the archive does not hold exactly one member per required suffix.
    """
    required_suffixes = ('dbf', 'prj', 'shp', 'shx')

    # The context manager closes the archive even when a read fails
    with ZipFile(doc_path, 'r') as archive:
        member_names = archive.namelist()
        parts = {}
        for suffix in required_suffixes:
            # Match each component by its own suffix rather than by sorted position
            matches = [name for name in member_names if name.endswith(suffix)]
            if len(matches) != 1:
                raise ValueError(
                    "expected exactly one '{}' member in the archive, found {}".format(
                        suffix, len(matches)))
            parts[suffix] = io.BytesIO(archive.read(matches[0]))

    # Everything needed is in memory now, so the archive can stay closed
    reader = shapefile.Reader(shp=parts['shp'], shx=parts['shx'], dbf=parts['dbf'])

    # fields[0] is skipped (presumably pyshp's DeletionFlag placeholder, which
    # has no matching value in the records -- confirm against the pyshp docs)
    field_names = [field[0] for field in reader.fields[1:]]

    attributes, geometry = [], []
    for row in reader.shapeRecords():
        geometry.append(shape(row.shape.__geo_interface__))
        attributes.append(dict(zip(field_names, row.record)))

    # The .prj member carries the spatial reference as text; export it as PROJ.4
    prj_text = io.TextIOWrapper(parts['prj'], encoding='utf-8').read()
    proj4 = osr.SpatialReference(prj_text).ExportToProj4()

    gdf = gpd.GeoDataFrame(data=attributes, geometry=geometry, crs=proj4)
    gdf[['INTPTLON10', 'INTPTLAT10']] = gdf[['INTPTLON10', 'INTPTLAT10']].apply(pd.to_numeric)
    gdf.sort_values(['COUNTYFP10', 'BLOCKCE10', 'TRACTCE10'], ascending=[True, True, True], inplace=True)
    gdf.reset_index(drop=True, inplace=True)

    return gdf

## gdf01 = Blocks_Shapefile('/home/jinli/PycharmProjects/tl_2010_01_tabblock10.zip')

In [3]:
## Function definition: Read Population by Blocks within a State
def Blocks_Population(doc_path):
    """Load a per-block population table and derive the block GEOID.

    The second line of the CSV is used as the header row. ``GEOID10`` is the
    ``id`` value with its first nine characters removed.

    Parameters
    ----------
    doc_path : str or file-like
        CSV source containing the columns ``id``, ``Geographic Area Name``
        and ``Total``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``Geographic Area Name``, ``GEOID10``, ``Total``,
        in that order.
    """
    def _drop_id_prefix(full_id):
        # Everything after the first nine characters is the block GEOID
        return full_id[9:]

    table = pd.read_csv(doc_path, header=1)
    table['GEOID10'] = table['id'].map(_drop_id_prefix)

    output_columns = ['id', 'Geographic Area Name', 'GEOID10', 'Total']
    return table[output_columns]

## pop01 = Blocks_Population('/home/jinli/PycharmProjects/DECENNIALSF12010.P1_data_State_01.csv')

In [4]:
## Function definition: Read County Shapefile of USA
def USA_County_Shapefile(doc_path):
    """Read the zipped nationwide county shapefile into a GeoDataFrame.

    Parameters
    ----------
    doc_path : str or file-like
        Zip archive holding exactly one member for each of the name suffixes
        ``dbf``, ``prj``, ``shp`` and ``shx``.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per county, sorted by ``STATEFP10`` then ``COUNTYFP10``, with
        ``INTPTLON10``/``INTPTLAT10`` converted to numeric. Rows whose
        ``STATEFP10`` is ``'02'``, ``'72'`` or ``'15'`` are removed; the index
        is NOT renumbered after that removal, so it may contain gaps.

    Raises
    ------
    ValueError
        If the archive does not hold exactly one member per required suffix.
    """
    required_suffixes = ('dbf', 'prj', 'shp', 'shx')

    # The context manager closes the archive even when a read fails
    with ZipFile(doc_path, 'r') as archive:
        member_names = archive.namelist()
        parts = {}
        for suffix in required_suffixes:
            # Match each component by its own suffix rather than by sorted position
            matches = [name for name in member_names if name.endswith(suffix)]
            if len(matches) != 1:
                raise ValueError(
                    "expected exactly one '{}' member in the archive, found {}".format(
                        suffix, len(matches)))
            parts[suffix] = io.BytesIO(archive.read(matches[0]))

    # Everything needed is in memory now, so the archive can stay closed
    reader = shapefile.Reader(shp=parts['shp'], shx=parts['shx'], dbf=parts['dbf'])

    # fields[0] is skipped (presumably pyshp's DeletionFlag placeholder, which
    # has no matching value in the records -- confirm against the pyshp docs)
    field_names = [field[0] for field in reader.fields[1:]]

    attributes, geometry = [], []
    for row in reader.shapeRecords():
        geometry.append(shape(row.shape.__geo_interface__))
        attributes.append(dict(zip(field_names, row.record)))

    # The .prj member carries the spatial reference as text; export it as PROJ.4
    prj_text = io.TextIOWrapper(parts['prj'], encoding='utf-8').read()
    proj4 = osr.SpatialReference(prj_text).ExportToProj4()

    gdf = gpd.GeoDataFrame(data=attributes, geometry=geometry, crs=proj4)
    gdf.sort_values(by=['STATEFP10', 'COUNTYFP10'], inplace=True)
    gdf.reset_index(drop=True, inplace=True)
    gdf[['INTPTLON10', 'INTPTLAT10']] = gdf[['INTPTLON10', 'INTPTLAT10']].apply(pd.to_numeric)

    # Drop state FIPS 02, 72 and 15 (Alaska, Puerto Rico, Hawaii)
    gdf = gdf[(gdf.STATEFP10 != '02') & (gdf.STATEFP10 != '72') & (gdf.STATEFP10 != '15')]

    return gdf

## allcounties = USA_County_Shapefile('/home/jinli/PycharmProjects/tl_2010_us_county10(NEW).zip')

In [5]:
# Single place to change where the downloaded Census files live
DATA_DIR = '/home/jinli/PycharmProjects'
# State FIPS codes whose block data is loaded
STATE_FIPS = ['01', '12', '13', '28']

# Block geometries for every state first, then the matching population tables
blocks_by_state = {
    fips: Blocks_Shapefile('{}/tl_2010_{}_tabblock10.zip'.format(DATA_DIR, fips))
    for fips in STATE_FIPS
}
population_by_state = {
    fips: Blocks_Population('{}/DECENNIALSF12010.P1_data_State_{}.csv'.format(DATA_DIR, fips))
    for fips in STATE_FIPS
}

# Keep the per-state names that the following cells refer to
gdf01, gdf12, gdf13, gdf28 = (blocks_by_state[fips] for fips in STATE_FIPS)
pop01, pop12, pop13, pop28 = (population_by_state[fips] for fips in STATE_FIPS)

In [6]:
# Stack the per-state block geometries and population tables into one frame each
block_frames = [gdf01, gdf12, gdf13, gdf28]
population_frames = [pop01, pop12, pop13, pop28]

gdf = pd.concat(block_frames)
pop = pd.concat(population_frames)

In [7]:
## Attach each block's population to its internal point (INTPTLON10 / INTPTLAT10)
geodata = pd.merge(gdf, pop, on='GEOID10')
# .copy() so the new columns below are added to an independent frame
geodata = geodata[['STATEFP10', 'COUNTYFP10', 'GEOID10', 'Total', 'INTPTLON10', 'INTPTLAT10']].copy()

## Population-weighted coordinates of every block
geodata['LON*POP'] = geodata['Total']*geodata['INTPTLON10']
geodata['LAT*POP'] = geodata['Total']*geodata['INTPTLAT10']

## Calculation of population weighted centroids for each county
## (columns are selected with a list: the bare-tuple form
##  groupby(...)['a', 'b'] was deprecated in pandas 1.0 and raises in 2.0)
gdf_bycounty = (
    geodata
    .groupby(['STATEFP10', 'COUNTYFP10'])[['Total', 'LON*POP', 'LAT*POP']]
    .sum()
    .reset_index()
)
# A county whose summed 'Total' is 0 yields NaN coordinates here
gdf_bycounty['LON'] = gdf_bycounty['LON*POP']/gdf_bycounty['Total']
gdf_bycounty['LAT'] = gdf_bycounty['LAT*POP']/gdf_bycounty['Total']

### Create a new geodataframe with the centroid points transformed to geometry
geometry = [Point(xy) for xy in zip(gdf_bycounty['LON'], gdf_bycounty['LAT'])]
cent = gpd.GeoDataFrame(gdf_bycounty, geometry=geometry)

### County GEOID = state FIPS + county FIPS, placed as the first column
GEOID10 = cent['STATEFP10'] + cent['COUNTYFP10']
cent.insert(0, 'GEOID10', GEOID10)

  


### Shapefile of all Counties in USA

In [8]:
# County polygons for the whole country, read from the zipped county shapefile
county_zip_path = '/home/jinli/PycharmProjects/tl_2010_us_county10(NEW).zip'
allcounties = USA_County_Shapefile(county_zip_path)

### County Pairs

In [9]:
# Load the county-pair list and keep a single row per pair identifier
countypairs = (
    pd.read_csv('/home/jinli/PycharmProjects/county-pair-list.txt')
    .drop_duplicates(subset='COUNTYPAIR_ID')
)

# COUNTYPAIR_ID looks like '01003-12033': split once on the dash into the two county GEOIDs
fips_parts = countypairs['COUNTYPAIR_ID'].str.split("-", n = 1, expand = True)
countypairs = countypairs.assign(GEOID10_FIPS1=fips_parts[0], GEOID10_FIPS2=fips_parts[1])

# The first two characters of a county GEOID are the state FIPS code
countypairs = countypairs.assign(
    STATE_FIPS1=countypairs['GEOID10_FIPS1'].map(lambda geoid: geoid[0:2]),
    STATE_FIPS2=countypairs['GEOID10_FIPS2'].map(lambda geoid: geoid[0:2]),
)

pair_columns = ['COUNTYPAIR_ID', 'STATE_FIPS1', 'GEOID10_FIPS1', 'STATE_FIPS2', 'GEOID10_FIPS2']
countypairs = countypairs[pair_columns].reset_index(drop=True)

In [10]:
# Display the cleaned county-pair table (one row per unique COUNTYPAIR_ID)
# NOTE(review): this renders the whole frame; .head() would keep the output short
countypairs

Unnamed: 0,COUNTYPAIR_ID,STATE_FIPS1,GEOID10_FIPS1,STATE_FIPS2,GEOID10_FIPS2
0,01003-12033,01,01003,12,12033
1,01005-13239,01,01005,13,13239
2,01017-13145,01,01017,13,13145
3,01019-13233,01,01019,13,13233
4,01023-28023,01,01023,28,28023
...,...,...,...,...,...
1176,56037-49043,56,56037,49,49043
1177,56039-16019,56,56039,16,16019
1178,56039-16081,56,56039,16,16081
1179,56041-49043,56,56041,49,49043


### Boundaries shared by each county pair

In [11]:
### County pairs whose first county lies in state FIPS 01
### (index renumbered 0..n-1 so that results built row-by-row from these pairs
###  line up with them regardless of where the rows sat in countypairs)
cp = countypairs[countypairs.STATE_FIPS1=='01'].reset_index(drop=True)

### Geodataframe of all counties in USA
gdf_ac = allcounties[['GEOID10', 'INTPTLAT10', 'INTPTLON10', 'geometry']]

### County Boundary Intersection: cbi
### Look up the polygon of each side of every pair; left merges keep the row order of cp
GEOID10_FIPS1 = cp[['GEOID10_FIPS1']]
GEOID10_FIPS2 = cp[['GEOID10_FIPS2']]
cb_GEOID10_FIPS1 = pd.merge(GEOID10_FIPS1, gdf_ac, how='left', left_on='GEOID10_FIPS1', right_on='GEOID10')
cb_GEOID10_FIPS2 = pd.merge(GEOID10_FIPS2, gdf_ac, how='left', left_on='GEOID10_FIPS2', right_on='GEOID10')
cb_FIPS1 = cb_GEOID10_FIPS1[['geometry']].rename(columns={'geometry': 'geometry_FIPS1'})
cb_FIPS2 = cb_GEOID10_FIPS2[['geometry']].rename(columns={'geometry': 'geometry_FIPS2'})

### County pairs geometries (one row per pair, the two polygons side by side)
cbp = pd.concat([cb_FIPS1, cb_FIPS2], axis=1)

In [12]:
### Shared boundary of every county pair, one intersection per row of cbp.
### Collected in a list and wrapped once: growing a DataFrame with .append in a
### loop is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
shared_boundaries = [
    geom_fips1.intersection(geom_fips2)
    for geom_fips1, geom_fips2 in zip(cbp['geometry_FIPS1'], cbp['geometry_FIPS2'])
]
# dtype=object keeps each geometry as a single cell value
cbi = pd.DataFrame({'intersection': pd.Series(shared_boundaries, dtype=object)})

### County pairs and the shared boundaries
### cbi is numbered 0..n-1 in row order, so cp is renumbered the same way before
### the two are paired up (they are matched on index labels)
geocbi = gpd.GeoDataFrame(cp.reset_index(drop=True), geometry=cbi['intersection'])

### Weighted centroid of every county for which block data was loaded
geocent = cent[['GEOID10', 'geometry']]

### Joining dataframes to form a new one which include shared boundaries & centroids information
### After the two merges: geometry_x = shared boundary, geometry_y = centroid of
### county 1, geometry = centroid of county 2
distance_info = pd.merge(geocbi, geocent, how='left', left_on='GEOID10_FIPS1', right_on='GEOID10')
distance_info = pd.merge(distance_info, geocent, how='left', left_on='GEOID10_FIPS2', right_on='GEOID10')
distance_info = distance_info.rename(columns={'geometry_x': 'Intersection', 
                                              'geometry_y': 'Cent_FIPS1', 'geometry': 'Cent_FIPS2'})
distance_info.drop(['GEOID10_x', 'GEOID10_y'], axis=1, inplace=True)

In [13]:
# Preview the first rows of the merged pair / shared-boundary / centroid table
distance_info.head()

Unnamed: 0,COUNTYPAIR_ID,STATE_FIPS1,GEOID10_FIPS1,STATE_FIPS2,GEOID10_FIPS2,Intersection,Cent_FIPS1,Cent_FIPS2
0,01003-12033,1,1003,12,12033,"MULTILINESTRING ((-87.59883 30.99745, -87.5988...",POINT (-87.76247 30.54892),POINT (-87.27480 30.48531)
1,01005-13239,1,1005,13,13239,"MULTILINESTRING ((-85.06823 31.99186, -85.0682...",POINT (-85.30993 31.84404),POINT (-85.07773 31.85938)
2,01017-13145,1,1017,13,13145,"MULTILINESTRING ((-85.18413 32.87053, -85.1845...",POINT (-85.26652 32.86044),POINT (-84.89505 32.69993)
3,01019-13233,1,1019,13,13233,"MULTILINESTRING ((-85.42185 34.08082, -85.4209...",POINT (-85.62920 34.17933),POINT (-85.17677 34.01088)
4,01023-28023,1,1023,28,28023,"MULTILINESTRING ((-88.44169 32.14641, -88.4416...",POINT (-88.26410 32.01656),POINT (-88.73063 32.06275)


### Distance between population weighted county centroids and boundaries

In [21]:
## Distance from each county's weighted centroid to the boundary the pair shares
## NOTE(review): the coordinates look like lon/lat degrees, so these distances
## would be in degrees rather than metres -- confirm before interpreting them
distance_FIPS1 = [
    centroid.distance(boundary)
    for centroid, boundary in zip(distance_info['Cent_FIPS1'], distance_info['Intersection'])
]
distance_FIPS2 = [
    centroid.distance(boundary)
    for centroid, boundary in zip(distance_info['Cent_FIPS2'], distance_info['Intersection'])
]

dist_FIPS1 = pd.DataFrame({'distance_FIPS1':distance_FIPS1})
dist_FIPS2 = pd.DataFrame({'distance_FIPS2':distance_FIPS2})

## Append the two distance columns to the pair table (aligned on the row index)
distance = pd.concat([distance_info, dist_FIPS1, dist_FIPS2], axis=1)

In [22]:
# Summarize the column dtypes and non-null counts of the final distance table
distance.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 34 entries, 0 to 33
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   COUNTYPAIR_ID   34 non-null     object  
 1   STATE_FIPS1     34 non-null     object  
 2   GEOID10_FIPS1   34 non-null     object  
 3   STATE_FIPS2     34 non-null     object  
 4   GEOID10_FIPS2   34 non-null     object  
 5   Intersection    34 non-null     geometry
 6   Cent_FIPS1      34 non-null     geometry
 7   Cent_FIPS2      34 non-null     geometry
 8   distance_FIPS1  34 non-null     float64 
 9   distance_FIPS2  34 non-null     float64 
dtypes: float64(2), geometry(3), object(5)
memory usage: 2.9+ KB
