In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geoplot as gplt
import shapefile
import osr
import dbf
import requests
import io

from urllib.request import urlopen
from zipfile import ZipFile
from shapely.geometry import shape, Point, Polygon


%matplotlib inline

### Weighted Centroids of several States

In [4]:
## Function definition: Read Blocks Shapefile within a State
def Blocks_Shapefile(doc_path):
    """Load a zipped block-level shapefile into a sorted GeoDataFrame.

    Parameters
    ----------
    doc_path : str or path-like
        Path of a zip archive holding exactly one ``.dbf``, ``.prj``,
        ``.shp`` and ``.shx`` member.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per shape record, with ``INTPTLON10``/``INTPTLAT10``
        converted to numbers, rows sorted by ``COUNTYFP10``, ``BLOCKCE10``
        and ``TRACTCE10``, and a fresh 0..n-1 index. The CRS is taken from
        the archive's ``.prj`` member.

    Raises
    ------
    ValueError
        If a required shapefile component is missing from the archive or
        occurs more than once.
    """
    required = ('dbf', 'prj', 'shp', 'shx')
    parts = {}

    # The context manager releases the archive handle even if a read fails.
    with ZipFile(doc_path, 'r') as state_blocks:
        for member in state_blocks.namelist():
            ext = member.rsplit('.', 1)[-1].lower()
            if ext not in required:
                continue
            if ext in parts:
                raise ValueError(
                    "archive contains more than one '.%s' member" % ext)
            # Look components up by extension instead of relying on the
            # alphabetical position of the member names.
            parts[ext] = io.BytesIO(state_blocks.read(member))

    missing = [ext for ext in required if ext not in parts]
    if missing:
        raise ValueError(
            "archive is missing shapefile component(s): " + ", ".join(missing))

    r = shapefile.Reader(shp=parts['shp'], shx=parts['shx'], dbf=parts['dbf'])

    # r.fields[0] is skipped so the names line up with each record's values.
    field_names = [field[0] for field in r.fields[1:]]
    attributes, geometry = [], []
    for row in r.shapeRecords():
        geometry.append(shape(row.shape.__geo_interface__))
        attributes.append(dict(zip(field_names, row.record)))

    # Translate the .prj text into a PROJ.4 string for the GeoDataFrame CRS.
    prj_text = io.TextIOWrapper(parts['prj'], encoding='utf-8').read()
    proj4 = osr.SpatialReference(prj_text).ExportToProj4()

    gdf = gpd.GeoDataFrame(data=attributes, geometry=geometry, crs=proj4)
    gdf[['INTPTLON10', 'INTPTLAT10']] = gdf[['INTPTLON10', 'INTPTLAT10']].apply(pd.to_numeric)
    gdf = gdf.sort_values(['COUNTYFP10', 'BLOCKCE10', 'TRACTCE10']).reset_index(drop=True)

    return gdf

## gdf01 = Blocks_Shapefile('/home/jinli/PycharmProjects/tl_2010_01_tabblock10.zip')

In [7]:
## Function definition: Read Population by Blocks within a State
def Blocks_Population(doc_path):
    """Read a population table and derive a ``GEOID10`` key for each row.

    The second line of the CSV is used as the header row. ``GEOID10`` is the
    ``id`` value with its first nine characters removed.

    Parameters
    ----------
    doc_path : str, path-like or file-like
        CSV source handed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``Geographic Area Name``, ``GEOID10`` and ``Total``,
        in that order.
    """
    def strip_prefix(geo_id):
        # Drop the leading nine characters of the identifier.
        return geo_id[9:]

    table = pd.read_csv(doc_path, header=1)
    table['GEOID10'] = table['id'].map(strip_prefix)

    ordered_columns = ['id', 'Geographic Area Name', 'GEOID10', 'Total']
    return table[ordered_columns]

## pop01 = Blocks_Population('/home/jinli/PycharmProjects/DECENNIALSF12010.P1_data_State_01.csv')

In [14]:
## Function definition: Read County Shapefile of USA
def USA_County_Shapefile(doc_path, exclude_states=('02', '72', '15')):
    """Load a zipped county shapefile into a GeoDataFrame, minus some states.

    Parameters
    ----------
    doc_path : str or path-like
        Path of a zip archive holding exactly one ``.dbf``, ``.prj``,
        ``.shp`` and ``.shx`` member.
    exclude_states : iterable of str, optional
        ``STATEFP10`` codes whose rows are dropped from the result. The
        default keeps the original behaviour of removing '02', '72' and '15'
        (presumably Alaska, Puerto Rico and Hawaii; confirm against the
        FIPS code list).

    Returns
    -------
    geopandas.GeoDataFrame
        County records sorted by ``STATEFP10`` and ``COUNTYFP10`` with
        ``INTPTLON10``/``INTPTLAT10`` converted to numbers. The index is
        assigned before the excluded states are removed, so it keeps gaps
        where those rows were.

    Raises
    ------
    ValueError
        If a required shapefile component is missing from the archive or
        occurs more than once.
    """
    required = ('dbf', 'prj', 'shp', 'shx')
    parts = {}

    # The context manager releases the archive handle even if a read fails.
    with ZipFile(doc_path, 'r') as allcounties:
        for member in allcounties.namelist():
            ext = member.rsplit('.', 1)[-1].lower()
            if ext not in required:
                continue
            if ext in parts:
                raise ValueError(
                    "archive contains more than one '.%s' member" % ext)
            # Look components up by extension instead of relying on the
            # alphabetical position of the member names.
            parts[ext] = io.BytesIO(allcounties.read(member))

    missing = [ext for ext in required if ext not in parts]
    if missing:
        raise ValueError(
            "archive is missing shapefile component(s): " + ", ".join(missing))

    r = shapefile.Reader(shp=parts['shp'], shx=parts['shx'], dbf=parts['dbf'])

    # r.fields[0] is skipped so the names line up with each record's values.
    field_names = [field[0] for field in r.fields[1:]]
    attributes, geometry = [], []
    for row in r.shapeRecords():
        geometry.append(shape(row.shape.__geo_interface__))
        attributes.append(dict(zip(field_names, row.record)))

    # Translate the .prj text into a PROJ.4 string for the GeoDataFrame CRS.
    prj_text = io.TextIOWrapper(parts['prj'], encoding='utf-8').read()
    proj4 = osr.SpatialReference(prj_text).ExportToProj4()

    gdf = gpd.GeoDataFrame(data=attributes, geometry=geometry, crs=proj4)
    gdf = gdf.sort_values(by=['STATEFP10', 'COUNTYFP10']).reset_index(drop=True)
    gdf[['INTPTLON10', 'INTPTLAT10']] = gdf[['INTPTLON10', 'INTPTLAT10']].apply(pd.to_numeric)

    # Filter last so the surviving rows keep their pre-filter index labels.
    gdf = gdf[~gdf.STATEFP10.isin(list(exclude_states))]

    return gdf

## allcounties = USA_County_Shapefile('/home/jinli/PycharmProjects/tl_2010_us_county10(NEW).zip')

In [17]:
## Block-level shapefiles, one zip archive per state. The two-digit code in
## each file name (01, 12, 13, 28) is reused as the variable suffix.
gdf01 = Blocks_Shapefile('/home/jinli/PycharmProjects/tl_2010_01_tabblock10.zip')
gdf12 = Blocks_Shapefile('/home/jinli/PycharmProjects/tl_2010_12_tabblock10.zip')
gdf13 = Blocks_Shapefile('/home/jinli/PycharmProjects/tl_2010_13_tabblock10.zip')
gdf28 = Blocks_Shapefile('/home/jinli/PycharmProjects/tl_2010_28_tabblock10.zip')

## Matching population tables for the same four state codes.
## NOTE(review): the paths are absolute and machine-specific; the notebook only
## runs where these files exist under /home/jinli/PycharmProjects.
pop01 = Blocks_Population('/home/jinli/PycharmProjects/DECENNIALSF12010.P1_data_State_01.csv')
pop12 = Blocks_Population('/home/jinli/PycharmProjects/DECENNIALSF12010.P1_data_State_12.csv')
pop13 = Blocks_Population('/home/jinli/PycharmProjects/DECENNIALSF12010.P1_data_State_13.csv')
pop28 = Blocks_Population('/home/jinli/PycharmProjects/DECENNIALSF12010.P1_data_State_28.csv')

In [18]:
## Stack the per-state tables into one geometry table and one population table.
## The original row labels are kept, so index values repeat across states.
block_frames = [gdf01, gdf12, gdf13, gdf28]
population_frames = [pop01, pop12, pop13, pop28]

gdf = pd.concat(block_frames)
pop = pd.concat(population_frames)

In [19]:
## Attach the population count of every block to its record via GEOID10.
geodata = pd.merge(gdf, pop, on='GEOID10')
## .copy() makes the column subset an independent frame, so the new columns
## below are not written onto a slice of the merged table.
geodata = geodata[['STATEFP10', 'COUNTYFP10', 'GEOID10', 'Total', 'INTPTLON10', 'INTPTLAT10']].copy()

## Population-weighted coordinates of each block's internal point.
geodata['LON*POP'] = geodata['Total']*geodata['INTPTLON10']
geodata['LAT*POP'] = geodata['Total']*geodata['INTPTLAT10']

## Calculation of population weighted centroids for each county:
## sum(pop * coord) / sum(pop) over the blocks of the county.
## The columns are selected with a list; selecting them with a bare tuple is
## deprecated and rejected by current pandas releases.
gdf_bycounty = geodata.groupby(['STATEFP10', 'COUNTYFP10'])[['Total', 'LON*POP', 'LAT*POP']].sum().reset_index()
gdf_bycounty['LON'] = gdf_bycounty['LON*POP']/gdf_bycounty['Total']
gdf_bycounty['LAT'] = gdf_bycounty['LAT*POP']/gdf_bycounty['Total']

### Create a new geodataframe with the centroid points transformed to geometry
## (no CRS is passed here, so `cent` is built without one).
geometry = [Point(xy) for xy in zip(gdf_bycounty['LON'], gdf_bycounty['LAT'])]
cent = gpd.GeoDataFrame(gdf_bycounty, geometry=geometry)

## County identifier = state code followed by county code, placed as the first column.
GEOID10 = cent['STATEFP10'] + cent['COUNTYFP10']
cent.insert(0, 'GEOID10', GEOID10)

  


In [20]:
## Show the per-county table of population-weighted centroids.
cent

Unnamed: 0,GEOID10,STATEFP10,COUNTYFP10,Total,LON*POP,LAT*POP,LON,LAT,geometry
0,01001,01,001,54571,-4.720074e+06,1.773579e+06,-86.494186,32.500389,POINT (-86.49419 32.50039)
1,01003,01,003,182265,-1.599603e+07,5.568000e+06,-87.762466,30.548923,POINT (-87.76247 30.54892)
2,01005,01,005,27457,-2.342355e+06,8.743417e+05,-85.309929,31.844037,POINT (-85.30993 31.84404)
3,01007,01,007,22915,-1.996530e+06,7.569036e+05,-87.127656,33.030921,POINT (-87.12766 33.03092)
4,01009,01,009,57322,-4.963592e+06,1.946382e+06,-86.591401,33.955244,POINT (-86.59140 33.95524)
...,...,...,...,...,...,...,...,...,...
370,28155,28,155,10253,-9.148357e+05,3.443185e+05,-89.226147,33.582222,POINT (-89.22615 33.58222)
371,28157,28,157,9878,-9.012517e+05,3.074185e+05,-91.238274,31.121531,POINT (-91.23827 31.12153)
372,28159,28,159,19198,-1.709445e+06,6.351644e+05,-89.042844,33.084925,POINT (-89.04284 33.08493)
373,28161,28,161,12678,-1.137134e+06,4.320738e+05,-89.693522,34.080595,POINT (-89.69352 34.08060)


In [21]:
## County polygons from the county shapefile archive; USA_County_Shapefile
## removes the rows whose STATEFP10 is '02', '72' or '15'.
allcounties = USA_County_Shapefile('/home/jinli/PycharmProjects/tl_2010_us_county10(NEW).zip')

In [22]:
## Show the county table returned by USA_County_Shapefile.
allcounties

Unnamed: 0,STATEFP10,COUNTYFP10,COUNTYNS10,GEOID10,LSAD10,CLASSFP10,MTFCC10,CSAFP10,CBSAFP10,METDIVFP10,FUNCSTAT10,ALAND10,AWATER10,INTPTLAT10,INTPTLON10,geometry
0,01,001,00161526,01001,06,H1,G4020,388,33860,,A,1539582278,25775735,32.536382,-86.644490,"POLYGON ((-86.62619 32.70638, -86.62498 32.706..."
1,01,003,00161527,01003,06,H1,G4020,380,19300,,A,4117521611,1133190229,30.659218,-87.746067,"POLYGON ((-87.61542 31.04100, -87.61542 31.040..."
2,01,005,00161528,01005,06,H1,G4020,,21640,,A,2291818968,50864716,31.870670,-85.405456,"POLYGON ((-85.62028 31.94550, -85.61987 31.945..."
3,01,007,00161529,01007,06,H1,G4020,142,13820,,A,1612480789,9289057,33.015893,-87.127147,"POLYGON ((-87.02561 33.17915, -87.02562 33.178..."
4,01,009,00161530,01009,06,H1,G4020,142,13820,,A,1669961855,15157440,33.977448,-86.567246,"POLYGON ((-86.74361 33.83124, -86.74478 33.831..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3138,56,037,01609192,56037,06,H1,G4020,,40540,,A,27004896786,166887494,41.660339,-108.875676,"POLYGON ((-108.30022 41.00007, -108.30142 41.0..."
3139,56,039,01605083,56039,06,H1,G4020,,27220,,A,10347983791,572265729,44.049321,-110.588102,"POLYGON ((-110.96765 43.31587, -110.97132 43.3..."
3140,56,041,01605084,56041,06,H1,G4020,,21740,,A,5390449560,16341665,41.284726,-110.558947,"POLYGON ((-110.60680 40.99628, -110.60688 40.9..."
3141,56,043,01605085,56043,06,H1,G4020,,,,A,5797814617,10762355,43.878830,-107.669052,"POLYGON ((-107.63349 43.64634, -107.63352 43.6..."


### County Pairs

In [23]:
## Read the county pair list and keep one row per COUNTYPAIR_ID.
pairs_unique = (
    pd.read_csv('/home/jinli/PycharmProjects/county-pair-list.txt')
    .drop_duplicates(subset='COUNTYPAIR_ID')
)

## Split each COUNTYPAIR_ID at its first "-" into the two county identifiers.
new = pairs_unique['COUNTYPAIR_ID'].str.split("-", n = 1, expand = True)

## The first two characters of each county identifier form its state code.
countypairs = (
    pairs_unique
    .assign(GEOID10_FIPS1=new[0], GEOID10_FIPS2=new[1])
    .assign(
        STATE_FIPS1=lambda frame: frame['GEOID10_FIPS1'].map(lambda fips: fips[0:2]),
        STATE_FIPS2=lambda frame: frame['GEOID10_FIPS2'].map(lambda fips: fips[0:2]),
    )
    [['COUNTYPAIR_ID', 'STATE_FIPS1', 'GEOID10_FIPS1', 'STATE_FIPS2', 'GEOID10_FIPS2']]
    .reset_index(drop=True)
)

### Boundaries shared by each county pair

### Distance between population weighted county centroids and boundaries