In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [None]:
# possible further data sources
# https://data.census.gov/table?q=United+States&g=010XX00US_050XX00US53033
# https://www.census.gov/quickfacts/fact/table/kingcountywashington/POP010210
# https://catalog.data.gov/dataset/zip-codes-2259a/resource/7ebc9d84-ae79-42f5-a995-c9b06c783ff7


In [2]:
# Population density per King County ZIP code.
# Source: http://www.usa.com/rank/king-county-wa--population-density--zip-code-rank.htm?yr=9000&dis=&wist=&plow=&phigh=
# The file has a .csv extension but its fields are tab-separated.
df_pop_density = pd.read_csv('data/King_County_Population_Density.csv', delimiter='\t')

In [3]:
# Preview the first five rows as loaded, to see how the raw columns are formatted.
df_pop_density.head(5)

Unnamed: 0,rank,population_density,zip/population
0,1.0,"20,629.5/sq mi",98164 / 73
1,2.0,"20,391.4/sq mi",98174 / 72
2,3.0,"19,576.7/sq mi","98121 / 12,753"
3,4.0,"17,315.9/sq mi","98101 / 10,803"
4,5.0,"15,103.1/sq mi","98104 / 13,177"


In [4]:
# Split the combined "zip/population" column into separate zip and population columns.
# The raw values look like "98121 / 12,753", so the pattern also consumes the whitespace
# around the slash; splitting on a bare "/" would leave stray spaces in both parts.
# (pandas interprets a multi-character split pattern as a regular expression.)
df_pop_density[['zip', 'population']] = df_pop_density["zip/population"].str.split(r"\s*/\s*", expand=True)

In [5]:
# Drop the combined column now that zip and population have their own columns.
# The result is rebound rather than using inplace=True, so the frame is not mutated in place.
df_pop_density = df_pop_density.drop(columns="zip/population")

In [6]:
# rank is displayed as a float after loading (1.0, 2.0, ...); store it as an integer
df_pop_density["rank"] = df_pop_density["rank"].astype(int)

In [7]:
# Remove the "/sq mi" unit from population_density. The values remain strings here and
# still contain thousands separators; those are handled in a separate step.
# str.strip('/sq mi') treats its argument as a *set of characters* to trim from both
# ends, not as a suffix, so the unit is removed as a literal substring instead.
df_pop_density["population_density"] = df_pop_density["population_density"].str.replace('/sq mi', '', regex=False)

In [8]:
# Drop the thousands separators so the text parses as a number, then cast to float.
df_pop_density["population_density"] = (
    df_pop_density["population_density"]
    .str.replace(',', "")
    .astype('float')
)

In [9]:
# Convert the zip column from text to integer.
df_pop_density = df_pop_density.astype({"zip": "int"})

In [10]:
# Preview the first five rows after the type conversions.
df_pop_density.head(5)

Unnamed: 0,rank,population_density,zip,population
0,1,20629.5,98164,73
1,2,20391.4,98174,72
2,3,19576.7,98121,12753
3,4,17315.9,98101,10803
4,5,15103.1,98104,13177


In [11]:
# Summarise the column dtypes and non-null counts of the cleaned frame.
df_pop_density.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   rank                77 non-null     int64  
 1   population_density  77 non-null     float64
 2   zip                 77 non-null     int64  
 3   population          77 non-null     object 
dtypes: float64(1), int64(2), object(1)
memory usage: 2.5+ KB


In [12]:
df_pop_density_short = df_pop_density.drop(["population_density","population"], axis=1)
%store df_pop_density_short
df_pop_density_short

Stored 'df_pop_density_short' (DataFrame)


Unnamed: 0,rank,zip
0,1,98164
1,2,98174
2,3,98121
3,4,98101
4,5,98104
...,...,...
72,73,98051
73,74,98050
74,75,98045
75,76,98224


In [13]:
ranks = df_pop_density_short['rank'].tolist()
dens_zips = df_pop_density['zip'].tolist()
dict_pop_rk = dict(zip(dens_zips,ranks)) 
%store dict_pop_rk
dict_pop_rk 

Stored 'dict_pop_rk' (dict)


{98164: 1,
 98174: 2,
 98121: 3,
 98101: 4,
 98104: 5,
 98102: 6,
 98122: 7,
 98109: 8,
 98107: 9,
 98103: 10,
 98119: 11,
 98117: 12,
 98105: 13,
 98116: 14,
 98133: 15,
 98007: 16,
 98126: 17,
 98115: 18,
 98125: 19,
 98144: 20,
 98118: 21,
 98030: 22,
 98136: 23,
 98031: 24,
 98146: 25,
 98112: 26,
 98002: 27,
 98055: 28,
 98106: 29,
 98023: 30,
 98198: 31,
 98155: 32,
 98178: 33,
 98034: 34,
 98003: 35,
 98011: 36,
 98056: 37,
 98168: 38,
 98199: 39,
 98004: 40,
 98028: 41,
 98148: 42,
 98006: 43,
 98033: 44,
 98188: 45,
 98108: 46,
 98166: 47,
 98008: 48,
 98029: 49,
 98052: 50,
 98005: 51,
 98058: 52,
 98074: 53,
 98032: 54,
 98075: 55,
 98057: 56,
 98059: 57,
 98040: 58,
 98001: 59,
 98042: 60,
 98053: 61,
 98039: 62,
 98038: 63,
 98027: 64,
 98195: 65,
 98010: 66,
 98154: 67,
 98024: 68,
 98070: 69,
 98134: 70,
 98065: 71,
 98014: 72,
 98051: 73,
 98050: 74,
 98045: 75,
 98224: 76,
 98288: 77}

In [14]:
# School sites in King County, downloaded from:
# https://gis-kingcounty.opendata.arcgis.com/datasets/kingcounty::school-sites-in-king-county-schsite-point/explore?location=47.503391%2C-122.188658%2C10.00
df_schools = pd.read_csv(filepath_or_buffer='data/School_Sites_in_King_County___schsite_point.csv')
# Show the first five rows to inspect the available columns.
df_schools.head(5)

Unnamed: 0,X,Y,OBJECTID,FEATURE_ID,ESITE,CODE,FEATUREDES,NAME,ABB_NAME,ADDRESS,ZIPCODE,LONG_CEN,LAT_CEN,OSPI_CODE,SCH_CLASS,DISTRICT,PIN,MAJOR,MINOR
0,-122.190071,47.258406,1,10000066,1340665.0,660,School-Elementary,Bowman Creek Elementary School,Bowman Creek,5701 Kersey Way SE,98092,-122.190075,47.258404,0.0,10,AUBURN,3221059058,322105,9058
1,-122.176312,47.610853,2,10000076,1258412.0,660,School-Elementary,Wilburton Elementary,Wilburton,12300 Main St,98005,-122.176314,47.610851,0.0,10,BELLEVUE,3325059019,332505,9019
2,-122.299211,47.477355,3,10000077,751890.0,661,School-Junior High/Middle,Glacier Middle School,Glacier,2450 S 142nd St,98168,-122.299213,47.477353,0.0,10,HIGHLINE,3598600006,359860,6
3,-122.17058,47.604033,4,2156,83.0,664,School-Alternative,International School,International,445 128th Ave SE,98005,-122.170582,47.604031,3522.0,10,BELLEVUE,3325059117,332505,9117
4,-122.284831,47.634078,5,940,426089.0,660,School-Elementary,McGilvra Elementary School,McGilvra,1617 38th Ave E,98112,-122.284833,47.634076,2201.0,10,SEATTLE,5318100820,531810,820


In [15]:
# Discard the columns not kept for this analysis, leaving OBJECTID, FEATUREDES, NAME,
# ADDRESS, ZIPCODE, LONG_CEN, LAT_CEN, SCH_CLASS and DISTRICT.
# The result is rebound rather than using inplace=True, so the loaded frame is not
# mutated in place.
df_schools = df_schools.drop(
    columns=['X', 'Y', 'FEATURE_ID', 'ESITE', 'CODE', 'ABB_NAME', 'OSPI_CODE', 'PIN', 'MAJOR', 'MINOR']
)

In [16]:
# Display the trimmed school table (pandas abbreviates the middle rows in the output).
df_schools

Unnamed: 0,OBJECTID,FEATUREDES,NAME,ADDRESS,ZIPCODE,LONG_CEN,LAT_CEN,SCH_CLASS,DISTRICT
0,1,School-Elementary,Bowman Creek Elementary School,5701 Kersey Way SE,98092,-122.190075,47.258404,10,AUBURN
1,2,School-Elementary,Wilburton Elementary,12300 Main St,98005,-122.176314,47.610851,10,BELLEVUE
2,3,School-Junior High/Middle,Glacier Middle School,2450 S 142nd St,98168,-122.299213,47.477353,10,HIGHLINE
3,4,School-Alternative,International School,445 128th Ave SE,98005,-122.170582,47.604031,10,BELLEVUE
4,5,School-Elementary,McGilvra Elementary School,1617 38th Ave E,98112,-122.284833,47.634076,10,SEATTLE
...,...,...,...,...,...,...,...,...,...
650,651,School-Junior High/Middle,Risdon Middle School,6928 116th Ave SE,98056,-122.184431,47.540876,10,RENTON
651,652,School-Alternative,St. Joseph School,700 18th Ave E,98112,-122.308280,47.625477,20,SEATTLE
652,653,School-High,Issaquah High School,700 2nd Ave SE,98027,-122.028714,47.522362,10,ISSAQUAH
653,654,School-Elementary,Emily Dickinson Elementary School,7040 208th Ave NE,98053,-122.059926,47.669996,10,LAKE WASHINGTON


In [17]:
# Check for duplicated school records.
# OBJECTID is left out of the comparison: it looks like a running row id (1, 2, 3, ...),
# and with it included no two rows could ever compare equal, making the check vacuous.
# NOTE(review): re-run this cell; the earlier "no duplicates" result compared OBJECTID too.
df_schools.duplicated(subset=[col for col in df_schools.columns if col != 'OBJECTID']).value_counts()

False    655
dtype: int64

In [18]:
# Count the non-null entries of every column within each ZIP code.
df_schools.groupby(by='ZIPCODE').count()

Unnamed: 0_level_0,OBJECTID,FEATUREDES,NAME,ADDRESS,LONG_CEN,LAT_CEN,SCH_CLASS,DISTRICT
ZIPCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
98001,11,11,11,11,11,11,11,11
98002,16,16,16,16,16,16,16,16
98003,19,19,19,19,19,19,19,19
98004,10,10,10,10,10,10,10,10
98005,6,6,6,6,6,6,6,6
...,...,...,...,...,...,...,...,...
98188,6,6,6,6,6,6,6,6
98195,3,3,3,3,3,3,3,3
98198,11,11,11,11,11,11,11,11
98199,4,4,4,4,4,4,4,4


In [19]:
# Number of school records per ZIP code: count OBJECTID within each ZIPCODE group,
# then turn the group key back into a regular column.
df_schools_zip = df_schools.groupby('ZIPCODE')['OBJECTID'].count().reset_index()
df_schools_zip

Unnamed: 0,ZIPCODE,OBJECTID
0,98001,11
1,98002,16
2,98003,19
3,98004,10
4,98005,6
...,...,...
75,98188,6
76,98195,3
77,98198,11
78,98199,4


In [20]:
dict_school_zip = dict(zip(df_schools_zip['ZIPCODE'], df_schools_zip['OBJECTID']))
%store dict_school_zip
dict_school_zip

Stored 'dict_school_zip' (dict)


{98001: 11,
 98002: 16,
 98003: 19,
 98004: 10,
 98005: 6,
 98006: 12,
 98007: 11,
 98008: 12,
 98010: 2,
 98011: 14,
 98014: 5,
 98019: 3,
 98022: 9,
 98023: 12,
 98024: 2,
 98027: 8,
 98028: 6,
 98029: 5,
 98030: 8,
 98031: 10,
 98032: 7,
 98033: 18,
 98034: 16,
 98038: 9,
 98039: 3,
 98040: 12,
 98042: 14,
 98045: 4,
 98047: 1,
 98051: 1,
 98052: 16,
 98053: 10,
 98055: 3,
 98056: 6,
 98057: 7,
 98058: 12,
 98059: 10,
 98065: 8,
 98070: 5,
 98072: 8,
 98074: 9,
 98075: 7,
 98077: 4,
 98092: 17,
 98101: 1,
 98102: 5,
 98103: 16,
 98104: 1,
 98105: 6,
 98106: 6,
 98107: 4,
 98108: 9,
 98109: 6,
 98112: 9,
 98115: 20,
 98116: 9,
 98117: 8,
 98118: 14,
 98119: 5,
 98121: 4,
 98122: 14,
 98124: 1,
 98125: 12,
 98126: 6,
 98133: 13,
 98134: 3,
 98136: 1,
 98144: 12,
 98146: 11,
 98148: 8,
 98155: 12,
 98166: 6,
 98168: 11,
 98177: 2,
 98178: 7,
 98188: 6,
 98195: 3,
 98198: 11,
 98199: 4,
 98288: 1}