In [31]:
import pandas as pd
import numpy as np
import plotly.express as px
from datetime import datetime
import json
import geopandas as gpd
import folium
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode,  iplot
init_notebook_mode(connected=True)
import plotly.graph_objects as go



In [32]:
from urllib.request import urlopen
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

In [33]:
zip_to_county_df = pd.read_excel('../../data/ZIP_COUNTY_122019.xlsx', dtype={'ZIP': str, 'COUNTY': str})

In [34]:
zip_to_county_df = zip_to_county_df[['ZIP', 'COUNTY']]

In [35]:
XGB_resids_ACS_df=pd.read_csv('../../data/residuals_XGB-all_minmax.csv', 
                      dtype={'zip': str})

In [36]:
XGB_resids_ACS_df['zip'] = XGB_resids_ACS_df['zip'].map(lambda x: '0' + x if len(x) < 5 else x)

In [37]:
zipcodes = XGB_resids_ACS_df['zip'].unique()

In [38]:
XGB_resids_ACS_df['residual']=XGB_resids_ACS_df['zri_predict']-XGB_resids_ACS_df['zri_actual']
XGB_resids_ACS_df['residual_squared']=XGB_resids_ACS_df['residual'].map(lambda x: x**2)
rmse_table=XGB_resids_ACS_df.groupby('zip')[['residual_squared']].agg(['sum','count'])
rmse_table['residual_avgd']=rmse_table['residual_squared']['sum']/rmse_table['residual_squared']['count']
rmse_table['residual_avgd']=rmse_table['residual_avgd'].apply(lambda x: x**0.5)
rmse_XGB_resids_ACS_df = rmse_table[['residual_avgd']].reset_index().rename(columns={'residual_avgd': "RMSE"})


In [39]:
rmse_XGB_resids_ACS_df

Unnamed: 0,zip,RMSE
,,
0,01013,41.385819
1,01020,74.062764
2,01040,81.568629
3,01085,57.763297
4,01104,66.129601
...,...,...
1296,99207,148.190618
1297,99501,50.605442
1298,99504,69.650661


In [40]:
zip_county_merged = rmse_XGB_resids_ACS_df.merge(zip_to_county_df,
                                                       how= 'left', 
                                                       left_on= "zip", 
                                                       right_on = "ZIP")


merging between different levels can give an unintended result (2 levels on the left,1 on the right)



In [41]:
zip_county_merged.rename(columns={" (RMSE, )": "RMSE"})

Unnamed: 0,"(zip, )","(RMSE, )",ZIP,COUNTY
0,01013,41.385819,01013,25013
1,01020,74.062764,01020,25013
2,01040,81.568629,01040,25013
3,01085,57.763297,01085,25013
4,01104,66.129601,01104,25013
...,...,...,...,...
1488,99207,148.190618,99207,53063
1489,99501,50.605442,99501,02020
1490,99504,69.650661,99504,02020
1491,99508,39.795373,99508,02020


In [42]:
zip_county_mean = zip_county_merged.groupby('COUNTY').mean().reset_index()

In [43]:
zip_county_mean = zip_county_mean.set_axis(['COUNTY', 'RMSE'], axis=1)

In [44]:
counties_listed = zip_county_mean['COUNTY'].unique()

In [45]:
output_dict = [x for x in counties['features'] if x['id'] in counties_listed]
counties['features'] = output_dict
counties_json = json.dumps(counties)

In [46]:
# gdp_df=pd.read_csv('../data/GDP_byZip.csv')
# taxes_df=pd.read_csv('../data/final_TAXES_by_zip.csv', dtype={'ZIPCODE': str})
shapes = gpd.read_file("../data/cb_2013_us_zcta510_500k/cb_2013_us_zcta510_500k.shp")
# # geo_shapes = json.loads("../data/cb_2013_us_zcta510_500k/cb_2013_us_zcta510_500k.json")

# with open("../data/cb_2013_us_zcta510_500k/cb_2013_us_zcta510_500k.json") as f:
#     geo_shapes = geojson.load(f)

In [47]:
shapes = shapes.loc[shapes['GEOID10'].isin(zipcodes)]

In [48]:
shapes = shapes.rename(columns={"GEOID10": "zip"})

In [49]:
shapes_cleaned = shapes.to_crs(epsg=4326)
shapes_cleaned.to_file("../data/zip_GeoJSON.json", driver = "GeoJSON")
with open("../data/zip_GeoJSON.json") as geofile:
    geoj_file = json.load(geofile)

In [60]:
# url = (
#     "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data"
# )
# state_geo = f"{url}/us-states.json"
# state_unemployment = f"{url}/US_Unemployment_Oct2012.csv"
# state_data = pd.read_csv(state_unemployment)

# m = folium.Map(location=[48, -102], zoom_start=3,
#                tiles = 'https://server.arcgisonline.com/ArcGIS/rest/services/World_Physical_Map/MapServer/tile/{z}/{y}/{x}',
#                attr= 'Tiles &copy; Esri &mdash; Source: US National Park Service')
m = folium.Map(width=600, height=500, location=[33.995, -118.3], zoom_start=10.25,
               tiles = "cartodbpositron")
# 'https://server.arcgisonline.com/ArcGIS/rest/services/World_Physical_Map/MapServer/tile/{z}/{y}/{x}', {
# 	attribution: 'Tiles &copy; Esri &mdash; Source: US National Park Service',
# 	maxZoom: 8

folium.Choropleth(
    geo_data=geoj_file,
    name="Zipcodes",
    data=rmse_XGB_resids_ACS_df,
    columns=["zip", "RMSE"],
    key_on="feature.properties.zip",
    fill_color="BuPu",
    bins=[0, 50, 100, 150, 200, 250, 300, 350, 400, 600],
    fill_opacity=.7,
#     line_opacity=0.00001,
    line_weight = 0.3,
    legend_name="RMSE",
    highlight=True,
).add_to(m)

# folium.TileLayer('cartodbpositron').add_to(m)

m

In [61]:
# url = (
#     "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data"
# )
# state_geo = f"{url}/us-states.json"
# state_unemployment = f"{url}/US_Unemployment_Oct2012.csv"
# state_data = pd.read_csv(state_unemployment)

# m = folium.Map(location=[48, -102], zoom_start=3,
#                tiles = 'https://server.arcgisonline.com/ArcGIS/rest/services/World_Physical_Map/MapServer/tile/{z}/{y}/{x}',
#                attr= 'Tiles &copy; Esri &mdash; Source: US National Park Service')

m = folium.Map(width=600, height=500, location=[34.01, -118.17], zoom_start=12.3,
               tiles = "cartodbpositron")
# 'https://server.arcgisonline.com/ArcGIS/rest/services/World_Physical_Map/MapServer/tile/{z}/{y}/{x}', {
# 	attribution: 'Tiles &copy; Esri &mdash; Source: US National Park Service',
# 	maxZoom: 8

folium.Choropleth(
    geo_data=geoj_file,
    name="Zipcodes",
    data=rmse_XGB_resids_ACS_df,
    columns=["zip", "RMSE"],
    key_on="feature.properties.zip",
    fill_color="BuPu",
    bins=[0, 50, 100, 150, 200, 250, 300, 350, 400, 600],
    fill_opacity=.7,
#     line_opacity=0.00001,
    line_weight = 0.3,
    legend_name="RMSE",
    highlight=True,
).add_to(m)

# folium.TileLayer('cartodbpositron').add_to(m)

m

In [51]:
# url = (
#     "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data"
# )
# state_geo = f"{url}/us-states.json"
# state_unemployment = f"{url}/US_Unemployment_Oct2012.csv"
# state_data = pd.read_csv(state_unemployment)

m = folium.Map(width=600,height=600, location=[34, -118], zoom_start=5, tiles = "cartodbpositron")

folium.Choropleth(
    geo_data=counties,
    name="Counties",
    data=zip_county_mean,
    columns=["COUNTY", "RMSE"],
    key_on="feature.id",
    fill_color="BuPu",
#     bins=[7, 15, 30, 50, 70, 90, 110, 302],
    fill_opacity=.7,
#     line_opacity=0.00001,
    line_weight = 0.3,
    legend_name="RMSE",
    highlight=True,
).add_to(m)

# folium.LayerControl().add_to(m)

m

In [52]:
with open("../../data/georef-united-states-of-america-zc-point.geojson") as geofile:
    zips_geoj_file = json.load(geofile)

In [53]:
zips_geo_df = pd.read_csv('../../data/georef-united-states-of-america-zc-point.csv',
                          sep=';', dtype={'Zip Code': str})

In [54]:
zips_geo_df['Zip Code'] = zips_geo_df['Zip Code'].map(lambda x: '0' + x if len(x) < 5 else x)

In [55]:

zips_geo_df = zips_geo_df.loc[zips_geo_df['Zip Code'].isin(zipcodes)]
zips_geo_df['Zip Code'].nunique()

1301

In [56]:
zips_geo_df

Unnamed: 0,Zip Code,Official USPS city name,Official USPS State Code,Official State Name,ZCTA,ZCTA parent,Population,Density,Primary Official County Code,Primary Official County Name,County Weights,Official County Name,Official County Code,Imprecise,Military,Timezone,Geo Point
15,32224,Jacksonville,FL,Florida,True,,44058.0,803.8,12031,Duval,"{""12031"": ""99.89"", ""12109"": ""0.11""}",Duval|St. Johns,12031|12109,False,False,America/New_York,"30.27027,-81.46796"
32,33634,Tampa,FL,Florida,True,,22719.0,1069.3,12057,Hillsborough,"{""12057"": ""100""}",Hillsborough,12057,False,False,America/New_York,"28.00884,-82.54618"
38,34698,Dunedin,FL,Florida,True,,38720.0,1463.7,12103,Pinellas,"{""12103"": ""100""}",Pinellas,12103,False,False,America/New_York,"28.03399,-82.77961"
136,43204,Columbus,OH,Ohio,True,,42796.0,1837.5,39049,Franklin,"{""39049"": ""100""}",Franklin,39049,False,False,America/New_York,"39.96136,-83.0816"
191,46203,Indianapolis,IN,Indiana,True,,38581.0,1076.1,18097,Marion,"{""18097"": ""100""}",Marion,18097,False,False,America/Indiana/Indianapolis,"39.73761,-86.0969"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33000,22204,Arlington,VA,Virginia,True,,54617.0,5117.9,51013,Arlington,"{""51013"": ""100""}",Arlington,51013,False,False,America/New_York,"38.86083,-77.09894"
33010,23602,Newport News,VA,Virginia,True,,38821.0,1117.9,51700,Newport News,"{""51700"": ""99.76"", ""51199"": ""0.24""}",Newport News|York,51700|51199,False,False,America/New_York,"37.1148,-76.51582"
33057,27705,Durham,NC,North Carolina,True,,49508.0,436.5,37063,Durham,"{""37063"": ""90.43"", ""37135"": ""9.57""}",Durham|Orange,37063|37135,False,False,America/New_York,"36.02698,-78.98065"
33098,30308,Atlanta,GA,Georgia,True,,18839.0,4561.5,13121,Fulton,"{""13121"": ""100""}",Fulton,13121,False,False,America/New_York,"33.77121,-84.3781"


In [57]:
zips_geo_df[['Latitude','Longitude']] = zips_geo_df['Geo Point'].str.split(',',expand=True)

In [58]:
output_dict = [x for x in counties['features'] if x['zip_code'] in counties_listed]
counties['features'] = output_dict
counties_json = json.dumps(counties)

KeyError: 'zip_code'

In [None]:
# fig = px.density_mapbox(zips_geo_df, lat='Latitude', lon='Longitude', z='Population', radius=4,
#                         center=dict(lat=37.0902, lon=-95.7129), zoom=2,
#                         mapbox_style="stamen-terrain")
# fig.show()

In [None]:
# fig = go.Figure(go.Densitymapbox(lat=zips_geo_df.Latitude, 
#                                  lon=zips_geo_df.Longitude, 
#                                  z=zips_geo_df.Population,
#                                  radius=3))
# fig.update_layout(mapbox_style="stamen-terrain", 
#                   mapbox_center_lat= 37.0902, 
#                   mapbox_center_lon=-95.7129)
# fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0},
#                  width=800, height=450)
# fig.update_traces(colorbar_title_text="Population", 
#                   selector=dict(type='densitymapbox'))
# fig.update_traces(reversescale=True, 
#                   selector=dict(type='densitymapbox'))
# fig.show()

In [None]:
fig = px.choropleth_mapbox(zip_county_mean, geojson=counties, locations='COUNTY', color='RMSE',
                           color_continuous_scale="Viridis",
                           range_color=(7, 110),
                           mapbox_style="carto-positron",
                           zoom=3, center = {"lat": 37.0902, "lon": -95.7129},
                           opacity=0.8,
                           labels={'RMSE':'RMSE'}
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0},
                 width=800, height=450)
fig.show()



# folium.Choropleth(
#     geo_data=counties,
#     name="Counties",
#     data=zip_county_mean_test,
#     columns=["COUNTY", "1"],
#     key_on="feature.id",
#     fill_color="BuPu",
# #     bins=[0, 10, 15, 20, 25, 30, 35, 45],
#     fill_opacity=.5,
# #     line_opacity=0.00001,
#     line_weight = 0.3,
#     legend_name="Playing around",
#     highlight=True,
# ).add_to(m)

# folium.LayerControl().add_to(m)

# m

In [None]:
fig = px.choropleth_mapbox(rmse_XGB_resids_auto_only_df, geojson=geoj_file, locations='zip', color='RMSE',
                           color_continuous_scale="Viridis",
                           range_color=(5, 300),
                           mapbox_style="carto-positron",
                           zoom=3, center = {"lat": 37.0902, "lon": -95.7129},
                           opacity=1,
                           labels={'RMSE':'RMSE'}
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0},
                 width=800, height=450)
fig.show()


In [None]:
rmse_XGB_resids_auto_only_df[rmse_XGB_resids_auto_only_df['zip'] == "33161"]

In [None]:
geoj_file

In [None]:
geoj_file

In [None]:
counties