In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow as pa
import pyarrow.parquet as pq
import geopandas as gpd
import seaborn as sns

#Packages
import matplotlib.ticker as mtick
from scipy import stats
pd.set_option('display.max_columns', None)

from scipy.stats import skew, kurtosis

import pygris
import shapely

In [2]:
df = pd.read_parquet("FimaNfipClaims.parquet.gzip")

In [3]:
df_copy = df[['state', 'reportedZipCode', 'countyCode', 'censusTract', 'censusBlockGroupFips', 'latitude', 'longitude', 'yearOfLoss']].drop_duplicates()

## Using Pygris

In [4]:
states = pygris.states()
states_list = sorted(states['STUSPS'].unique())

Using the default year of 2021


## BlockGroup Shapefile

In [5]:
BG_list_2018 = []

for i in states_list:
    BG_list_2018.append(pygris.block_groups(state = i, year = 2018))

Using FIPS code '02' for input 'AK'
Using FIPS code '01' for input 'AL'
Using FIPS code '05' for input 'AR'
Using FIPS code '60' for input 'AS'
Using FIPS code '04' for input 'AZ'
Using FIPS code '06' for input 'CA'
Using FIPS code '08' for input 'CO'
Using FIPS code '09' for input 'CT'
Using FIPS code '11' for input 'DC'
Using FIPS code '10' for input 'DE'
Using FIPS code '12' for input 'FL'
Using FIPS code '13' for input 'GA'
Using FIPS code '66' for input 'GU'
Using FIPS code '15' for input 'HI'
Using FIPS code '19' for input 'IA'
Using FIPS code '16' for input 'ID'
Using FIPS code '17' for input 'IL'
Using FIPS code '18' for input 'IN'
Using FIPS code '20' for input 'KS'
Using FIPS code '21' for input 'KY'
Using FIPS code '22' for input 'LA'
Using FIPS code '25' for input 'MA'
Using FIPS code '24' for input 'MD'
Using FIPS code '23' for input 'ME'
Using FIPS code '26' for input 'MI'
Using FIPS code '27' for input 'MN'
Using FIPS code '29' for input 'MO'
Using FIPS code '69' for inp

In [6]:
BG_list_2010 = []

for i in states_list:
    BG_list_2010.append(pygris.block_groups(state = i, year = 2010))

Using FIPS code '02' for input 'AK'
Using FIPS code '01' for input 'AL'
Using FIPS code '05' for input 'AR'
Using FIPS code '60' for input 'AS'
Using FIPS code '04' for input 'AZ'
Using FIPS code '06' for input 'CA'
Using FIPS code '08' for input 'CO'
Using FIPS code '09' for input 'CT'
Using FIPS code '11' for input 'DC'
Using FIPS code '10' for input 'DE'
Using FIPS code '12' for input 'FL'
Using FIPS code '13' for input 'GA'
Using FIPS code '66' for input 'GU'
Using FIPS code '15' for input 'HI'
Using FIPS code '19' for input 'IA'
Using FIPS code '16' for input 'ID'
Using FIPS code '17' for input 'IL'
Using FIPS code '18' for input 'IN'
Using FIPS code '20' for input 'KS'
Using FIPS code '21' for input 'KY'
Using FIPS code '22' for input 'LA'
Using FIPS code '25' for input 'MA'
Using FIPS code '24' for input 'MD'
Using FIPS code '23' for input 'ME'
Using FIPS code '26' for input 'MI'
Using FIPS code '27' for input 'MN'
Using FIPS code '29' for input 'MO'
Using FIPS code '69' for inp

In [17]:
temp2018 = pd.concat(BG_list_2018, ignore_index=True)
temp2010 = pd.concat(BG_list_2010, ignore_index=True)

In [18]:
temp2018 = temp2018[['GEOID', 'geometry']]

temp2010 = temp2010[['GEOID10', 'geometry']]

temp2010 = temp2010.rename(columns={"GEOID10": "GEOID"})

In [19]:
list_censusBG_2 = []

In [40]:
list_censusBG = temp2010['GEOID'].dropna().drop_duplicates().astype(str).tolist()

# add preceding 0 to make 4-digit zip codes into 5-digit
list_censusBG = [censusBG.zfill(12) for censusBG in list_censusBG]

In [41]:
len(list_censusBG)

220742

In [42]:
list_censusBG_2 = list_censusBG_2 + list_censusBG

In [43]:
len(list_censusBG_2)

441482

In [44]:
list_censusBG_2 = list(set(list_censusBG_2))

In [45]:
len(list_censusBG_2)

220814

In [46]:
#CensusBG list for NC

list_censusBG = df['censusBlockGroupFips'].dropna().drop_duplicates().tolist()

list_censusBG = [str(int(float(i))) for i in list_censusBG]

# add preceding 0 to make 4-digit zip codes into 5-digit
list_censusBG = [censusBG.zfill(12) for censusBG in list_censusBG]

In [47]:
# Convert both lists to sets
set_bg = set(list_censusBG)
set_bg_2 = set(list_censusBG_2)

# Find the intersection of the two sets
common_bg = set_bg.intersection(set_bg_2)

# Print the number of common zip codes
print("Number of common BG:", len(common_bg))

Number of common BG: 106275


In [48]:
len(common_bg)/len(list_censusBG)

0.9912140798567391

In [52]:
len(common_bg)

106275

In [53]:
BG_list_df2018 = temp2018[temp2018['GEOID'].isin(common_bg)]
BG_list_df2010 = temp2010[temp2010['GEOID'].isin(common_bg)]

In [54]:
BG_df = pd.concat([BG_list_df2018, BG_list_df2010], ignore_index=True)

In [55]:
BG_df2 = BG_df.drop_duplicates()

In [56]:
BG_df.shape[0] 

212497

In [57]:
BG_df2.shape[0]

212087

In [58]:
BG_df2 = BG_df2.drop_duplicates(subset=['GEOID'])

In [59]:
BG_df2.shape[0]

106275

In [60]:
chunk_size = 40000  # adjust based on your system's capabilities
chunks = [x for x in range(0, len(BG_df2), chunk_size)]

for start in chunks:
    end = start + chunk_size
    temp_df = BG_df2.iloc[start:end].copy()
    temp_df['geometry'] = temp_df['geometry'].apply(lambda geom: geom.wkt)
    temp_df.to_parquet(f"BG_geometry_{start}_{end}.parquet.gzip", compression='gzip')



## Census Tract Shapefile

In [9]:
Tracts_list_2010 = []

for i in states_list:
    Tracts_list_2010.append(pygris.tracts(state = i, year = 2010))

Using FIPS code '02' for input 'AK'
Using FIPS code '01' for input 'AL'
Using FIPS code '05' for input 'AR'
Using FIPS code '60' for input 'AS'
Using FIPS code '04' for input 'AZ'
Using FIPS code '06' for input 'CA'
Using FIPS code '08' for input 'CO'
Using FIPS code '09' for input 'CT'
Using FIPS code '11' for input 'DC'
Using FIPS code '10' for input 'DE'
Using FIPS code '12' for input 'FL'
Using FIPS code '13' for input 'GA'
Using FIPS code '66' for input 'GU'
Using FIPS code '15' for input 'HI'
Using FIPS code '19' for input 'IA'
Using FIPS code '16' for input 'ID'
Using FIPS code '17' for input 'IL'
Using FIPS code '18' for input 'IN'
Using FIPS code '20' for input 'KS'
Using FIPS code '21' for input 'KY'
Using FIPS code '22' for input 'LA'
Using FIPS code '25' for input 'MA'
Using FIPS code '24' for input 'MD'
Using FIPS code '23' for input 'ME'
Using FIPS code '26' for input 'MI'
Using FIPS code '27' for input 'MN'
Using FIPS code '29' for input 'MO'
Using FIPS code '69' for inp

In [10]:
Tracts_list_2018 = []

for i in states_list:
    Tracts_list_2018.append(pygris.tracts(state = i, year = 2018))

Using FIPS code '02' for input 'AK'
Using FIPS code '01' for input 'AL'
Using FIPS code '05' for input 'AR'
Using FIPS code '60' for input 'AS'
Using FIPS code '04' for input 'AZ'
Using FIPS code '06' for input 'CA'
Using FIPS code '08' for input 'CO'
Using FIPS code '09' for input 'CT'
Using FIPS code '11' for input 'DC'
Using FIPS code '10' for input 'DE'
Using FIPS code '12' for input 'FL'
Using FIPS code '13' for input 'GA'
Using FIPS code '66' for input 'GU'
Using FIPS code '15' for input 'HI'
Using FIPS code '19' for input 'IA'
Using FIPS code '16' for input 'ID'
Using FIPS code '17' for input 'IL'
Using FIPS code '18' for input 'IN'
Using FIPS code '20' for input 'KS'
Using FIPS code '21' for input 'KY'
Using FIPS code '22' for input 'LA'
Using FIPS code '25' for input 'MA'
Using FIPS code '24' for input 'MD'
Using FIPS code '23' for input 'ME'
Using FIPS code '26' for input 'MI'
Using FIPS code '27' for input 'MN'
Using FIPS code '29' for input 'MO'
Using FIPS code '69' for inp

In [61]:
temp2018 = pd.concat(Tracts_list_2018, ignore_index=True)
temp2010 = pd.concat(Tracts_list_2010, ignore_index=True)

In [62]:
temp2018 = temp2018[['GEOID', 'geometry']]

temp2010 = temp2010.rename(columns={"GEOID10": "GEOID"})

temp2010 = temp2010[['GEOID', 'geometry']]

In [63]:
list_censusBG_2 = []

In [96]:
list_censusBG = temp2010['GEOID'].dropna().drop_duplicates().astype(str).tolist()

# add preceding 0 to make 4-digit zip codes into 5-digit
list_censusBG = [censusBG.zfill(11) for censusBG in list_censusBG]

In [97]:
len(list_censusBG)

74133

In [98]:
list_censusBG_2 = list_censusBG_2 + list_censusBG

In [99]:
len(list_censusBG_2)

148292

In [100]:
list_censusBG_2 = list(set(list_censusBG_2))

In [101]:
len(list_censusBG_2)

74159

In [102]:
#CensusTract

list_censusBG = df['censusTract'].dropna().drop_duplicates().tolist()

list_censusBG = [str(int(float(i))) for i in list_censusBG]

# add preceding 0 to make 10-digit Tracts into 11-digit
list_censusBG = [censustract.zfill(11) for censustract in list_censusBG]

In [103]:
# Convert both lists to sets
set_bg = set(list_censusBG)
set_bg_2 = set(list_censusBG_2)

# Find the intersection of the two sets
common_bg = set_bg.intersection(set_bg_2)

# Print the number of common zip codes
print("Number of common Tracts:", len(common_bg))

Number of common Tracts: 52792


In [104]:
len(common_bg)/len(list_censusBG)

0.9873569237674871

In [105]:
len(set_bg)

53468

In [106]:
len(common_bg)

52792

In [107]:
Tract_list_df2018 = temp2018[temp2018['GEOID'].isin(common_bg)]
Tract_list_df2010 = temp2010[temp2010['GEOID'].isin(common_bg)]

In [108]:
Tract_df = pd.concat([Tract_list_df2018, Tract_list_df2010], ignore_index=True)

In [109]:
Tract_df2 = Tract_df.drop_duplicates()

In [110]:
Tract_df.shape[0] 

105556

In [111]:
Tract_df2.shape[0] 

105484

In [112]:
Tract_df2 = Tract_df2.drop_duplicates(subset=['GEOID'])

In [113]:
Tract_df2.shape[0] 

52792

In [119]:
Tract_df2 = Tract_df2.rename(columns={"GEOID": "censusTractID"})

In [120]:
chunk_size = 30000  # adjust based on your system's capabilities
chunks = [x for x in range(0, len(Tract_df2), chunk_size)]

for start in chunks:
    end = start + chunk_size
    temp_df = Tract_df2.iloc[start:end].copy()
    temp_df['geometry'] = temp_df['geometry'].apply(lambda geom: geom.wkt)
    temp_df.to_parquet(f"Tract_geometry_{start}_{end}.parquet.gzip", compression='gzip')



### County Shapefile

In [13]:
years = [2010, 2018]

yearly_data = {}

for i in years:
    yearly_data[i] = pygris.counties(year=i)

In [127]:
temp2010 = pd.DataFrame(yearly_data[2010])
temp2018 = pd.DataFrame(yearly_data[2018])

In [128]:
temp2010 = temp2010[['GEOID10', 'geometry']]
temp2010 = temp2010.rename(columns={"GEOID10": "GEOID"})

temp2018 = temp2018[['GEOID', 'geometry']]

In [129]:
list_county_3 = []

In [140]:
list_county_2 = temp2018['GEOID'].dropna().drop_duplicates().astype(str).tolist()

# add preceding 0 to make 4-digit zip codes into 5-digit
list_county_2 = [censusBG.zfill(5) for censusBG in list_county_2]

In [141]:
len(list_county_2)

3233

In [142]:
list_county_3 = list_county_3 + list_county_2

In [143]:
len(list_county_3)

6454

In [144]:
list_county_3 = list(set(list_county_3))

In [145]:
len(list_county_3)

3236

In [146]:
#County

list_county = df['countyCode'].dropna().drop_duplicates().tolist()

list_county = [str(int(float(i))) for i in list_county]

# add preceding 0 to make 4-digit zip codes into 5-digit
list_county = [county.zfill(5) for county in list_county]

In [147]:
# Convert both lists to sets
set_county = set(list_county)
set_county_2 = set(list_county_3)

# Find the intersection of the two sets
common_county = set_county.intersection(set_county_2)

# Print the number of common zip codes
print("Number of common Tract:", len(common_county))

Number of common Tract: 2924


In [148]:
len(common_county)/len(set_county)

1.0

In [149]:
len(common_county)

2924

In [150]:
County_list_df2018 = temp2018[temp2018['GEOID'].isin(common_county)]
County_list_df2010 = temp2010[temp2010['GEOID'].isin(common_county)]

In [151]:
County_df = pd.concat([County_list_df2018, County_list_df2010], ignore_index=True)

In [152]:
County_df2 = County_df.drop_duplicates()

In [153]:
County_df.shape[0] 

5842

In [154]:
County_df2.shape[0] 

5842

In [155]:
County_df2 = County_df2.drop_duplicates(subset=['GEOID'])

In [156]:
County_df2.shape[0] 

2924

In [157]:
County_df2 = County_df2.rename(columns={"GEOID": "CountyID"})

In [158]:
County_df2['geometry'] = County_df2['geometry'].apply(lambda geom: geom.wkt)

County_df2.to_parquet(f"County_geometry.parquet.gzip", compression='gzip')

### Zipcode Shapefile

In [16]:
years = [2000, 2010, 2018, 2020, 2022]

yearly_data_zip = {}

for i in years:
    yearly_data_zip[i] = pygris.zctas(year=i)

In [159]:
temp2000 = pd.DataFrame(yearly_data_zip[2000])
temp2010 = pd.DataFrame(yearly_data_zip[2010])
temp2018 = pd.DataFrame(yearly_data_zip[2018])
temp2020 = pd.DataFrame(yearly_data_zip[2020])
temp2022 = pd.DataFrame(yearly_data_zip[2022])

In [162]:
temp2022 = temp2022.rename(columns={"ZCTA5CE20": "ZIPcode"})
temp2022 = temp2022[['ZIPcode', 'geometry']]
temp2022['year'] = 2020

temp2020 = temp2020.rename(columns={"ZCTA5CE20": "ZIPcode"})
temp2020 = temp2020[['ZIPcode', 'geometry']]
temp2020['year'] = 2020

temp2018 = temp2018.rename(columns={"ZCTA5CE10": "ZIPcode"})
temp2018 = temp2018[['ZIPcode', 'geometry']]
temp2018['year'] = 2010

temp2010 = temp2010.rename(columns={"ZCTA5CE10": "ZIPcode"})
temp2010 = temp2010[['ZIPcode', 'geometry']]
temp2010['year'] = 2010

temp2000 = temp2000.rename(columns={"ZCTA5CE00": "ZIPcode"})
temp2000 = temp2000[['ZIPcode', 'geometry']]
temp2000['year'] = 2000

In [163]:
list_zipcode_3 = []

In [205]:
list_zipcode_temp = temp2000['ZIPcode'].dropna().drop_duplicates().astype(str).tolist()

# add preceding 0 to make 4-digit zip codes into 5-digit
list_zipcode_temp = [zipcode.zfill(5) for zipcode in list_zipcode_temp]

In [206]:
len(list_zipcode_temp)

32038

In [207]:
list_zipcode_3 = list_zipcode_3 + list_zipcode_temp

In [208]:
len(list_zipcode_3)

66033

In [209]:
list_zipcode_3 = list(set(list_zipcode_3))

In [210]:
len(list_zipcode_3)

34439

In [211]:
#County

list_zipcode = df['reportedZipCode'].dropna().drop_duplicates().tolist()

list_zipcode = [str(int(float(i))) for i in list_zipcode]

# add preceding 0 to make 4-digit zip codes into 5-digit
list_zipcode = [zipcode.zfill(5) for zipcode in list_zipcode]

In [212]:
set_zipcode = set(list_zipcode)
set_zipcode_3 = set(list_zipcode_3)

# Find the intersection of the two sets
common_zipcodes = set_zipcode.intersection(set_zipcode_3)

# Print the number of common zip codes
print("Number of common zip codes:", len(common_zipcodes))

Number of common zip codes: 22887


In [213]:
len(common_zipcodes)/len(list_zipcode)

0.8804724167115489

In [214]:
len(common_zipcodes)

22887

In [215]:
Zipcode_list_df2022 = temp2022[temp2022['ZIPcode'].isin(common_zipcodes)]
Zipcode_list_df2020 = temp2020[temp2020['ZIPcode'].isin(common_zipcodes)]
Zipcode_list_df2018 = temp2018[temp2018['ZIPcode'].isin(common_zipcodes)]
Zipcode_list_df2010 = temp2010[temp2010['ZIPcode'].isin(common_zipcodes)]
Zipcode_list_df2000 = temp2000[temp2000['ZIPcode'].isin(common_zipcodes)]

In [216]:
Zipcode_df = pd.concat([Zipcode_list_df2022, Zipcode_list_df2020 , Zipcode_list_df2018, Zipcode_list_df2010, Zipcode_list_df2000], ignore_index=True)

In [217]:
Zipcode_df2 = Zipcode_df.drop_duplicates()

In [218]:
Zipcode_df.shape[0] 

112329

In [219]:
Zipcode_df2.shape[0] 

94615

In [220]:
Zipcode_df2 = Zipcode_df2.drop_duplicates(subset=['ZIPcode', 'year'])

In [221]:
Zipcode_df2.shape[0] 

67147

In [222]:
chunk_size = 40000  # adjust based on your system's capabilities
chunks = [x for x in range(0, len(Zipcode_df2), chunk_size)]

for start in chunks:
    end = start + chunk_size
    temp_df = Zipcode_df2.iloc[start:end].copy()
    temp_df['geometry'] = temp_df['geometry'].apply(lambda geom: geom.wkt)
    temp_df.to_parquet(f"zipcode_geometry_{start}_{end}.parquet.gzip", compression='gzip')