In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow as pa
import pyarrow.parquet as pq
import geopandas as gpd
import seaborn as sns

#Packages
import matplotlib.ticker as mtick
from scipy import stats
pd.set_option('display.max_columns', None)

from scipy.stats import skew, kurtosis

import pygris
from shapely.geometry import Polygon

import shapely

In [2]:
df = pd.read_parquet("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/FimaNfipClaims.parquet.gzip")

In [3]:
df.shape[0]

2597460

In [4]:
df_copy = df.copy()

In [5]:
# Define bins and labels for yearOfLoss_1990_2021
bins_1980_2021 = [df_copy['yearOfLoss'].min(), 2000, 2010, 2020, df_copy['yearOfLoss'].max() + 1]
labels_1980_2021 = [1990, 2000, 2010, 2020]

df_copy['yearOfLoss_1990_2021'] = pd.cut(df_copy['yearOfLoss'], bins=bins_1980_2021, labels=labels_1980_2021, right=False).astype(int)

In [6]:
df_copy['yearOfLoss_1990_2021'].value_counts()

1990    1040310
2000     705871
2010     696576
2020     154703
Name: yearOfLoss_1990_2021, dtype: int64

# Checking for missing shapefiles after dropping NA of the mentioned geographic unit..

##### The units are: ['state',  'latitude', 'longitude', 'reportedZipCode', 'countyCode', 'censusTract', 'censusBlockGroupFips']

### 1. State

#### We are assuming that the geometries of state didn't change for our dataset (According to Wikipedia article (https://en.wikipedia.org/wiki/Territorial_evolution_of_the_United_States) all the changes to US map are either outside the mainland or too small to map. No missing shapefile for states.

### 2. Latitude/Longitude

In [None]:
test = df_copy[df_copy['latitude'].notna() & df_copy['longitude'].notna()]

In [None]:
print('Total observations available:', test.shape[0])
print('Observations lost:', df.shape[0]-test.shape[0])
print('Percentage NA:', (df.shape[0]-test.shape[0])*100/df.shape[0])
print('Total unique combinations of lat/long:', test.drop_duplicates(subset=['latitude', 'longitude']).shape[0])

In [None]:
print('Value_counts of combinations of lat/long per decade:\n', test.drop_duplicates(subset=['latitude', 'longitude', 'yearOfLoss_1990_2021'])['yearOfLoss_1990_2021'].value_counts())

#### Since latitude and longitude doesn't change over time, we are able to generate all the respective geometries`

### 3. reportedZipCode

In [7]:
test = df[df['reportedZipCode'].notna()]

In [8]:
print('Total observations available:', test.shape[0])
print('Observations lost:', df.shape[0]-test.shape[0])
print('Percentage NA:', (df.shape[0]-test.shape[0])*100/df.shape[0])
print('Total unique zipcodes:', test.drop_duplicates(subset=['reportedZipCode']).shape[0])

Total observations available: 2541633
Observations lost: 55827
Percentage NA: 2.1492920006467857
Total unique zipcodes: 25994


##### Now we can calculate for each decade how many of the zipcodes have shapefiles with geometry attribute. For zipcodes we only have shapefiles available post 2000 (inclusive) in open source.

In [9]:
# Read shapefile of zipcode

chunk_size = 50000
chunks = [x for x in range(0, 300000, chunk_size)]
gdf_list = []

for start in chunks:
    end = start + chunk_size
    temp_df = pd.read_parquet(f"C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/zipcode_geometry_{start}_{end}.parquet.gzip")
    gdf_read = gpd.GeoDataFrame(temp_df, geometry=temp_df['geometry'].apply(lambda x: shapely.wkt.loads(x)))
    gdf_list.append(gdf_read)

gdf1 = pd.concat(gdf_list, ignore_index=True)

df2 = pd.read_parquet(f"C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/zipcode_geometry_2011_union.parquet.gzip")
df2 = df2.drop(columns='flag')

gdf2 = gpd.GeoDataFrame(df2, geometry=df2['geometry'].apply(shapely.wkt.loads))

zipcode_df = pd.concat([gdf1, gdf2], ignore_index=True)
zipcode_df.rename(columns={'geometry': 'geometry_zipcode'}, inplace=True)

In [12]:
zipcode_df21 = zipcode_df
zipcode_df21['year_test'] = zipcode_df21['year'].apply(lambda x: 2000 if x == 2000 else (2010 if 2010 <= x <= 2019 else 2020))
zipcode_df21 = zipcode_df21.drop(columns=['year'])
zipcode_df3 = zipcode_df[zipcode_df['year'] == 2010]

In [15]:
zipcode_df2 = zipcode_df21.drop_duplicates(subset=['ZIPcode', 'year_test'])

In [17]:
df_geographic_unique = df[['reportedZipCode', 'yearOfLoss']].drop_duplicates()
df_geographic_unique = df_geographic_unique.dropna(subset=['reportedZipCode'])

In [18]:
df_geographic_unique['reportedZipCode'] = df_geographic_unique['reportedZipCode'].dropna().astype(int).astype(str)
df_geographic_unique['reportedZipCode'] = [zipcode.zfill(5) for zipcode in df_geographic_unique['reportedZipCode']]

In [19]:
# Define bins and labels for yearOfLoss_1990_2021
bins_1990_2021 = [df_geographic_unique['yearOfLoss'].min(), 2000, 2010, 2020, df_geographic_unique['yearOfLoss'].max() + 1]
labels_1990_2021 = [1990, 2000, 2010, 2020]

df_geographic_unique['yearOfLoss_1990_2021'] = pd.cut(df_geographic_unique['yearOfLoss'], bins=bins_1990_2021, labels=labels_1990_2021, right=False).astype(int)

df_geographic_unique = df_geographic_unique.drop(columns='yearOfLoss')

In [20]:
df_geographic_unique = df_geographic_unique.drop_duplicates()

In [21]:
# Filter for post 2000 since we don't have shapefiles pre 2000 for zipcodes
df_geographic_unique = df_geographic_unique[df_geographic_unique['yearOfLoss_1990_2021']!=1990]

In [22]:
zipcode_df2.head()

Unnamed: 0,ZIPcode,geometry_zipcode,year_test
0,99519,"POLYGON ((-148.34155 70.25088, -148.34207 70.2...",2010
9,99557,"POLYGON ((-159.72031 61.57485, -159.71881 61.5...",2020
12,99557,"POLYGON ((-160.76967 61.18977, -160.76881 61.1...",2010
21,99740,"POLYGON ((-145.55167 68.14857, -145.54375 68.1...",2010
30,99740,"POLYGON ((-145.76995 67.55872, -145.76961 67.5...",2020


In [23]:
# Initial mapping with multi-index
zipcode_df2.set_index(['ZIPcode', 'year_test'], inplace=True)
df_geographic_unique['geometry_zipcode'] = df_geographic_unique.set_index(['reportedZipCode','yearOfLoss_1990_2021']).index.map(zipcode_df2['geometry_zipcode'])

# Find rows with NaN values 
na_rows = df_geographic_unique['geometry_zipcode'].isna()

zipcode_df2.reset_index(inplace=True)

In [26]:
# Initial mapping with multi-index
zipcode_df3.set_index(['ZIPcode'], inplace=True)
df_geographic_unique['geometry_zipcode'] = df_geographic_unique.set_index(['reportedZipCode']).index.map(zipcode_df3['geometry_zipcode'])

# Find rows with NaN values 
na_rows = df_geographic_unique['geometry_zipcode'].isna()

zipcode_df3.reset_index(inplace=True)

In [27]:
test = df_geographic_unique.loc[na_rows, ]
print('Total missing:', test[['reportedZipCode']].drop_duplicates().shape[0])
print('Number of reportedZipCode per decade without a matching geometry')
print(test.drop_duplicates(subset=['reportedZipCode', 'yearOfLoss_1990_2021'])['yearOfLoss_1990_2021'].value_counts())

Total missing: 1200
Number of reportedZipCode per decade without a matching geometry
2000    892
2010    528
2020    129
Name: yearOfLoss_1990_2021, dtype: int64


In [25]:
df_test = df_copy[df_copy['reportedZipCode'].notna()]

df_test['reportedZipCode'] = df_test['reportedZipCode'].dropna().astype(int).astype(str)
df_test['reportedZipCode'] = [zipcode.zfill(5) for zipcode in df_test['reportedZipCode']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['reportedZipCode'] = df_test['reportedZipCode'].dropna().astype(int).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['reportedZipCode'] = [zipcode.zfill(5) for zipcode in df_test['reportedZipCode']]


In [48]:
print('Value_counts of reportedZipCode per decade:\n', df_test.drop_duplicates(subset=['reportedZipCode', 'yearOfLoss_1990_2021'])['yearOfLoss_1990_2021'].value_counts())

Value_counts of reportedZipCode per decade:
 1990    20965
2010    17449
2000    16263
2020     9421
Name: yearOfLoss_1990_2021, dtype: int64


In [49]:
test.set_index(['reportedZipCode', 'yearOfLoss_1990_2021'], inplace=True)
df_test.set_index(['reportedZipCode', 'yearOfLoss_1990_2021'], inplace=True)

# Select rows of df_copy that have indices found in test
matching_rows = df_test.loc[df_test.index.isin(test.index)]

matching_rows.reset_index(inplace=True)
test.reset_index(inplace=True)
df_test.reset_index(inplace=True)

print(f"There are {matching_rows.shape[0]} rows in df_copy with the same reportedZipCode and yearOfLoss_1990_2021 as in test.")

There are 6861 rows in df_copy with the same reportedZipCode and yearOfLoss_1990_2021 as in test.


In [50]:
matching_rows['yearOfLoss_1990_2021'].value_counts()

2000    3744
2010    2681
2020     436
Name: yearOfLoss_1990_2021, dtype: int64

In [22]:
df_copy[df_copy['yearOfLoss_1990_2021']==1990].shape[0]

1040310

### 4. countyCode

In [54]:
test = df[df['countyCode'].notna()]

In [55]:
print('Total observations available:', test.shape[0])
print('Observations lost:', df.shape[0]-test.shape[0])
print('Percentage NA:', (df.shape[0]-test.shape[0])*100/df.shape[0])
print('Total unique countyCode:', test.drop_duplicates(subset=['countyCode']).shape[0])

Total observations available: 2519320
Observations lost: 64922
Percentage NA: 2.5122260221759416
Total unique countyCode: 2924


##### Now we can calculate for each decade how many of the county code have shapefiles with geometry attribute.

In [61]:
# Read shapefile of county code
df_read = pd.read_parquet("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/County_geometry.parquet.gzip")

# Convert the WKT strings back to geometries
County_df = gpd.GeoDataFrame(df_read, geometry=df_read['geometry'].apply(lambda x: shapely.wkt.loads(x)))

County_df['year'] = County_df['year'].replace({2012: 2010, 2021: 2020})
County_df = County_df.drop_duplicates(subset=['CountyID', 'year'])

In [63]:
df_geographic_unique = df[['countyCode', 'yearOfLoss']].drop_duplicates()
df_geographic_unique = df_geographic_unique.dropna(subset=['countyCode'])

In [64]:
df_geographic_unique['countyCode'] = [str(int(float(i))) for i in df_geographic_unique['countyCode']]
df_geographic_unique['countyCode'] = [censuscounty.zfill(5) for censuscounty in df_geographic_unique['countyCode']]

In [65]:
# Define bins and labels for yearOfLoss_1990_2021
bins_1990_2021 = [df_geographic_unique['yearOfLoss'].min(), 2000, 2010, 2020, df_geographic_unique['yearOfLoss'].max() + 1]
labels_1990_2021 = [1990, 2000, 2010, 2020]

df_geographic_unique['yearOfLoss_1990_2021'] = pd.cut(df_geographic_unique['yearOfLoss'], bins=bins_1990_2021, labels=labels_1990_2021, right=False).astype(int)

df_geographic_unique = df_geographic_unique.drop(columns='yearOfLoss')

In [66]:
df_geographic_unique = df_geographic_unique.drop_duplicates()

In [67]:
County_df.rename(columns={'geometry': 'geometry_county'}, inplace=True)

In [33]:
# County_df.set_index(['CountyID', 'year'], inplace=True)
# df_geographic_unique['geometry_county'] = df_geographic_unique.set_index(['countyCode', 'yearOfLoss_1990_2021']).index.map(County_df['geometry_county'])

# # Find rows with NaN values
# na_rows = df_geographic_unique['geometry_county'].isna()

# County_df.reset_index(inplace=True)

In [70]:
County_df = County_df[County_df['year'] == 2020]

County_df.set_index(['CountyID'], inplace=True)
df_geographic_unique['geometry_county'] = df_geographic_unique.set_index(['countyCode']).index.map(County_df['geometry_county'])

# Find rows with NaN values
na_rows = df_geographic_unique['geometry_county'].isna()

County_df.reset_index(inplace=True)

In [71]:
test = df_geographic_unique.loc[na_rows, ]
print('Total missing:', test[['countyCode']].drop_duplicates().shape[0])
print('Number of County per decade without a matching geometry')
print(test.drop_duplicates(subset=['countyCode', 'yearOfLoss_1990_2021'])['yearOfLoss_1990_2021'].value_counts())

Total missing: 2
Number of County per decade without a matching geometry
2000    2
2010    1
1990    1
Name: yearOfLoss_1990_2021, dtype: int64


In [72]:
df_test = df_copy[df_copy['countyCode'].notna()]

df_test['countyCode'] = [str(int(float(i))) for i in df_test['countyCode']]
df_test['countyCode'] = [censuscounty.zfill(5) for censuscounty in df_test['countyCode']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['countyCode'] = [str(int(float(i))) for i in df_test['countyCode']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['countyCode'] = [censuscounty.zfill(5) for censuscounty in df_test['countyCode']]


In [73]:
print('Value_counts of county per decade:\n', df_test.drop_duplicates(subset=['countyCode', 'yearOfLoss_1990_2021'])['yearOfLoss_1990_2021'].value_counts())

Value_counts of county per decade:
 1990    2683
2010    2667
2000    2503
2020    1854
Name: yearOfLoss_1990_2021, dtype: int64


In [74]:
test.set_index(['countyCode', 'yearOfLoss_1990_2021'], inplace=True)
df_test.set_index(['countyCode', 'yearOfLoss_1990_2021'], inplace=True)

# Select rows of df_copy that have indices found in test
matching_rows = df_test.loc[df_test.index.isin(test.index)]

matching_rows.reset_index(inplace=True)
test.reset_index(inplace=True)
df_test.reset_index(inplace=True)

print(f"There are {matching_rows.shape[0]} rows in df_copy with the same countyCode and yearOfLoss_1990_2021 as in test.")

There are 10 rows in df_copy with the same countyCode and yearOfLoss_1990_2021 as in test.


In [75]:
matching_rows['yearOfLoss_1990_2021'].value_counts()

2000    6
1990    3
2010    1
Name: yearOfLoss_1990_2021, dtype: int64

### 5. censusTract

In [13]:
test = df[df['censusTract'].notna()]

In [14]:
print('Total observations available:', test.shape[0])
print('Observations lost:', df.shape[0]-test.shape[0])
print('Percentage NA:', (df.shape[0]-test.shape[0])*100/df.shape[0])
print('Total unique censusTract:', test.drop_duplicates(subset=['censusTract']).shape[0])

Total observations available: 2459669
Observations lost: 137791
Percentage NA: 5.3048362631185855
Total unique censusTract: 53468


##### Now we can calculate for each decade how many of the census tract have shapefiles with geometry attribute.

In [17]:
# Read shapefile of census tract code
chunk_size = 30000 
chunks = [x for x in range(0, 60000, chunk_size)]

gdf_list = []

for start in chunks:
    end = start + chunk_size
    temp_df = pd.read_parquet(f"C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/Tract_geometry_{start}_{end}.parquet.gzip")
    gdf_read = gpd.GeoDataFrame(temp_df, geometry=temp_df['geometry'].apply(lambda x: shapely.wkt.loads(x)))
    gdf_list.append(gdf_read)

# Concatenate all GeoDataFrames in the list into a single GeoDataFrame
Tract_df= pd.concat(gdf_list, ignore_index=True)

In [18]:
df_geographic_unique = df[['censusTract', 'yearOfLoss']].drop_duplicates()
df_geographic_unique = df_geographic_unique.dropna(subset=['censusTract'])

In [19]:
df_geographic_unique['censusTract'] = [str(int(float(i))) for i in df_geographic_unique['censusTract']]
df_geographic_unique['censusTract'] = [censusBG.zfill(11) for censusBG in df_geographic_unique['censusTract']]

In [20]:
# Define bins and labels for yearOfLoss_1990_2021
bins_1990_2021 = [df_geographic_unique['yearOfLoss'].min(), 2000, 2010, 2020, df_geographic_unique['yearOfLoss'].max() + 1]
labels_1990_2021 = [1990, 2000, 2010, 2020]

df_geographic_unique['yearOfLoss_1990_2021'] = pd.cut(df_geographic_unique['yearOfLoss'], bins=bins_1990_2021, labels=labels_1990_2021, right=False).astype(int)

df_geographic_unique = df_geographic_unique.drop(columns='yearOfLoss')

In [21]:
df_geographic_unique = df_geographic_unique.drop_duplicates()

In [22]:
Tract_df.rename(columns={'geometry': 'geometry_tract'}, inplace=True)

In [29]:
Tract_df.set_index(['censusTractID'], inplace=True)
df_geographic_unique['geometry_tract'] = df_geographic_unique.set_index(['censusTract']).index.map(Tract_df['geometry_tract'])

# Find rows with NaN values
na_rows = df_geographic_unique['geometry_tract'].isna()

Tract_df.reset_index(inplace=True)

In [30]:
test = df_geographic_unique.loc[na_rows, ]
print('Total missing:', test[['censusTract']].drop_duplicates().shape[0])
print('Number of Tract per decade without a matching geometry')
print(test.drop_duplicates(subset=['censusTract', 'yearOfLoss_1990_2021'])['yearOfLoss_1990_2021'].value_counts())

Total missing: 676
Number of Tract per decade without a matching geometry
2020    243
2000    229
2010    199
1990    162
Name: yearOfLoss_1990_2021, dtype: int64


In [31]:
df_test = df_copy[df_copy['censusTract'].notna()]

df_test['censusTract'] = [str(int(float(i))) for i in df_test['censusTract']]
df_test['censusTract'] = [censusBG.zfill(11) for censusBG in df_test['censusTract']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['censusTract'] = [str(int(float(i))) for i in df_test['censusTract']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['censusTract'] = [censusBG.zfill(11) for censusBG in df_test['censusTract']]


In [32]:
print('Value_counts of censusTract per decade:\n', df_test.drop_duplicates(subset=['censusTract', 'yearOfLoss_1990_2021'])['yearOfLoss_1990_2021'].value_counts())

Value_counts of censusTract per decade:
 1990    39777
2010    35769
2000    30830
2020    17763
Name: yearOfLoss_1990_2021, dtype: int64


In [33]:
test.set_index(['censusTract', 'yearOfLoss_1990_2021'], inplace=True)
df_test.set_index(['censusTract', 'yearOfLoss_1990_2021'], inplace=True)

# Select rows of df_copy that have indices found in test
matching_rows = df_test.loc[df_test.index.isin(test.index)]

matching_rows.reset_index(inplace=True)
test.reset_index(inplace=True)
df_test.reset_index(inplace=True)

print(f"There are {matching_rows.shape[0]} rows in df_copy with the same censusTract and yearOfLoss_1990_2021 as in test.")

There are 1726 rows in df_copy with the same censusTract and yearOfLoss_1990_2021 as in test.


In [34]:
matching_rows['yearOfLoss_1990_2021'].value_counts()

2020    505
2000    481
1990    391
2010    349
Name: yearOfLoss_1990_2021, dtype: int64

### 6. censusBlockGroupFips

In [7]:
test = df[df['censusBlockGroupFips'].notna()]

In [8]:
print('Total observations available:', test.shape[0])
print('Observations lost:', df.shape[0]-test.shape[0])
print('Percentage NA:', (df.shape[0]-test.shape[0])*100/df.shape[0])
print('Total unique censusBlockGroup:', test.drop_duplicates(subset=['censusBlockGroupFips']).shape[0])

Total observations available: 2447264
Observations lost: 136978
Percentage NA: 5.300509781978623
Total unique censusBlockGroup: 106786


##### Now we can calculate for each decade how many of the census tract have shapefiles with geometry attribute.

In [9]:
chunk_size = 40000 
chunks = [x for x in range(0, 320000, chunk_size)]

gdf_list = []

for start in chunks:
    end = start + chunk_size
    temp_df = pd.read_parquet(f"C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/BG_geometry_{start}_{end}.parquet.gzip")
    gdf_read = gpd.GeoDataFrame(temp_df, geometry=temp_df['geometry'].apply(lambda x: shapely.wkt.loads(x)))
    gdf_list.append(gdf_read)

# Concatenate all GeoDataFrames in the list into a single GeoDataFrame
BG_df= pd.concat(gdf_list, ignore_index=True)

In [10]:
df_geographic_unique = df[['censusBlockGroupFips', 'yearOfLoss']].drop_duplicates()
df_geographic_unique = df_geographic_unique.dropna(subset=['censusBlockGroupFips'])

In [11]:
df_geographic_unique['censusBlockGroupFips'] = [str(int(float(i))) for i in df_geographic_unique['censusBlockGroupFips']]
df_geographic_unique['censusBlockGroupFips'] = [censusBG.zfill(12) for censusBG in df_geographic_unique['censusBlockGroupFips']]

In [12]:
# Define bins and labels for yearOfLoss_1990_2021
bins_1990_2021 = [df_geographic_unique['yearOfLoss'].min(), 2000, 2010, 2020, df_geographic_unique['yearOfLoss'].max() + 1]
labels_1990_2021 = [1990, 2000, 2010, 2020]

df_geographic_unique['yearOfLoss_1990_2021'] = pd.cut(df_geographic_unique['yearOfLoss'], bins=bins_1990_2021, labels=labels_1990_2021, right=False).astype(int)

df_geographic_unique = df_geographic_unique.drop(columns='yearOfLoss')

In [13]:
df_geographic_unique = df_geographic_unique.drop_duplicates()

In [14]:
BG_df.rename(columns={'geometry': 'geometry_BG'}, inplace=True)

In [15]:
BG_df.set_index(['GEOID', 'year'], inplace=True)
df_geographic_unique['geometry_BG'] = df_geographic_unique.set_index(['censusBlockGroupFips', 'yearOfLoss_1990_2021']).index.map(BG_df['geometry_BG'])

# Find rows with NaN values 
na_rows = df_geographic_unique['geometry_BG'].isna()

BG_df.reset_index(inplace=True)

In [16]:
test = df_geographic_unique.loc[na_rows, ]
print('Total missing:', test[['censusBlockGroupFips']].drop_duplicates().shape[0])
print('Number of censusBlockGroupFips per decade without a matching geometry')
print(test.drop_duplicates(subset=['censusBlockGroupFips', 'yearOfLoss_1990_2021'])['yearOfLoss_1990_2021'].value_counts())

Total missing: 41487
Number of censusBlockGroupFips per decade without a matching geometry
1990    31615
2000    17045
2020     6373
2010      266
Name: yearOfLoss_1990_2021, dtype: int64


In [17]:
df_test = df_copy[df_copy['censusBlockGroupFips'].notna()]

df_test['censusBlockGroupFips'] = [str(int(float(i))) for i in df_test['censusBlockGroupFips']]
df_test['censusBlockGroupFips'] = [censusBG.zfill(12) for censusBG in df_test['censusBlockGroupFips']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['censusBlockGroupFips'] = [str(int(float(i))) for i in df_test['censusBlockGroupFips']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['censusBlockGroupFips'] = [censusBG.zfill(12) for censusBG in df_test['censusBlockGroupFips']]


In [18]:
print('Value_counts of censusBlockGroupFips per decade:\n', df_test.drop_duplicates(subset=['censusBlockGroupFips', 'yearOfLoss_1990_2021'])['yearOfLoss_1990_2021'].value_counts())

Value_counts of censusBlockGroupFips per decade:
 1990    71925
2010    61534
2000    52227
2020    24273
Name: yearOfLoss_1990_2021, dtype: int64


In [19]:
test.set_index(['censusBlockGroupFips', 'yearOfLoss_1990_2021'], inplace=True)
df_test.set_index(['censusBlockGroupFips', 'yearOfLoss_1990_2021'], inplace=True)

# Select rows of df_copy that have indices found in test
matching_rows = df_test.loc[df_test.index.isin(test.index)]

matching_rows.reset_index(inplace=True)
test.reset_index(inplace=True)
df_test.reset_index(inplace=True)

print(f"There are {matching_rows.shape[0]} rows in df_copy with the same censusBlockGroupFips and yearOfLoss_1990_2021 as in test.")

There are 682098 rows in df_copy with the same censusBlockGroupFips and yearOfLoss_1990_2021 as in test.


In [20]:
matching_rows['yearOfLoss_1990_2021'].value_counts()

1990    438104
2000    198715
2020     44855
2010       424
Name: yearOfLoss_1990_2021, dtype: int64