In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import geopandas as gpd
from scipy import stats
pd.set_option('display.max_columns', None)
from scipy.stats import skew, kurtosis
import pygris
from shapely.geometry import Polygon
import shapely
from inconsistency_utils import *

In [2]:
# Load the original claims dataset from its parquet file.
claims_path = "C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/FimaNfipClaims.parquet.gzip"

df = pd.read_parquet(claims_path)

## First Dataset
## No geographic values missing, no shapefiles missing

In [3]:
# Reduce the claims to unique geographic combinations with no missing keys.
# Both column lists are handed to process_geographic_data (inconsistency_utils).

drop_duplicate_cols = [
    'state',
    'reportedZipCode',
    'countyCode',
    'censusTract',
    'censusBlockGroupFips',
    'latitude',
    'longitude',
    'yearOfLoss',
]
drop_na_cols = [
    'latitude',
    'censusBlockGroupFips',
    'reportedZipCode',
    'countyCode',
]

df_geographic_unique = process_geographic_data(df, drop_duplicate_cols, drop_na_cols)

In [4]:
# Preview the first rows of the de-duplicated geographic frame.
df_geographic_unique.head()

Unnamed: 0,state,reportedZipCode,countyCode,censusTract,censusBlockGroupFips,latitude,longitude,yearOfLoss
0,CA,92056.0,6073.0,6073019000.0,60730190000.0,33.2,-117.3,1998
1,LA,70131.0,22071.0,22071000000.0,220710000000.0,29.9,-90.0,2005
2,FL,32566.0,12113.0,12113010000.0,121130100000.0,30.4,-86.9,1998
3,SC,29902.0,45013.0,45013000000.0,450130000000.0,32.4,-80.7,1994
4,FL,32940.0,12009.0,12009060000.0,120090600000.0,28.3,-80.7,1996


In [5]:
# Sanity check: print how many missing values remain in each key column
# (one count per line, in the order listed below).

for column in ['state', 'reportedZipCode', 'countyCode', 'censusTract',
               'censusBlockGroupFips', 'latitude', 'yearOfLoss']:
    print(sum(df_geographic_unique[column].isna()))

0
0
0
0
0
0
0


In [6]:
# Convert the geographic identifiers to strings so they can be matched against
# the shapefile keys.
# NOTE(review): the integers look like target string widths -- confirm against
# format_geographic_units in inconsistency_utils.

columns_to_format = dict(
    reportedZipCode=5,
    censusBlockGroupFips=12,
    countyCode=5,
    censusTract=11,
)

df_geographic_unique = format_geographic_units(df_geographic_unique, columns_to_format)

In [7]:
# Year bins used for mapping.
# TODO: update once the 2023 ZIP code shapefile data and the latest original
# FEMA dataset are available.

decade_starts = list(range(1980, 2021, 10))  # 1980, 1990, ..., 2020
bins_1980_2021 = [df['yearOfLoss'].min()] + decade_starts + [df['yearOfLoss'].max() + 1]
labels_1980_2021 = [0] + decade_starts

single_years = list(range(2010, 2023))  # 2010, 2011, ..., 2022
custom_bins = [0, 2000] + single_years + [2024]
custom_labels = [0, 2000] + single_years

df_geographic_unique = process_year_of_loss(
    df_geographic_unique,
    bins_1980_2021,
    labels_1980_2021,
    custom_bins,
    custom_labels,
)

In [8]:
# Preview the frame after string formatting and year binning.
df_geographic_unique.head()

Unnamed: 0,state,reportedZipCode,countyCode,censusTract,censusBlockGroupFips,latitude,longitude,yearOfLoss,yearOfLoss_1980_2021,zip_year_bin
0,CA,92056,6073,6073018512,60730185123,33.2,-117.3,1998,1990,0
1,LA,70131,22071,22071000616,220710006163,29.9,-90.0,2005,2000,2000
2,FL,32566,12113,12113010815,121130108152,30.4,-86.9,1998,1990,0
3,SC,29902,45013,45013000700,450130007002,32.4,-80.7,1994,1990,0
4,FL,32940,12009,12009063107,120090631073,28.3,-80.7,1996,1990,0


## Read shapefiles

In [9]:
# Load one shapefile frame per geographic unit; the helper returns a mapping
# keyed by the unit names below.
units = ['state', 'lat_long', 'BG', 'zipcode', 'County', 'Tract']
base_path = "C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/"
df_shapefiles = load_and_rename_geographic_data_whole(units, base_path)

# Unpack in the same order as `units`.
state_df, lat_long_df, BG_df, zipcode_df, County_df, Tract_df = (
    df_shapefiles[unit] for unit in units
)

Using the default year of 2021


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df.rename(columns={'geometry': 'geometry_state'}, inplace=True)


## Geometry Intersection creation

In [10]:
# Keep only post-1980 losses (rows whose 1980-2021 bin label is not 0).
# The explicit .copy() makes the result an independent frame, so the geometry
# columns assigned in the following cells do not raise SettingWithCopyWarning.

df_geographic_unique = df_geographic_unique[df_geographic_unique['yearOfLoss_1980_2021'] != 0].copy()

In [11]:
# Build a (latitude, longitude) -> geometry lookup WITHOUT mutating lat_long_df.
# The previous in-place set_index/reset_index round trip left lat_long_df indexed
# whenever the mapping raised, which made the cell fail on re-run.
lat_long_lookup = lat_long_df.set_index(['latitude', 'longitude'])['geometry_lat_long']

# Attach the matching geometry to each row (NaN where the key has no match).
df_geographic_unique['geometry_lat_long'] = (
    df_geographic_unique.set_index(['latitude', 'longitude']).index.map(lat_long_lookup)
)

In [12]:
# Build a GEOID -> block-group geometry lookup WITHOUT mutating BG_df
# (single key, not a multi-index), so the cell is safe to re-run.
BG_lookup = BG_df.set_index(['GEOID'])['geometry_BG']

# Attach the matching geometry to each row (NaN where the key has no match).
df_geographic_unique['geometry_BG'] = (
    df_geographic_unique.set_index(['censusBlockGroupFips']).index.map(BG_lookup)
)

In [13]:
# Build a (ZIPcode, year) -> ZIP geometry lookup WITHOUT mutating zipcode_df,
# so the cell is safe to re-run.
zipcode_lookup = zipcode_df.set_index(['ZIPcode', 'year'])['geometry_zipcode']

# Rows are matched on their reported ZIP code and ZIP year bin
# (NaN where the pair has no match).
df_geographic_unique['geometry_zipcode'] = (
    df_geographic_unique.set_index(['reportedZipCode', 'zip_year_bin']).index.map(zipcode_lookup)
)

In [14]:
# Build a STUSPS -> state geometry lookup WITHOUT mutating state_df
# (single key, not a multi-index), so the cell is safe to re-run.
state_lookup = state_df.set_index(['STUSPS'])['geometry_state']

# Attach the matching geometry to each row (NaN where the key has no match).
df_geographic_unique['geometry_state'] = (
    df_geographic_unique.set_index(['state']).index.map(state_lookup)
)

In [15]:
# Build a CountyID -> county geometry lookup WITHOUT mutating County_df
# (single key, not a multi-index), so the cell is safe to re-run.
county_lookup = County_df.set_index(['CountyID'])['geometry_county']

# Attach the matching geometry to each row (NaN where the key has no match).
df_geographic_unique['geometry_county'] = (
    df_geographic_unique.set_index(['countyCode']).index.map(county_lookup)
)

In [16]:
# Build a censusTractID -> tract geometry lookup WITHOUT mutating Tract_df
# (single key, not a multi-index), so the cell is safe to re-run.
tract_lookup = Tract_df.set_index(['censusTractID'])['geometry_tract']

# Attach the matching geometry to each row (NaN where the key has no match).
df_geographic_unique['geometry_tract'] = (
    df_geographic_unique.set_index(['censusTract']).index.map(tract_lookup)
)

#### Drop rows with missing shapefiles

In [17]:
# Keep only rows that have a block-group, county, ZIP code and tract geometry.
required_geometry_cols = ['geometry_BG', 'geometry_county', 'geometry_zipcode', 'geometry_tract']
has_all_geometries = df_geographic_unique[required_geometry_cols].notna().all(axis=1)
df_geographic_unique = df_geographic_unique[has_all_geometries]

#### Creating the intersections

In [18]:
error_count = 0  # number of intersections that needed the buffered retry


def _intersect_with_buffer_retry(geometry, other, buffer_distance=0.0001):
    """Intersect ``geometry`` with ``other``, retrying once on a TopologyException.

    Returns a ``(result, retried)`` tuple. If the first attempt raises an
    exception whose message contains "TopologyException", ``geometry`` is
    buffered by ``buffer_distance`` and the intersection is attempted again.
    Any other exception is re-raised: swallowing it would leave the result
    computed for the previous row in use for the current one.
    """
    try:
        return geometry.intersection(other), False
    except Exception as e:
        if "TopologyException" not in str(e):
            raise
        return geometry.buffer(buffer_distance).intersection(other), True


# Empty template that fixes the column layout of the result.
new_unit_df = gpd.GeoDataFrame(columns=['reportedZipCode', 'countyCode', 'censusTract',
                                       'censusBlockGroupFips', 'latitude', 'longitude', 'year', 'year_zipcode', 'state', 'geometry_zipcode',
                                       'geometry_county', 'geometry_tract','geometry_BG','geometry_lat_long','geometry_state',
                                         'cbgInconsistent', 'tractInconsistent', 'countyInconsistent', 'stateInconsistent', 'latlongInconsistent', 'multiple', 'noOverlap', 'oneWrong'])

# Rows whose six geometries share no common area are collected here and
# concatenated once after the loop (concatenating inside the loop is quadratic).
inconsistent_records = []

for idx_unit, row_unit in df_geographic_unique.iterrows():
    bg_geometry = row_unit['geometry_BG']
    tract_geometry = row_unit['geometry_tract']
    county_geometry = row_unit['geometry_county']
    state_geometry = row_unit['geometry_state']
    lat_long_geometry = row_unit['geometry_lat_long']
    zipcode_geometry = row_unit['geometry_zipcode']

    # Chain the intersections in a fixed order:
    # block group -> lat/long -> ZIP -> county -> state -> tract.
    intersection_1 = bg_geometry.intersection(lat_long_geometry)
    intersection_2 = intersection_1.intersection(zipcode_geometry)

    intersection_3, retried = _intersect_with_buffer_retry(intersection_2, county_geometry)
    error_count += retried

    intersection_4 = intersection_3.intersection(state_geometry)

    intersection_geometry, retried = _intersect_with_buffer_retry(intersection_4, tract_geometry)
    error_count += retried

    # An empty (or missing) overall intersection marks the row as inconsistent.
    if intersection_geometry is None or intersection_geometry.is_empty:
        inconsistent_records.append({
            'reportedZipCode': row_unit['reportedZipCode'],
            'countyCode': row_unit['countyCode'],
            'censusTract': row_unit['censusTract'],
            'censusBlockGroupFips': row_unit['censusBlockGroupFips'],
            'latitude': row_unit['latitude'],
            'longitude': row_unit['longitude'],
            'year': row_unit['yearOfLoss_1980_2021'],
            'year_zipcode': row_unit['zip_year_bin'],
            'state': row_unit['state'],
            'geometry_zipcode': zipcode_geometry,
            'geometry_county': county_geometry,
            'geometry_tract': tract_geometry,
            'geometry_BG': bg_geometry,
            'geometry_lat_long': lat_long_geometry,
            'geometry_state': state_geometry,
        })

if inconsistent_records:
    new_unit_df = pd.concat([new_unit_df, pd.DataFrame(inconsistent_records)], ignore_index=True)

  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **

In [19]:
# Number of inconsistent rows, then number of buffered retries, one per line.
print(len(new_unit_df), error_count, sep="\n")

6761
7


In [20]:
# Leave-one-out check for the census block group: intersect every geometry
# EXCEPT the block group. A non-empty result means dropping the block group
# restores an overlap, so the row is flagged 1; otherwise 0.
for idx_unit, row_unit in new_unit_df.iterrows():
    intersection_geometry = (
        row_unit['geometry_lat_long']
        .intersection(row_unit['geometry_zipcode'])
        .intersection(row_unit['geometry_county'])
        .intersection(row_unit['geometry_state'])
        .intersection(row_unit['geometry_tract'])
    )

    if intersection_geometry is None or intersection_geometry.is_empty:
        new_unit_df.at[idx_unit, 'cbgInconsistent'] = 0
    else:
        new_unit_df.at[idx_unit, 'cbgInconsistent'] = 1

  return lib.intersection(a, b, **kwargs)


In [21]:
# Number of rows flagged 1 in 'cbgInconsistent' (the column holds 0/1 flags).
new_unit_df['cbgInconsistent'].sum()

705

In [22]:
# Leave-one-out check for the census tract: intersect every geometry EXCEPT
# the tract. A non-empty result means dropping the tract restores an overlap,
# so the row is flagged 1; otherwise 0.
for idx_unit, row_unit in new_unit_df.iterrows():
    intersection_geometry = (
        row_unit['geometry_BG']
        .intersection(row_unit['geometry_zipcode'])
        .intersection(row_unit['geometry_county'])
        .intersection(row_unit['geometry_state'])
        .intersection(row_unit['geometry_lat_long'])
    )

    if intersection_geometry is None or intersection_geometry.is_empty:
        new_unit_df.at[idx_unit, 'tractInconsistent'] = 0
    else:
        new_unit_df.at[idx_unit, 'tractInconsistent'] = 1

In [23]:
# Number of rows flagged 1 in 'tractInconsistent' (the column holds 0/1 flags).
new_unit_df['tractInconsistent'].sum()

5

In [24]:
# Leave-one-out check for the county: intersect every geometry EXCEPT the
# county. A non-empty result means dropping the county restores an overlap,
# so the row is flagged 1; otherwise 0.
for idx_unit, row_unit in new_unit_df.iterrows():
    intersection_geometry = (
        row_unit['geometry_BG']
        .intersection(row_unit['geometry_zipcode'])
        .intersection(row_unit['geometry_tract'])
        .intersection(row_unit['geometry_state'])
        .intersection(row_unit['geometry_lat_long'])
    )

    if intersection_geometry is None or intersection_geometry.is_empty:
        new_unit_df.at[idx_unit, 'countyInconsistent'] = 0
    else:
        new_unit_df.at[idx_unit, 'countyInconsistent'] = 1

In [25]:
# Number of rows flagged 1 in 'countyInconsistent' (the column holds 0/1 flags).
new_unit_df['countyInconsistent'].sum()

3

In [26]:
# Leave-one-out check for the state: intersect every geometry EXCEPT the
# state. A non-empty result means dropping the state restores an overlap,
# so the row is flagged 1; otherwise 0.
for idx_unit, row_unit in new_unit_df.iterrows():
    intersection_geometry = (
        row_unit['geometry_BG']
        .intersection(row_unit['geometry_zipcode'])
        .intersection(row_unit['geometry_tract'])
        .intersection(row_unit['geometry_county'])
        .intersection(row_unit['geometry_lat_long'])
    )

    if intersection_geometry is None or intersection_geometry.is_empty:
        new_unit_df.at[idx_unit, 'stateInconsistent'] = 0
    else:
        new_unit_df.at[idx_unit, 'stateInconsistent'] = 1

In [27]:
# Number of rows flagged 1 in 'stateInconsistent' (the column holds 0/1 flags).
new_unit_df['stateInconsistent'].sum()

12

In [28]:
# Leave-one-out check for the lat/long geometry: intersect every geometry
# EXCEPT the lat/long one. A non-empty result means dropping the lat/long
# geometry restores an overlap, so the row is flagged 1; otherwise 0.
for idx_unit, row_unit in new_unit_df.iterrows():
    intersection_geometry = (
        row_unit['geometry_BG']
        .intersection(row_unit['geometry_zipcode'])
        .intersection(row_unit['geometry_tract'])
        .intersection(row_unit['geometry_county'])
        .intersection(row_unit['geometry_state'])
    )

    if intersection_geometry is None or intersection_geometry.is_empty:
        new_unit_df.at[idx_unit, 'latlongInconsistent'] = 0
    else:
        new_unit_df.at[idx_unit, 'latlongInconsistent'] = 1

In [29]:
# Number of rows flagged 1 in 'latlongInconsistent' (the column holds 0/1 flags).
new_unit_df['latlongInconsistent'].sum()

620

In [30]:
# Leave-one-out check for the ZIP code: intersect every geometry EXCEPT the
# ZIP code. A non-empty result means dropping the ZIP code restores an
# overlap, so the row is flagged 1; otherwise 0.
# 'zipInconsistent' is not in the initial column list, so the first .at
# assignment creates the column.
for idx_unit, row_unit in new_unit_df.iterrows():
    intersection_geometry = (
        row_unit['geometry_BG']
        .intersection(row_unit['geometry_lat_long'])
        .intersection(row_unit['geometry_tract'])
        .intersection(row_unit['geometry_county'])
        .intersection(row_unit['geometry_state'])
    )

    if intersection_geometry is None or intersection_geometry.is_empty:
        new_unit_df.at[idx_unit, 'zipInconsistent'] = 0
    else:
        new_unit_df.at[idx_unit, 'zipInconsistent'] = 1

  return lib.intersection(a, b, **kwargs)


In [31]:
# Number of rows flagged 1 in 'zipInconsistent' (the column holds 0/1 flags).
print(new_unit_df['zipInconsistent'].sum())

6656.0


In [32]:
# 'multiple' = 1 when more than one of the six leave-one-out flags is set, else 0.
flag_columns = ['latlongInconsistent', 'zipInconsistent', 'stateInconsistent',
                'countyInconsistent', 'tractInconsistent', 'cbgInconsistent']

flags_set = new_unit_df[flag_columns[0]]
for flag_column in flag_columns[1:]:
    flags_set = flags_set + new_unit_df[flag_column]

is_multiple = flags_set > 1
new_unit_df.loc[is_multiple, 'multiple'] = 1
new_unit_df.loc[~is_multiple, 'multiple'] = 0

print(sum(new_unit_df['multiple']))

1189


In [33]:
# Preview the inconsistent rows together with the flags computed so far.
new_unit_df.head()

Unnamed: 0,reportedZipCode,countyCode,censusTract,censusBlockGroupFips,latitude,longitude,year,year_zipcode,state,geometry_zipcode,geometry_county,geometry_tract,geometry_BG,geometry_lat_long,geometry_state,cbgInconsistent,tractInconsistent,countyInconsistent,stateInconsistent,latlongInconsistent,multiple,noOverlap,oneWrong,zipInconsistent
0,28646,37011,37011930200,370119302003,36.0,-81.9,2000,2000,NC,"MULTIPOLYGON (((-81.856157 36.087607, -81.8561...","POLYGON ((-82.016572 36.145136, -82.016446 36....","POLYGON ((-81.920901 36.06335, -81.92059499999...","POLYGON ((-81.884216 36.050656, -81.884332 36....","POLYGON ((-81.95 35.95, -81.85000000000001 35....","MULTIPOLYGON (((-75.726807 35.935844, -75.7182...",1,0,0,0,1,1,,,1.0
1,77550,48167,48167726100,481677261001,29.2,-95.0,2000,2000,TX,"POLYGON ((-94.81155299999999 29.276833, -94.81...","POLYGON ((-95.11995499999999 29.29611, -95.125...","POLYGON ((-95.12993499999999 29.085618, -95.12...","POLYGON ((-95.021345 29.234171, -95.0212009999...","POLYGON ((-95.05 29.15, -94.95 29.15, -94.95 2...","MULTIPOLYGON (((-94.718296 29.728855, -94.7172...",0,0,0,0,0,0,,,1.0
2,77553,48167,48167723900,481677239002,29.4,-94.7,2000,2000,TX,"POLYGON ((-94.802748 29.348501, -94.801019 29....","POLYGON ((-95.11995499999999 29.29611, -95.125...","POLYGON ((-94.569902 29.573206, -94.569824 29....","POLYGON ((-94.78513799999999 29.357124, -94.78...","POLYGON ((-94.75 29.349999999999998, -94.65 29...","MULTIPOLYGON (((-94.718296 29.728855, -94.7172...",0,0,0,0,0,0,,,1.0
3,32503,12033,12033002500,120330025001,30.3,-87.1,2000,2000,FL,"POLYGON ((-87.199598 30.451037, -87.199513 30....","POLYGON ((-87.440343 30.690352999999998, -87.4...","POLYGON ((-86.919135 30.367646999999998, -86.9...","POLYGON ((-87.19211299999999 30.324032, -87.19...","POLYGON ((-87.14999999999999 30.25, -87.05 30....","MULTIPOLYGON (((-80.627171 25.000401, -80.6260...",0,0,0,0,0,0,,,1.0
4,32503,12033,12033002500,120330025001,30.3,-87.1,2000,2000,FL,"POLYGON ((-87.199598 30.451037, -87.199513 30....","POLYGON ((-87.440343 30.690352999999998, -87.4...","POLYGON ((-86.919135 30.367646999999998, -86.9...","POLYGON ((-87.19211299999999 30.324032, -87.19...","POLYGON ((-87.14999999999999 30.25, -87.05 30....","MULTIPOLYGON (((-80.627171 25.000401, -80.6260...",0,0,0,0,0,0,,,1.0


In [34]:
# 'noOverlap' = 1 when none of the six leave-one-out flags is set, i.e. no
# single unit's removal produced a non-empty intersection.
flag_columns = ['latlongInconsistent', 'stateInconsistent', 'countyInconsistent',
                'tractInconsistent', 'cbgInconsistent', 'zipInconsistent']

flags_set = new_unit_df[flag_columns[0]]
for flag_column in flag_columns[1:]:
    flags_set = flags_set + new_unit_df[flag_column]

new_unit_df['noOverlap'] = 0  # default for every row
new_unit_df.loc[flags_set == 0, 'noOverlap'] = 1

print(sum(new_unit_df['noOverlap']))

13


In [35]:
# 'oneWrong' = 1 when exactly one of the six leave-one-out flags is set.
flag_columns = ['latlongInconsistent', 'stateInconsistent', 'countyInconsistent',
                'tractInconsistent', 'cbgInconsistent', 'zipInconsistent']

flags_set = new_unit_df[flag_columns[0]]
for flag_column in flag_columns[1:]:
    flags_set = flags_set + new_unit_df[flag_column]

new_unit_df['oneWrong'] = 0  # default for every row
new_unit_df.loc[flags_set == 1, 'oneWrong'] = 1

print(sum(new_unit_df['oneWrong']))


5559


In [36]:
# Initialize the count dictionary for 'Wrong' values
wrong_counts = {
    'lat_long_geometry': 0,
    'state_geometry': 0,
    'county_geometry': 0,
    'tract_geometry': 0,
    'bg_geometry': 0,
    'zip_geometry': 0  # added for zip codes
}

# Iterate through the DataFrame
for idx_unit, row_unit in new_unit_df.iterrows():
    if row_unit['oneWrong'] == 1:
        if row_unit['latlongInconsistent'] == 1:
            new_unit_df.at[idx_unit, 'lat_long_geometry'] = 'Wrong'
            wrong_counts['lat_long_geometry'] += 1
        if row_unit['stateInconsistent'] == 1:
            new_unit_df.at[idx_unit, 'state_geometry'] = 'Wrong'
            wrong_counts['state_geometry'] += 1
        if row_unit['countyInconsistent'] == 1:
            new_unit_df.at[idx_unit, 'county_geometry'] = 'Wrong'
            wrong_counts['county_geometry'] += 1
        if row_unit['tractInconsistent'] == 1:
            new_unit_df.at[idx_unit, 'tract_geometry'] = 'Wrong'
            wrong_counts['tract_geometry'] += 1
        if row_unit['cbgInconsistent'] == 1:
            new_unit_df.at[idx_unit, 'bg_geometry'] = 'Wrong'
            wrong_counts['bg_geometry'] += 1
        if row_unit['zipInconsistent'] == 1:  # added for zip codes
            new_unit_df.at[idx_unit, 'zip_geometry'] = 'Wrong'
            wrong_counts['zip_geometry'] += 1

# Print the 'Wrong' counts
for category, count in wrong_counts.items():
    print(f"{category}: {count} 'Wrong' values")


lat_long_geometry: 23 'Wrong' values
state_geometry: 0 'Wrong' values
county_geometry: 0 'Wrong' values
tract_geometry: 0 'Wrong' values
bg_geometry: 24 'Wrong' values
zip_geometry: 5512 'Wrong' values


In [37]:
# 1 where both the block-group and ZIP flags are set while the lat/long flag
# is not; 0 everywhere else.
latlong_not_flagged = new_unit_df['latlongInconsistent'] == 0
cbg_flagged = new_unit_df['cbgInconsistent'] == 1
zip_flagged = new_unit_df['zipInconsistent'] == 1

new_unit_df['CBG_Zip_Mutually_Inconsistent'] = np.where(
    latlong_not_flagged & cbg_flagged & zip_flagged, 1, 0)

In [38]:
# Number of rows where 'CBG_Zip_Mutually_Inconsistent' equals 1.
sum(new_unit_df['CBG_Zip_Mutually_Inconsistent'] == 1)

586

In [39]:
# Count rows where exactly two flags are set and those two are the lat/long
# and block-group flags.
flag_columns = ['latlongInconsistent', 'stateInconsistent', 'countyInconsistent',
                'tractInconsistent', 'cbgInconsistent', 'zipInconsistent']

flags_set = new_unit_df[flag_columns[0]]
for flag_column in flag_columns[1:]:
    flags_set = flags_set + new_unit_df[flag_column]

latlong_and_cbg_only = (
    (new_unit_df['latlongInconsistent'] == 1)
    & (new_unit_df['cbgInconsistent'] == 1)
    & (flags_set == 2)
)
len(new_unit_df[latlong_and_cbg_only])

44

In [40]:
# Number of rows where 'latlongInconsistent' equals 1.
sum(new_unit_df['latlongInconsistent'] == 1)

620

In [41]:
# The geometry columns are no longer needed once the flags are computed;
# remove them from the frame.
columns_to_drop = [
    'geometry_tract',
    'geometry_BG',
    'geometry_zipcode',
    'geometry_county',
    'geometry_state',
    'geometry_lat_long',
]

new_unit_df = new_unit_df.drop(columns_to_drop, axis='columns')

In [42]:
# Cast 'zipInconsistent' to integer, remove duplicate rows, and write the
# result to a gzip-compressed parquet file (index not stored).
output_path = 'C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/inconsistency_dataframe_1.parquet.gzip'

new_unit_df = (
    new_unit_df
    .assign(zipInconsistent=new_unit_df['zipInconsistent'].astype(int))
    .drop_duplicates()
)

new_unit_df.to_parquet(output_path, compression='gzip', index=False)