In [None]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import geopandas as gpd
import pygris
import shapely
from shapely.geometry import Polygon
from itertools import product
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the full NFIP claims table from the local Box folder.
df = pd.read_parquet("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/FimaNfipClaims.parquet.gzip")

# One row per distinct (block-group FIPS, state) pair where both values are present.
df_BGstate = df[['censusBlockGroupFips', 'state']].dropna().drop_duplicates()

# Normalise every FIPS code in a single pass: round-trip through float -> int
# to strip any decimal representation, then left-pad with zeros to 12 characters.
df_BGstate['censusBlockGroupFips'] = [
    str(int(float(raw_fips))).zfill(12)
    for raw_fips in df_BGstate['censusBlockGroupFips']
]

In [None]:
# Load the latitude/longitude grid table; its 'geometry' column stores WKT text.
df_read = pd.read_parquet("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/lat_long_geometry.parquet.gzip")

# Parse the WKT column with GeoPandas' vectorised reader instead of a per-row
# lambda around `shapely.wkt.loads`. This also removes the reliance on the
# `shapely.wkt` submodule having been loaded as a side effect of other imports
# (only `import shapely` appears in the import cell).
# No CRS is attached to the resulting geometries.
lat_long_df = gpd.GeoDataFrame(df_read, geometry=gpd.GeoSeries.from_wkt(df_read['geometry']))

In [None]:
# Step between the start offsets embedded in the BG_geometry_{start}_{end} file names.
chunk_size = 25000
# Exclusive upper bound for the start offsets.
# NOTE(review): presumably the total row count used when the chunk files were
# exported - confirm against the exporting notebook before changing it.
max_offset = 400000
chunks = list(range(0, max_offset, chunk_size))

gdf_list = []

for start in chunks:
    end = start + chunk_size
    temp_df = pd.read_parquet(f"C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/BG_geometry_{start}_{end}.parquet.gzip")
    # The 'geometry' column is parsed as WKT text in one vectorised call; this
    # avoids a per-row lambda and the implicit dependency on `shapely.wkt`
    # being loaded by another package.
    gdf_read = gpd.GeoDataFrame(temp_df, geometry=gpd.GeoSeries.from_wkt(temp_df['geometry']))
    gdf_list.append(gdf_read)

# Stack all chunks under a fresh 0..n-1 index.
BG_df = pd.concat(gdf_list, ignore_index=True)

# Keep the first row for each (GEOID, year) pair; the index is not renumbered afterwards.
BG_df = BG_df.drop_duplicates(subset=['GEOID', 'year'])

In [None]:
# Keep a single row per block-group FIPS code (first occurrence wins) and
# renumber the rows 0..n-1. Rebinding instead of mutating in place means a
# failure further down cannot leave df_BGstate stuck with the FIPS code as its
# index, which used to make a re-run of this cell fail with a KeyError.
df_BGstate = df_BGstate.drop_duplicates(subset='censusBlockGroupFips').reset_index(drop=True)

# FIPS -> state lookup Series; df_BGstate itself keeps its plain column layout.
state_by_fips = df_BGstate.set_index('censusBlockGroupFips')['state']

# Attach the state to every block group; GEOIDs absent from the lookup get NaN.
# NOTE(review): assumes BG_df['GEOID'] holds 12-character strings in the same
# format as the zero-padded FIPS codes - confirm, otherwise nothing will match.
BG_df['state'] = BG_df['GEOID'].map(state_by_fips)

In [None]:
# Partition BG_df by the value of its 'year' column; every entry is an
# independent copy, so later edits to one year do not touch BG_df.
BG_df_yearwise = {}
year_column = BG_df['year']

for year in year_column.unique():
    BG_df_yearwise[year] = BG_df.loc[year_column == year].copy()

# Bind the five vintages used downstream; a missing vintage raises KeyError.
BG_df_2021, BG_df_2012, BG_df_2010, BG_df_2000, BG_df_1990 = (
    BG_df_yearwise[vintage] for vintage in (2021, 2012, 2010, 2000, 1990)
)

In [None]:
# Create an empty GeoDataFrame to store the intersection results
new_unit_df_2021 = gpd.GeoDataFrame(columns=['geographic_unit_id', 'BG_id', 'latitude', 'longitude', 'geometry', 'year', 'state'])

# Iterate through each row in BG_df_2021 and each row in lat_long_df to find intersections
for idx_bg, row_bg in BG_df_2021.iterrows():
    year = row_bg['year']
    bg_id = row_bg['GEOID']
    bg_geometry = row_bg['geometry']
    state = row_bg['state']
    
    for idx_lat_long, row_lat_long in lat_long_df.iterrows():
        
        lat_long_geometry = row_lat_long['geometry']
        lat = row_lat_long['latitude']
        long = row_lat_long['longitude']
        # Compute intersection geometry

        intersection_geometry = bg_geometry.intersection(lat_long_geometry)
            
        # Check if the intersection result is valid
        if not intersection_geometry.is_empty:
            # Create a unique ID for the intersection using the year and indices
            geographic_unit_id = f"{year}_{bg_id}_{lat}_{long}"

            # Append intersection information to a list
            new_unit_df_2021 = pd.concat([new_unit_df_2021, pd.DataFrame({
                'geographic_unit_id': [geographic_unit_id],
                'BG_id': [bg_id],
                'latitude': [lat],
                'longitude': [long],
                'geometry': [intersection_geometry],
                'year': [year],
                'state': [state]
            })], ignore_index=True)

In [None]:
# Create an empty GeoDataFrame to store the intersection results
new_unit_df_2012 = gpd.GeoDataFrame(columns=['geographic_unit_id', 'BG_id', 'latitude', 'longitude', 'geometry', 'year', 'state'])

# Iterate through each row in BG_df_2012 and each row in lat_long_df to find intersections
for idx_bg, row_bg in BG_df_2012.iterrows():
    year = row_bg['year']
    bg_id = row_bg['GEOID']
    bg_geometry = row_bg['geometry']
    state = row_bg['state']
    
    for idx_lat_long, row_lat_long in lat_long_df.iterrows():
        
        lat_long_geometry = row_lat_long['geometry']
        lat = row_lat_long['latitude']
        long = row_lat_long['longitude']
        # Compute intersection geometry

        intersection_geometry = bg_geometry.intersection(lat_long_geometry)
            
        # Check if the intersection result is valid
        if not intersection_geometry.is_empty:
            # Create a unique ID for the intersection using the year and indices
            geographic_unit_id = f"{year}_{bg_id}_{lat}_{long}"

            # Append intersection information to a list
            new_unit_df_2012 = pd.concat([new_unit_df_2012, pd.DataFrame({
                'geographic_unit_id': [geographic_unit_id],
                'BG_id': [bg_id],
                'latitude': [lat],
                'longitude': [long],
                'geometry': [intersection_geometry],
                'year': [year],
                'state': [state]
            })], ignore_index=True)

In [None]:
# Create an empty GeoDataFrame to store the intersection results
new_unit_df_2010 = gpd.GeoDataFrame(columns=['geographic_unit_id', 'BG_id', 'latitude', 'longitude', 'geometry', 'year', 'state'])

# Iterate through each row in BG_df_2010 and each row in lat_long_df to find intersections
for idx_bg, row_bg in BG_df_2010.iterrows():
    year = row_bg['year']
    bg_id = row_bg['GEOID']
    bg_geometry = row_bg['geometry']
    state = row_bg['state']
    
    for idx_lat_long, row_lat_long in lat_long_df.iterrows():
        
        lat_long_geometry = row_lat_long['geometry']
        lat = row_lat_long['latitude']
        long = row_lat_long['longitude']
        # Compute intersection geometry

        intersection_geometry = bg_geometry.intersection(lat_long_geometry)
            
        # Check if the intersection result is valid
        if not intersection_geometry.is_empty:
            # Create a unique ID for the intersection using the year and indices
            geographic_unit_id = f"{year}_{bg_id}_{lat}_{long}"

            # Append intersection information to a list
            new_unit_df_2010 = pd.concat([new_unit_df_2010, pd.DataFrame({
                'geographic_unit_id': [geographic_unit_id],
                'BG_id': [bg_id],
                'latitude': [lat],
                'longitude': [long],
                'geometry': [intersection_geometry],
                'year': [year],
                'state': [state]
            })], ignore_index=True)

In [None]:
# Create an empty GeoDataFrame to store the intersection results
new_unit_df_2000 = gpd.GeoDataFrame(columns=['geographic_unit_id', 'BG_id', 'latitude', 'longitude', 'geometry', 'year', 'state'])

# Iterate through each row in BG_df_2000 and each row in lat_long_df to find intersections
for idx_bg, row_bg in BG_df_2000.iterrows():
    year = row_bg['year']
    bg_id = row_bg['GEOID']
    bg_geometry = row_bg['geometry']
    state = row_bg['state']
    
    for idx_lat_long, row_lat_long in lat_long_df.iterrows():
        
        lat_long_geometry = row_lat_long['geometry']
        lat = row_lat_long['latitude']
        long = row_lat_long['longitude']
        # Compute intersection geometry

        intersection_geometry = bg_geometry.intersection(lat_long_geometry)
            
        # Check if the intersection result is valid
        if not intersection_geometry.is_empty:
            # Create a unique ID for the intersection using the year and indices
            geographic_unit_id = f"{year}_{bg_id}_{lat}_{long}"

            # Append intersection information to a list
            new_unit_df_2000 = pd.concat([new_unit_df_2000, pd.DataFrame({
                'geographic_unit_id': [geographic_unit_id],
                'BG_id': [bg_id],
                'latitude': [lat],
                'longitude': [long],
                'geometry': [intersection_geometry],
                'year': [year],
                'state': [state]
            })], ignore_index=True)

In [None]:
# Create an empty GeoDataFrame to store the intersection results
new_unit_df_1990 = gpd.GeoDataFrame(columns=['geographic_unit_id', 'BG_id', 'latitude', 'longitude', 'geometry', 'year', 'state'])

# Iterate through each row in BG_df_1990 and each row in lat_long_df to find intersections
for idx_bg, row_bg in BG_df_1990.iterrows():
    year = row_bg['year']
    bg_id = row_bg['GEOID']
    bg_geometry = row_bg['geometry']
    state = row_bg['state']
    
    for idx_lat_long, row_lat_long in lat_long_df.iterrows():
        
        lat_long_geometry = row_lat_long['geometry']
        lat = row_lat_long['latitude']
        long = row_lat_long['longitude']
        # Compute intersection geometry

        intersection_geometry = bg_geometry.intersection(lat_long_geometry)
            
        # Check if the intersection result is valid
        if not intersection_geometry.is_empty:
            # Create a unique ID for the intersection using the year and indices
            geographic_unit_id = f"{year}_{bg_id}_{lat}_{long}"

            # Append intersection information to a list
            new_unit_df_1990 = pd.concat([new_unit_df_1990, pd.DataFrame({
                'geographic_unit_id': [geographic_unit_id],
                'BG_id': [bg_id],
                'latitude': [lat],
                'longitude': [long],
                'geometry': [intersection_geometry],
                'year': [year],
                'state': [state]
            })], ignore_index=True)

In [None]:
# Stack the per-vintage results (newest vintage first) under a fresh 0..n-1 index.
yearly_unit_frames = [
    new_unit_df_2021,
    new_unit_df_2012,
    new_unit_df_2010,
    new_unit_df_2000,
    new_unit_df_1990,
]
new_unit_df = pd.concat(yearly_unit_frames, ignore_index=True)

In [None]:
# Serialise the geometries to WKT text. Values that are already strings are
# passed through unchanged, so re-running this cell no longer fails with
# "'str' object has no attribute 'wkt'" after the first run.
new_unit_df['geometry'] = new_unit_df['geometry'].apply(
    lambda geom: geom if isinstance(geom, str) else geom.wkt
)

# Write through a plain pandas DataFrame so the output is an ordinary table
# with a WKT text column, independent of GeoDataFrame's own Parquet writer.
# The path is relative, so the file lands in the kernel's working directory.
pd.DataFrame(new_unit_df).to_parquet("new_unit_geometry.parquet.gzip", compression='gzip')