[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ulfboge/temporal-landcover-vectorizer/blob/main/polygon_extraction.ipynb)

# Polygon Extraction from Shapefiles

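This notebook reads every shapefile in a Google Drive folder and extracts the polygons that contain (or, when a buffer distance is given, intersect the buffer around) sampled coordinates loaded from a CSV file. It uses GeoPandas for the spatial join and saves the results to a single GeoPackage file, with one layer per shapefile.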

In [None]:
# Import required libraries
from google.colab import drive
import pandas as pd
import geopandas as gpd
import os
from shapely.geometry import Point
import warnings
from pathlib import Path

warnings.filterwarnings('ignore')

In [None]:
# Attach Google Drive to the Colab runtime so the files under MyDrive are reachable
drive.mount('/content/drive')

# Folder holding the input shapefiles and the sampled-coordinates CSV
input_directory = "/content/drive/MyDrive/earthengine/conversion/vector"
# The GeoPackage with the extracted polygons is written next to the inputs
output_gpkg = os.path.join(input_directory, "extracted_polygons.gpkg")

## Define Helper Functions

The following cells contain helper functions for loading coordinates and processing the data.

In [None]:
# Function to load sampled coordinates
def load_sampled_coordinates(csv_path):
    """Read a CSV of sampled data and return its distinct coordinate pairs.

    Args:
        csv_path: Path to a CSV file that has ``x_coord`` and ``y_coord``
            columns (any other columns are ignored).

    Returns:
        A DataFrame with only the ``x_coord`` and ``y_coord`` columns, with
        repeated pairs removed. The row labels of the surviving rows are
        kept from the CSV, so the index may have gaps.

    Raises:
        KeyError: If either coordinate column is missing from the CSV.
    """
    coordinate_columns = ['x_coord', 'y_coord']
    sampled = pd.read_csv(csv_path)
    coordinate_pairs = sampled[coordinate_columns]
    return coordinate_pairs.drop_duplicates()

# Function to create a points GeoDataFrame
def create_points_gdf(coords_df, crs):
    """Build a point GeoDataFrame from a table of coordinate pairs.

    Args:
        coords_df: DataFrame with ``x_coord`` and ``y_coord`` columns. Each
            row becomes one point, with ``x_coord`` as x and ``y_coord`` as y.
        crs: Coordinate reference system the coordinates are expressed in;
            passed straight through to ``GeoDataFrame``.

    Returns:
        A GeoDataFrame holding the columns of ``coords_df`` plus a point
        ``geometry`` column, tagged with ``crs``.
    """
    # points_from_xy builds all points in one vectorized call instead of
    # constructing a shapely Point per row in a Python loop. It returns a
    # plain geometry array, so it lines up with coords_df row by row even
    # when coords_df's index has gaps.
    geometry = gpd.points_from_xy(coords_df['x_coord'], coords_df['y_coord'])
    return gpd.GeoDataFrame(coords_df, geometry=geometry, crs=crs)

In [None]:
# Function to extract intersecting polygons
def extract_polygons(points_gdf, shapefile_path, buffer_distance=0):
    """Return the polygons of a shapefile that are hit by the given points.

    Args:
        points_gdf: GeoDataFrame of points. It is reprojected to the
            shapefile's CRS when the two differ; the caller's object is not
            modified.
        shapefile_path: Path of the polygon file to read.
        buffer_distance: When greater than 0, each point is buffered by this
            distance and polygons that intersect a buffer are selected.
            Otherwise only polygons that contain a point are selected. The
            buffer is applied after reprojection, so the distance is in the
            units of the polygon layer's CRS (NOTE(review): for a geographic
            CRS that would be degrees - confirm before relying on it).

    Returns:
        A GeoDataFrame with one row per selected polygon, indexed by the
        polygon's row label in the source file. In the non-buffered case the
        join also attaches the point columns; when several points hit the
        same polygon, the values of the first match in the join result are
        kept.
    """
    polygons_gdf = gpd.read_file(shapefile_path)

    # The spatial join is only meaningful when both layers share one CRS
    if points_gdf.crs != polygons_gdf.crs:
        points_gdf = points_gdf.to_crs(polygons_gdf.crs)

    if buffer_distance > 0:
        search_areas = gpd.GeoDataFrame(
            geometry=points_gdf.geometry.buffer(buffer_distance),
            crs=points_gdf.crs,
        )
        intersecting_polygons = gpd.sjoin(
            polygons_gdf, search_areas, how='inner', predicate='intersects'
        )
    else:
        intersecting_polygons = gpd.sjoin(
            polygons_gdf, points_gdf, how='inner', predicate='contains'
        )

    # index_right only records which point matched; it is not polygon data
    if 'index_right' in intersecting_polygons.columns:
        intersecting_polygons = intersecting_polygons.drop(columns=['index_right'])

    # The join emits one row per (polygon, point) match, so a polygon hit by
    # several points appears several times. Those rows differ in the point
    # columns, which is why a plain drop_duplicates() cannot collapse them.
    # De-duplicate on the polygon's own index so each polygon appears once.
    repeated_match = intersecting_polygons.index.duplicated(keep='first')
    return intersecting_polygons[~repeated_match]

## Process the Data

Now we'll load the coordinates and process each shapefile.

In [None]:
# Locate the sampled-coordinates CSV and stop early when it is absent
sampled_data_path = os.path.join(input_directory, "sampled_data_with_coords.csv")
if not os.path.exists(sampled_data_path):
    raise FileNotFoundError(f"Sampled data file not found: {sampled_data_path}")

# Read the distinct (x_coord, y_coord) pairs from the CSV
coords_df = load_sampled_coordinates(sampled_data_path)
print(f"Loaded {len(coords_df)} unique coordinate pairs")

# Convert the pairs to point geometries; the coordinates are assumed to be EPSG:4326
points_gdf = create_points_gdf(coords_df, crs="EPSG:4326")

In [None]:
# Process each shapefile in the input folder; every shapefile becomes one
# layer (named after the file) in the output GeoPackage.
failed_layers = []
for shapefile in Path(input_directory).glob("*.shp"):
    layer_name = shapefile.stem
    print(f"Processing {layer_name}")

    try:
        intersecting_polygons = extract_polygons(points_gdf, shapefile)
        # Use the default write mode rather than appending: writing with an
        # explicit layer name adds or replaces just that layer in an existing
        # GeoPackage, so re-running the notebook does not pile duplicate
        # features onto layers written by a previous run.
        intersecting_polygons.to_file(output_gpkg, layer=layer_name, driver="GPKG")
        print(f"Saved {len(intersecting_polygons)} polygons for layer {layer_name}")
    except Exception as e:
        # Best effort: one bad shapefile must not stop the remaining ones,
        # but remember it so the final summary is accurate.
        failed_layers.append(layer_name)
        print(f"Error processing {layer_name}: {str(e)}")

if failed_layers:
    print(f"\nFinished with errors; {len(failed_layers)} layer(s) were not saved to {output_gpkg}: {', '.join(failed_layers)}")
else:
    print(f"\nAll layers have been processed and saved to {output_gpkg}")