# NDVI Vector Data Processing - Shapefile to GeoPackage Conversion

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ulfboge/temporal-landcover-vectorizer/blob/main/combine_ndvi_vectors.ipynb)

This notebook processes NDVI vector files (shapefiles) stored in Google Drive, combining the data from multiple years (2013-2023) into a single GeoPackage file per area.

## Connect to Google Drive
First, let's mount your Google Drive to access the necessary files.

In [None]:
# Mount Google Drive at /content/drive so files stored there can be accessed by path.
from google.colab import drive
drive.mount('/content/drive')

## Setup
Now, let's install the required packages and import libraries.

In [None]:
!pip install pandas geopandas numpy

In [None]:
import os
import glob
import pandas as pd
import geopandas as gpd
from pathlib import Path
import warnings
# Suppress every Python warning from here on (no category filter is given).
# NOTE(review): this also hides deprecation and data-quality warnings raised by
# the libraries in use; confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

## Define Processing Functions

In [None]:
def get_area_files(base_path, area_number):
    """Return the NDVI shapefile paths for one area across all years.

    Looks directly inside ``base_path`` (non-recursive) for files named
    ``NDVI_JanMay_*_Area_<area_number>_polygons.shp``.

    Args:
        base_path: Directory that contains the input shapefiles.
        area_number: Area identifier exactly as it appears in the filename.

    Returns:
        A list of matching paths sorted lexicographically (for four-digit
        years in the wildcard position this is chronological order). The
        list is empty when nothing matches.
    """
    # Escape the directory part so characters such as "[", "*" or "?" in the
    # path are matched literally; only the filename carries a wildcard.
    directory = glob.escape(os.fspath(base_path))
    pattern = os.path.join(
        directory,
        f'NDVI_JanMay_*_Area_{area_number}_polygons.shp',
    )
    # glob gives no ordering guarantee; sort so callers see a stable order.
    return sorted(glob.glob(pattern))

def extract_year(filename):
    """Return the year token embedded in an NDVI shapefile name.

    The basename is split on underscores and the first token made up only of
    digits is returned, as a string. For names such as
    ``NDVI_JanMay_2013_Area_1_polygons.shp`` that token is the year, because
    it precedes the area number. Directory components are ignored.

    Args:
        filename: Path or bare filename of the shapefile.

    Returns:
        The first all-digit token of the basename, as a string.

    Raises:
        ValueError: If the basename contains no all-digit token.
    """
    basename = os.path.basename(filename)
    for token in basename.split('_'):
        if token.isdigit():
            return token
    # Raise a descriptive error instead of leaking a bare StopIteration from
    # next(), which is easy to misread and can silently end an enclosing
    # iteration.
    raise ValueError(f"No year found in filename: {filename}")

def process_area(base_path, area_number):
    """Combine all yearly shapefiles of one area into a single GeoDataFrame.

    In every input file the ``y`` column is renamed to ``y<year>``, where the
    year is taken from the filename. The first file (in sorted path order) is
    kept with all of its columns; each later file contributes only
    ``pixel_id``, its ``y<year>`` column and ``geometry``, and is outer-merged
    onto the running result using ``pixel_id`` and ``geometry`` as keys.

    Args:
        base_path: Directory that contains the input shapefiles.
        area_number: Area identifier exactly as it appears in the filenames.

    Returns:
        The merged GeoDataFrame, or ``None`` when no shapefile matches the
        area (a message is printed in that case).
    """
    # glob returns matches in arbitrary order; sorting here fixes which file
    # becomes the base layer and keeps the y<year> column order stable
    # between runs.
    files = sorted(get_area_files(base_path, area_number))
    if not files:
        print(f"No shapefile files found for Area {area_number}")
        return None

    print(f"Found {len(files)} shapefiles for Area {area_number}")

    # The first file supplies the base structure: all of its columns are kept.
    first_file, *remaining_files = files
    base_gdf = gpd.read_file(first_file)
    year = extract_year(first_file)
    base_gdf = base_gdf.rename(columns={'y': f'y{year}'})
    print(f"Processed first file for year {year}")

    # Fold every remaining year into the base layer.
    for file in remaining_files:
        year = extract_year(file)
        print(f"Processing file for year {year}")
        year_column = f'y{year}'
        temp_gdf = gpd.read_file(file).rename(columns={'y': year_column})

        # Join on pixel_id AND geometry; the outer join keeps features that
        # are present in only some of the years.
        base_gdf = base_gdf.merge(
            temp_gdf[['pixel_id', year_column, 'geometry']],
            on=['pixel_id', 'geometry'],
            how='outer',
        )

    return base_gdf

## Process All Areas

In [None]:
# Directory under the mounted Drive that holds the input shapefiles
base_path = '/content/drive/MyDrive/earthengine/conversion/vector'

print(f"\nStarting processing with base path: {base_path}")
all_files = glob.glob(os.path.join(base_path, 'NDVI_JanMay_*_Area_*_polygons.shp'))
print(f"Found {len(all_files)} input shapefiles")

if all_files:
    # Collect the distinct area identifiers: the token that follows "Area"
    # in each underscore-separated basename.
    area_numbers = set()
    for shp_path in all_files:
        tokens = os.path.basename(shp_path).split('_')
        if 'Area' not in tokens:
            print(f"Warning: Could not process filename: {shp_path}")
            continue
        marker_pos = tokens.index('Area')
        if marker_pos + 1 < len(tokens):
            area_num = tokens[marker_pos + 1]
            area_numbers.add(area_num)

    print(f"\nFound {len(area_numbers)} unique areas: {sorted(area_numbers)}")

    # Combine and export one GeoPackage per area.
    for area_num in sorted(area_numbers):
        print(f"\nProcessing Area {area_num}...")
        result_gdf = process_area(base_path, area_num)

        if result_gdf is None:
            print(f"No data to save for Area {area_num}")
            continue

        output_path = os.path.join(base_path, f'NDVI_Combined_Area_{area_num}.gpkg')
        try:
            result_gdf.to_file(output_path, driver='GPKG')
            print(f"Successfully saved combined data for Area {area_num} to {output_path}")

            # Read the file back to confirm what was actually written.
            if os.path.exists(output_path):
                verify_gdf = gpd.read_file(output_path)
                print(f"Verification - Output columns: {verify_gdf.columns.tolist()}")
                print(f"Verification - Number of records: {len(verify_gdf)}")
        except Exception as e:
            print(f"Error saving file for Area {area_num}: {e}")
else:
    # Nothing matched: show hints plus the actual directory contents.
    print("No input shapefiles found. Please check:")
    print(f"1. The path is correct: {base_path}")
    print("2. Files follow the pattern: NDVI_JanMay_*_Area_*_polygons.shp")
    print("\nFiles in directory:")
    print(glob.glob(os.path.join(base_path, '*')))

print("\nProcessing complete!")

## Verify Results

In [None]:
# Inspect the combined GeoPackage files found under base_path
try:
    output_files = glob.glob(os.path.join(base_path, 'NDVI_Combined_Area_*.gpkg'))

    if output_files:
        print(f"\nFound {len(output_files)} output geopackage files")
        # Use the first match as a sample of the output structure.
        sample_file = output_files[0]
        sample_gdf = gpd.read_file(sample_file)
        print("\nOutput file structure:")
        print(sample_gdf.columns.tolist())
        print("\nFirst few rows:")
        print(sample_gdf.head())
        print("\nOutput files created:")
        for gpkg_path in output_files:
            print(f"- {os.path.basename(gpkg_path)}")
    else:
        # No outputs: print troubleshooting hints and the directory listing.
        print("\nNo output geopackage files found. Please check:")
        print("1. The processing completed successfully")
        print("2. The files were saved in the correct location")
        print(f"3. You have write permissions in {base_path}")
        print("\nCurrent files in directory:")
        print(glob.glob(os.path.join(base_path, '*')))

except Exception as err:
    # Report the failure together with enough context to debug the path.
    print(f"\nError during verification: {err}")
    print("\nDebug information:")
    print(f"Base path: {base_path}")
    print("Files in directory:")
    print(glob.glob(os.path.join(base_path, '*')))