# SWOT data acquisition

#### Jonas Felipe Santos de Souza (jonas.ssouza@ufpe.br)

#### Federal University of Pernambuco

#### June 10, 2025

---

## Libraries

<div class="alert alert-block alert-warning"><b>IMPORTANT:</b> check if the libraries below are installed.</div>

In [None]:
from ipyleaflet import Map, DrawControl, GeoJSON, Popup, Rectangle
from shapely.geometry import Polygon
from IPython.display import display
import matplotlib.pyplot as plt
from ipywidgets import HTML
import geopandas as gpd
from pathlib import Path
import pandas as pd
import earthaccess
import warnings
import zipfile
import time
import glob
import os

In [None]:
# Silence every warning category for the rest of the session to keep the notebook output readable.
warnings.simplefilter("ignore")

---

## Directories

In [None]:
# Root folder of the project; every other path below is derived from it.
inpath = 'C:/Users/crist/Desktop/Doctorado/paper/SWOT/swot_rivers/'

# Folder that receives the downloaded SWOT products (*.zip)
swotpath = inpath + 'products/'

# *.csv file with the SWOT IDs of the river sections
swot_id = inpath + 'reachesidv17biobio.csv'

# Folder where the SWOT data is saved after extraction
swot_data = inpath + 'River_SP_v17/reaches/'

---

## River_SP SWOT database acquisition

The database must be obtained from the *hydroweb.next* platform (https://hydroweb.next.theia-land.fr/).

<div class="alert alert-block alert-warning"><b>SKIP THIS STEP IF YOU ALREADY HAVE THE DATABASE YOU ARE INTERESTED IN.</b> </div>

In [None]:
# Setup instructions for py-hydroweb; printed when the package cannot be imported.
help_message = """
Download products from your hydroweb.next projects (https://hydroweb.next.theia-land.fr) using the py-hydroweb lib (https://pypi.org/project/py-hydroweb/)
This script is an example tuned for your last hydroweb.next project but feel free to adapt it for future requests.
Follow these steps:
1. If not already done, install py-hydroweb latest version using `pip install -U py-hydroweb` (WARNING: python >= 3.8 is required)
2a. Generate an API-Key from hydroweb.next portal in your user settings
2b. Carefully store your API-Key (2 options):
- either in an environment variable `export HYDROWEB_API_KEY="<your_key_here>"`
- or in below script by replacing <your_key_here>
3. You can change download directory by adding an `output_folder` parameter when calling `submit_and_download_zip` (see below). By default, current path is used.
4. You are all set, run this script `python download_script.py`

For more documentation about how to use the py-hydroweb lib, please refer to https://pypi.org/project/py-hydroweb/.
"""

In [None]:
import logging
import sys
from datetime import datetime
from importlib.metadata import version

# py-hydroweb is required for the database download: when it is missing,
# show the setup instructions and stop.
try:
    import py_hydroweb
except ImportError:
    print(help_message)
    # `exit` is a convenience helper added by the `site` module for interactive
    # sessions and is not guaranteed to exist; `sys.exit` is the supported call.
    sys.exit(1)

In [None]:
# Check py-hydroweb version
import re
from importlib.metadata import PackageNotFoundError


def version_key(version_string):
    """Convert a dotted version string into a tuple of ints for comparison.

    Only the leading digits of each dot-separated component are used, and
    parsing stops at the first component that does not start with a digit,
    so "1.0.10" becomes (1, 0, 10) and "1.0.2rc1" becomes (1, 0, 2).
    Comparing these tuples orders versions numerically, whereas comparing the
    raw strings would place "1.0.10" before "1.0.2".
    """
    key = []
    for component in version_string.split("."):
        leading_digits = re.match(r"[0-9]+", component)
        if leading_digits is None:
            break
        key.append(int(leading_digits.group()))
    return tuple(key)


latest_version = "1.0.2"
try:
    installed_version = version("py_hydroweb")
except PackageNotFoundError:
    # The check is only advisory, so missing package metadata must not stop the notebook.
    installed_version = None
    logging.getLogger().warning(
        "Could not determine the installed py-hydroweb version; skipping the version check."
    )

if installed_version is not None and version_key(installed_version) < version_key(latest_version):
    # The backslash is escaped so the message is unchanged but no longer
    # relies on an invalid escape sequence.
    logging.getLogger().warning(f"""\033[33m
/!\\ Consider upgrading py-hydroweb to {latest_version} using `pip install -U py-hydroweb`
\033[0m""")

In [None]:
# Set log config: write log records of level INFO and above to standard output
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

<div class="alert alert-block alert-warning"><b>IMPORTANT:</b> Check in your hydroweb.next account if the <b>API key</b> has been generated and is active.</div>

In [None]:
# API key
# SECURITY: the key is a credential and must not be stored in the notebook.
# It is read from the HYDROWEB_API_KEY environment variable instead.
# NOTE(review): a key used to be written here in clear text; revoke it in the
# hydroweb.next account and generate a new one.
api_hydroweb = os.environ.get("HYDROWEB_API_KEY")
if not api_hydroweb:
    print("HYDROWEB_API_KEY is not set: define it in the environment before creating the client.")

In [None]:
# Create the hydroweb.next API client
#  - option 1: using the API-Key environment variable (HYDROWEB_API_KEY)
#client: py_hydroweb.Client = py_hydroweb.Client("https://hydroweb.next.theia-land.fr/api")
#  - option 2 (active): explicitly giving the API-Key through `api_key`
#    (to switch to option 1, uncomment the line above and comment the lines below)
client: py_hydroweb.Client = py_hydroweb.Client("https://hydroweb.next.theia-land.fr/api", 
                                                api_key=api_hydroweb)

In [None]:
# Initiate a new download basket (the string is the basket name; input the name you want here)
basket: py_hydroweb.DownloadBasket = py_hydroweb.DownloadBasket("my_download_basket")

In [None]:
# Add collections to the basket
# Enter the coordinates of the region of interest in the `bbox` field
# (presumably [min_lon, min_lat, max_lon, max_lat] in degrees; confirm in the py-hydroweb docs)
# Collection names:
# "SWOT_PRIOR_RIVER_DATABASE"
# "SWOT_PRIOR_LAKE_DATABASE"
basket.add_collection("SWOT_PRIOR_RIVER_DATABASE", 
        #bbox=[-41.40, -9.60, -34.74, -7.10])
        bbox=[-74.091797, -38.894373, -70.378418, -36.300877])

In [None]:
# Do download (input the archive name you want here, and optionally an output folder)
# The archive name carries a YYYYMMDDTHHMMSS timestamp of the moment this cell runs.
now = datetime.today().strftime("%Y%m%dT%H%M%S")
# NOTE(review): `zip_filename` receives a full path built from `inpath`; confirm that
# py-hydroweb accepts a path here rather than a bare file name plus `output_folder`.
downloaded_zip_path: str = client.submit_and_download_zip(
    basket,
    zip_filename=f"{inpath}my_hydroweb_data_{now}.zip",
    #, output_folder = "<change_me>"
)

---

## SWOT product search data configuration

In [None]:
# Shapefile with the river reaches
# This file should be obtained from the SWOT database at hydroweb.next
shp = f'{inpath}SWOT_PRIOR_RIVER_DATABASE/sa_sword_reaches_hb66_v17.shp'

In [None]:
# Parameters for the SWOT product search
# Product options: 'SWOT_L2_HR_LakeSP_Prior_2.0', 'SWOT_L2_HR_RiverSP_Reach_2.0', 'SWOT_L2_HR_Raster_100m_2.0'
swot_product = 'SWOT_L2_HR_RiverSP_Reach_2.0'

# Short product label; options: 'SWOT_LakeSP', 'SWOT_Raster', 'SWOT_RiverSP'
short_product = 'SWOT_RiverSP'

# Search period (start date, end date)
date_start, date_end = '2024-01-01', '2024-12-31'

# Granule name pattern
granule_product = '*'

# Plot graphs (True or False)
ifplot = True

---

## Map for selecting the area of interest

In [None]:
# Compatibility helper for environments without Jupyter widget support
def show_map_with_fallback(map_widget, fallback_file='swot_map_fallback.html'):
    """
    Show the ipyleaflet map and also render a folium fallback of the same view.

    The widget is displayed first. A folium map is then built from the
    widget's ``center`` and ``zoom``, saved to ``fallback_file`` and displayed
    inline. The notebook-level variables ``polygon`` (selected area) and
    ``geojson_layer`` (filtered SWOT features) are drawn on the fallback map
    when they exist; a variable that has not been defined yet is skipped.

    Parameters
    ----------
    map_widget : object
        Map widget to display; its ``center`` and ``zoom`` attributes position
        the fallback map.
    fallback_file : str
        Path of the HTML file the fallback map is saved to.

    Returns
    -------
    None
        Problems are reported with ``print`` instead of being raised.
    """
    try:
        display(map_widget)
        print("If the map above shows 'Error displaying widget: model not found', use the fallback map below.")
    except Exception as e:
        print(f"Interactive widget could not be displayed: {e}")

    # Import folium on its own so that the installation hint is shown only
    # when folium is really missing, not for every rendering problem.
    try:
        import folium
    except ImportError as import_error:
        print(f"Fallback rendering failed: {import_error}")
        print("Install folium in this kernel/environment: pip install folium")
        return

    # Look the optional layers up by name: referencing them directly would
    # raise NameError when the corresponding cell has not been run yet.
    selected_area = globals().get('polygon')
    features_layer = globals().get('geojson_layer')

    try:
        from IPython.display import HTML

        # Build a widget-independent interactive fallback
        fmap = folium.Map(location=list(map_widget.center), zoom_start=int(map_widget.zoom), control_scale=True)

        # Draw selected area if available
        if selected_area is not None:
            folium.GeoJson(
                data={'type': 'Feature', 'geometry': selected_area.__geo_interface__},
                name='selected_area',
                style_function=lambda _: {'color': '#1f77b4', 'weight': 2, 'fillOpacity': 0.1}
            ).add_to(fmap)

        # Draw filtered SWOT features if available
        if features_layer is not None and getattr(features_layer, 'data', None):
            folium.GeoJson(
                data=features_layer.data,
                name='filtered_features',
                style_function=lambda _: {'color': 'green', 'weight': 2, 'fillOpacity': 0.2},
                tooltip=folium.GeoJsonTooltip(fields=['reach_id'], aliases=['Reach ID'], localize=True, sticky=False)
            ).add_to(fmap)

        folium.LayerControl(collapsed=False).add_to(fmap)
        fmap.save(fallback_file)

        # Display fallback inline in the same script output
        display(HTML(fmap._repr_html_()))
        print(f"Fallback map rendered inline and saved to: {fallback_file}")
    except Exception as fallback_error:
        # Best effort: the fallback is a convenience and must never stop the notebook.
        print(f"Fallback rendering failed: {fallback_error}")


In [None]:
# Interactive map + HTML fallback for environments without widget support
# NOTE(review): `m` is expected to be the map widget created in an earlier cell;
# confirm it is defined before running this cell.
show_map_with_fallback(m)


---

## Login and search for SWOT products in the *EarthData* database

You need to have an EarthData account (https://urs.earthdata.nasa.gov/).

<div class="alert alert-block alert-warning"><b>IMPORTANT:</b> Please make sure your EarthData account login and password are correct. </div>

In [None]:
# Show the bounds of the selected polygon; the same value is passed as the
# search bounding box in the next cell.
polygon.bounds

In [None]:
# Start from empty results so that the summary at the end of this cell (and
# the download step) still work when no polygon has been drawn.
results = []
items = []

# Check whether the polygon has been drawn
if 'polygon' in globals():
    # Earthdata login
    earthaccess.login()

    # Search for data within the bounds of the polygon
    results = earthaccess.search_data(short_name=swot_product,
                                      temporal=(date_start, date_end),
                                      #granule_name=granule_product,
                                      bounding_box=polygon.bounds)

    # Collect the native IDs of the granules found
    items = [item['meta']['native-id'] for item in results]
else:
    print("Nenhum polígono foi desenhado.")

# Display the granules found
print(len(items))
print(items)

---

## SWOT data download

In [None]:
# Function to download a file and retry if necessary
def download_file_with_retries(file_url, download_path, max_retries=3):
    """Download one item with earthaccess, retrying when an attempt fails.

    After each download call the most recently modified ``SWOT*.zip`` in
    ``download_path`` is inspected; the attempt counts as successful when that
    file is not empty.

    Parameters
    ----------
    file_url
        Item handed to ``earthaccess.download``.
    download_path : str
        Folder the item is downloaded to.
    max_retries : int
        Maximum number of attempts.

    Returns
    -------
    bool
        True when an attempt succeeded, False when every attempt failed.
    """
    retry_wait_seconds = 5

    for attempt in range(1, max_retries + 1):
        try:
            earthaccess.download(file_url, download_path)
            # NOTE(review): the newest archive in the folder is assumed to be the
            # one that was just downloaded; confirm this holds when the folder
            # already contains products.
            file = max(Path(download_path).glob('SWOT*.zip'), key=os.path.getmtime)
            if file.stat().st_size > 0:
                print(f"Successfully downloaded: {file}")
                return True
            # Clean up the incomplete file so it is not mistaken for a finished download
            file.unlink()
            raise OSError("File downloaded but appears to be incomplete.")
        except Exception as e:
            print(f"Attempt {attempt} failed: {e}")
            # Wait before retrying, but not after the final attempt
            if attempt < max_retries:
                time.sleep(retry_wait_seconds)

    print(f"Failed to download file after {max_retries} attempts.")
    return False

# Items to download: at most the first 500 search results
file_urls = results[:500]

# Download every item, retrying on failure
for file_url in file_urls:
    download_file_with_retries(file_url, swotpath)

# Report the most recently modified product in the download directory
files = glob.glob(swotpath + 'SWOT*.zip')
if files:
    file = max(files, key=os.path.getmtime)
    print(f"\nThe most recent file is: {file}")
else:
    print("\nNo files were downloaded.")

---

## Capturing data of interest from downloaded SWOT products

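<div class="alert alert-block alert-warning"><b>IMPORTANT:</b> This step requires a semicolon-separated *.csv file describing the river sections of interest. It must contain a <code>reach ID</code> column with the SWOT reach IDs; other columns (river name etc.) are not used by the extraction below.</div>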

In [None]:
# *.csv file with SWOT IDs of rivers
# The reach IDs are read as text: parsing them as numbers first would turn
# them into floats (e.g. '62281100021.0') whenever the column has an empty
# cell, and such strings can no longer match the reach IDs of the products.
reachid = pd.read_csv(swot_id, sep=';', decimal=',', dtype={'reach ID': str})
# Surrounding blanks are removed so that the IDs compare equal to the product values
reachid['reach ID'] = reachid['reach ID'].astype(str).str.strip()
reachid.head()

In [None]:
import tempfile

# Directory containing the zipped shapefiles and directory for the per-reach CSV files
zip_dir = swotpath
output_dir = swot_data

# Only rows whose reach_q value is one of these are kept
valid_reach_q = [0, 1]

# Columns of the output CSV files, in the order they are written
OUTPUT_COLUMNS = ['Date', 'wse', 'wse_u', 'slope', 'slope_u', 'width', 'width_u',
                  'area_total', 'area_detct', 'area_tot_u', 'area_det_u']
# Columns that are optional in the products (everything except Date and wse)
EXTRA_COLUMNS = OUTPUT_COLUMNS[2:]


def _load_product_table(zip_path):
    """Read the shapefile stored in one zipped product into a GeoDataFrame.

    The archive is extracted into a private temporary folder that is removed
    afterwards even when reading fails, so files left over from another
    archive can never be picked up. Returns None when the archive holds no
    ``.shp`` file.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        with zipfile.ZipFile(zip_path, 'r') as z:
            # Extract all the shapefile components (shp, shx, dbf, etc.)
            z.extractall(tmp_dir)
        shapefiles = sorted(Path(tmp_dir).rglob('*.shp'))
        if not shapefiles:
            return None
        return gpd.read_file(shapefiles[0])


def _select_reach_rows(gdf, reach_ids, quality_flags):
    """Return the rows of one product that belong to the requested reaches.

    The result has a ``reach_id`` column plus the output columns, with dates
    and numbers already converted; rows without a valid date or wse are
    dropped. Returns None when the product lacks one of the required columns
    (reach_id, reach_q, time_str/time, wse).
    """
    # Case-insensitive column mapping
    cols = {c.lower(): c for c in gdf.columns}
    reach_id_col = cols.get('reach_id')
    reach_q_col = cols.get('reach_q')
    time_col = cols.get('time_str') or cols.get('time')
    wse_col = cols.get('wse')
    if not (reach_id_col and reach_q_col and time_col and wse_col):
        return None

    # Filter the data based on reach_id and reach_q
    selected = gdf[gdf[reach_id_col].isin(reach_ids) & gdf[reach_q_col].isin(quality_flags)]

    table = pd.DataFrame({
        'reach_id': selected[reach_id_col],
        'Date': pd.to_datetime(selected[time_col], errors='coerce'),
        'wse': pd.to_numeric(selected[wse_col], errors='coerce'),
    })
    for out_col in EXTRA_COLUMNS:
        in_col = cols.get(out_col)
        # A column missing from the product is kept in the output, filled with NA
        table[out_col] = pd.to_numeric(selected[in_col], errors='coerce') if in_col else pd.NA

    return table.dropna(subset=['Date', 'wse'])


# Every archive is opened once and its rows are distributed to the reaches of
# interest, instead of extracting all archives again for each reach.
records_by_reach = {reach_id: [] for reach_id in reachid['reach ID']}

for filename in sorted(os.listdir(zip_dir)):
    if not (filename.endswith(".zip") and "Reach" in filename):
        continue
    zip_path = os.path.join(zip_dir, filename)

    try:
        gdf = _load_product_table(zip_path)
    except zipfile.BadZipFile as error:
        # A corrupt download must not abort the processing of the other products
        print(f"Skipping unreadable archive {zip_path}: {error}")
        continue
    if gdf is None:
        print(f"Skipping {zip_path}: no shapefile found in the archive.")
        continue

    table = _select_reach_rows(gdf, list(records_by_reach), valid_reach_q)
    if table is None:
        continue

    for reach_id, rows in table.groupby('reach_id', sort=False):
        records_by_reach[reach_id].extend(rows[OUTPUT_COLUMNS].itertuples(index=False, name=None))

# The output folder may not exist yet on a fresh setup
os.makedirs(output_dir, exist_ok=True)

for target_reach_id, records in records_by_reach.items():
    # Check if data was found
    if not records:
        print(f"No data found for reach_id {target_reach_id}.")
        continue

    # Sort the values by time
    records.sort(key=lambda record: record[0])
    df = pd.DataFrame(records, columns=OUTPUT_COLUMNS)

    # Save DataFrame to CSV, using the reach_id as the filename
    output_csv = os.path.join(output_dir, f'{target_reach_id}_{short_product}.csv')
    df.to_csv(output_csv, sep=',', decimal='.', encoding='utf-8', index=False)
    print(f"Data saved to {output_csv}")

    if ifplot:
        # Plotting
        plt.figure(figsize=(10, 6))
        plt.plot(df['Date'], df['wse'], marker='o', linestyle='-')
        plt.xlabel('Date')
        plt.ylabel('WSE (m) EGM08')
        plt.title(f'WSE over Time for Reach ID: {target_reach_id}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

