In [None]:
"""
Script to read, process, and save ISMN soil moisture data.

This script performs the following tasks:
1. Loads raw soil moisture files from the ISMN network matching
   specific depth patterns (0–0.05 m).
2. Filters the data by date and quality flags, averaging multiple
   measurements per day if needed.
3. Stores metadata for each station, including network name, station
   name, and coordinates.
4. Saves the processed soil moisture data and metadata into a single
   pickle file for later use in analysis or visualization.

All paths, date ranges, and file name patterns are modifiable.
"""

# Author: Gerard Portal
# Date: August 29, 2025  
# Contact: gerardportal@gmail.com

In [None]:
# Import libraries

from glob import glob
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from pathlib import Path
import pickle

In [None]:
# Variable initialization

# First and last day (both included) of the daily in situ soil moisture series (MODIFIABLE)
date_ini = datetime(year=2019, month=1, day=1)
date_fin = datetime(year=2022, month=12, day=31)

# Data folder path (MODIFIABLE)
hdd = 'E' # Drive letter (C,F,G,...)
# Folder searched recursively for the ISMN station files
path_dir = f'{hdd}:/Path to the downloaded ISMN soil moisture data/'
# Folder in which the resulting pickle file is written
path_is = f'{hdd}:/Path where the resulting file containing all the data will be saved/'

In [None]:
# Retrieve the full path of files matching specific patterns
# In this case, the target is soil moisture data for depths between 0 and 0.05 meters

str_name = ['*_sm_0.05*_0.05*_*.stm','*_sm_0.000000_0.05*_*.stm'] # Name pattern (MODIFIABLE)

# path_dir is a plain string and strings have no rglob(); the recursive search
# needs a pathlib.Path object.
search_root = Path(path_dir)

# A dict is used as an ordered set: a file matched by more than one pattern is
# kept only once, so no station ends up duplicated in the output arrays.
# Matches of each pattern are sorted so the station order is reproducible.
matched_files = {}
for pattern in str_name:
    for match in sorted(search_root.rglob(pattern)):
        matched_files.setdefault(match, None)
file_names = list(matched_files)

# Stop here with an explicit message instead of failing later on empty arrays
if not file_names:
    raise FileNotFoundError(f'No file matching {str_name} was found under "{path_dir}"')

In [None]:
# Loading and processing measurements from all in situ stations
#
# Results:
#   sm_insitu : float array of shape (number of files, number of days in the period).
#               Each cell is the mean of the values flagged 'G' on that day, or NaN
#               when the station has no such value on that day.
#   data      : string array with one row per file: the first 8 header fields
#               followed by the remaining header fields joined with spaces.

def _read_header_fields(path):
    """Return the whitespace-separated fields of the first non-blank line of `path`.

    The fields are read directly from the file because pandas renames repeated
    column labels (e.g. 'X', 'X.1'), which would alter the station metadata
    whenever two header fields are identical.
    An empty list is returned if the file has no non-blank line.
    """
    with open(path, encoding='utf-8') as header_file:
        for line in header_file:
            fields = line.split()
            if fields:
                return fields
    return []

n_days = (date_fin-date_ini).days+1
# Column of sm_insitu associated with each date string (YYYY/MM/DD) of the period
day_index = {(date_ini+timedelta(days=i_day)).strftime('%Y/%m/%d'): i_day for i_day in range(n_days)}

sm_insitu = np.full((len(file_names),n_days),np.nan)
data = []
for i_file, file_name in enumerate(file_names):
    # The first line of the file (station header) is consumed as column labels,
    # the following lines are the measurements.
    df = pd.read_csv(file_name, sep=r'\s+')
    dates_ar = df.iloc[:,0].values # Contains the dates: YYYY/MM/DD
    hour_ar = df.iloc[:,1].values # Contains the hours: HH:MM
    sm_ar = df.iloc[:,2].values # Contains the soil moisture values in m³/m³
    flag_ar = df.iloc[:,3].values # Contains the quality flags, G: Good

    # Station metadata: name of the network, name of the station, coordinates, ...
    header_fields = _read_header_fields(file_name)
    info_ar = header_fields[0:8]
    tmp_name = ' '.join(header_fields[8:])
    info_ar.append(tmp_name)
    data.append(info_ar)

    # Filter by quality flag. Time can also be specified, e.g., "... & (hour_ar=='07:00')".
    # The comparison is done on the pandas column so that the result is always a
    # boolean array, even when the flag column holds no string at all.
    good_mask = (df.iloc[:,3]=='G').to_numpy()

    # Translate the date of every selected row into its column of sm_insitu;
    # dates outside [date_ini, date_fin] are mapped to NaN and discarded.
    day_pos = pd.Series(dates_ar[good_mask]).map(day_index)
    in_period = day_pos.notna().to_numpy()
    if in_period.any():
        good_sm = pd.Series(sm_ar[good_mask][in_period])
        good_days = day_pos.to_numpy()[in_period].astype(int)
        # In this example, all hours are selected, and the soil moisture for the selected hours is averaged.
        # A single grouped pass replaces one full scan of the file per day of the period.
        for i_day, day_values in good_sm.groupby(good_days):
            sm_insitu[i_file,i_day] = np.nanmean(day_values.to_numpy())

    processed = 100*(i_file+1)/len(file_names)
    print(f'Processed: {processed:.2f}%', end='\r')

# np.stack cannot handle an empty list; return an empty metadata table instead
data = np.stack(data) if data else np.empty((0,9),dtype=str)

In [None]:
# Save in situ station data, including metadata (network name, station name, coordinates) and soil moisture values

# Output file name: ISMN_SM_<first day>_<last day>.pckl, with days written as YYYYMMDD
out_name = f'ISMN_SM_{date_ini:%Y%m%d}_{date_fin:%Y%m%d}.pckl'

# The pickle holds a two-element list: [soil moisture array, metadata array]
with open(path_is+out_name,'wb') as out_file:
    pickle.dump([sm_insitu,data],out_file)