In [None]:
"""
    This notebook extracts IMERG precipitation time series for multiple stations.
    
    contact
    ----------
    Dr. KENNETH EKPETERE | kenneth.ekpetere@gmail.com

    """

### **Extract IMERG Precipitation for all Station points/pixels**

In [1]:
import geemap
import ee
import pandas as pd
import time
import os
import csv
# import time as tm
# from datetime import timedelta, datetime

In [3]:
# # Authenticate Earth Engine.
# ee.Authenticate()

In [4]:
# Initialize the Earth Engine module.
# Every later cell that touches `ee.*` depends on this call having run in the
# current kernel session.
# NOTE(review): presumably relies on credentials cached by an earlier
# ee.Authenticate() run (that call is commented out in the previous cell) — confirm.
ee.Initialize()

In [9]:

# Function to extract IMERG time series for a given station/pixel
def extract_imerg_time_series(lat, lon, first_year=2000, last_year=2024, scale=11132):
    """Extract the IMERG V07 precipitation time series at one station/pixel.

    The collection is queried one calendar year at a time and the yearly
    results are concatenated.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the station, in that order.
        (The Earth Engine point itself is built as ``Point(lon, lat)``.)
    first_year, last_year : int, optional
        Inclusive range of calendar years to extract. Defaults reproduce the
        original hard-coded 2000-2024 window used for v07.
    scale : int or float, optional
        Value passed to ``getRegion(..., scale=...)``.
        NOTE(review): 11132 is presumably ~0.1 degree in metres, chosen to
        match the IMERG grid — confirm.

    Returns
    -------
    pandas.DataFrame
        Columns ``time`` (datetime, converted from epoch milliseconds) and
        ``precipitation``, in the order returned by Earth Engine per year.
        Empty (with those two columns) when the year range is empty.
    """
    # IMERG dataset
    # v06 alternative (2000 - 2023); also switch `band` to 'precipitationCal':
    # dataset = ee.ImageCollection('NASA/GPM_L3/IMERG_V06').select('precipitationCal')

    # v07 => https://developers.google.com/earth-engine/datasets/catalog/NASA_GPM_L3_IMERG_V07#bands
    band = 'precipitation'
    dataset = ee.ImageCollection('NASA/GPM_L3/IMERG_V07').select(band)  # v07 (2000 - 2024)

    # Define point of interest from stn list
    point = ee.Geometry.Point(lon, lat)

    yearly_dfs = []

    # Iterate with plain Python integers: the year bounds are known client-side,
    # so there is no need for a server round trip (getInfo) just to compare them.
    for year in range(int(first_year), int(last_year) + 1):
        # Jan 1 of this year up to Jan 1 of the next; filterDate's end bound
        # is exclusive, so Dec 31 is fully included.
        start_year = ee.Date.fromYMD(year, 1, 1)
        end_year = ee.Date.fromYMD(year + 1, 1, 1)

        # Filter dataset by current year
        filtered = dataset.filterDate(start_year, end_year)

        # Extract time-series at the point; first row of the result is the header
        ts = filtered.getRegion(point, scale=scale).getInfo()

        # Keep only the columns of interest; copy so the assignment below
        # operates on an independent frame rather than a slice.
        df = pd.DataFrame(ts[1:], columns=ts[0])[['time', band]].copy()
        df['time'] = pd.to_datetime(df['time'], unit='ms')

        yearly_dfs.append(df)

    if not yearly_dfs:
        # pd.concat raises on an empty list; return a well-formed empty frame instead.
        return pd.DataFrame(columns=['time', band])

    # Concatenate all yearly dataframes into one
    combined_df = pd.concat(yearly_dfs, ignore_index=True)

    return combined_df

# Read input CSV file
# input_file = 'stn.csv'  # full stations (2360 - stations)
input_file = 'stn_tt.csv'     # test stations
output_folder = 'output_filesv7_test/'
# NOTE(review): the merge step later in this notebook reads the main series from
# "output_filesv7A", not from this folder — confirm which folder is intended.

# Create the output folder up front. Without it every to_csv call fails after
# the (slow) extraction has already been done, and the broad except below would
# only print the error for each station.
os.makedirs(output_folder, exist_ok=True)

data = pd.read_csv(input_file)

# Process each row in the CSV (expects columns 'ID', 'Lat', 'Lon')
for index, row in data.iterrows():
    # iterrows yields numeric values as floats, hence int() before str().
    # A missing or non-numeric ID skips that row instead of aborting the whole run.
    try:
        unique_id = str(int(row['ID']))  # Convert ID to string
    except (TypeError, ValueError):
        print(f"Skipping row {index}: invalid ID {row['ID']!r}")
        continue

    try:
        lat = row['Lat']
        lon = row['Lon']

        # Extract IMERG time series
        ts_df = extract_imerg_time_series(lat, lon)

        # Save output to CSV; downstream cells parse ID/lat/lon back out of this name
        filename = f"ts_{unique_id}_{lat}_{lon}.csv"
        output_path = os.path.join(output_folder, filename)
        ts_df.to_csv(output_path, index=False)

        print(f"Processed ID: {unique_id}. Saved to {output_path}")

    except Exception as e:
        # Best-effort: report the failure and continue with the next station
        print(f"Error processing ID {unique_id}: {str(e)}")

    # Pause for 5 seconds to prevent memory issues and respect GEE limitations
    time.sleep(5)

print("All IDs processed.")


Processed ID: 238003. Saved to output_filesv7B/ts_238003_40.22_-94.5444.csv
Processed ID: 237967. Saved to output_filesv7B/ts_237967_36.9839_-94.5356.csv
Processed ID: 221962. Saved to output_filesv7B/ts_221962_34.9175_-88.5228.csv
Processed ID: 215298. Saved to output_filesv7B/ts_215298_46.9833_-92.7333.csv
Processed ID: 135796. Saved to output_filesv7B/ts_135796_40.9486_-91.5647.csv
Processed ID: 130200. Saved to output_filesv7B/ts_130200_42.0208_-93.7742.csv
Processed ID: 5670. Saved to output_filesv7B/ts_5670_33.965_-112.4286.csv
All IDs processed.


#### **Retrieve December 31 Alone**

In [11]:

# Function to extract IMERG time series for a given station/pixel (dec 31 of every year alone)
def extract_imerg_time_series(lat, lon, first_year=2000, last_year=2024, scale=11132):
    """Extract IMERG V07 precipitation at one station/pixel for December 31 of each year.

    NOTE: this definition has the same name as the full-year extractor defined
    in an earlier cell and replaces it once this cell has been run.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the station, in that order.
        (The Earth Engine point itself is built as ``Point(lon, lat)``.)
    first_year, last_year : int, optional
        Inclusive range of calendar years. Defaults reproduce the original
        hard-coded 2000-2024 window used for v07.
    scale : int or float, optional
        Value passed to ``getRegion(..., scale=...)``.
        NOTE(review): 11132 is presumably ~0.1 degree in metres, chosen to
        match the IMERG grid — confirm.

    Returns
    -------
    pandas.DataFrame
        Columns ``time`` (datetime, converted from epoch milliseconds) and
        ``precipitation`` for the December 31 records of every year in range.
        Empty (with those two columns) when the year range is empty.
    """
    # IMERG dataset
    # v06 alternative (2000 - 2023); also switch `band` to 'precipitationCal':
    # dataset = ee.ImageCollection('NASA/GPM_L3/IMERG_V06').select('precipitationCal')

    # v07 => https://developers.google.com/earth-engine/datasets/catalog/NASA_GPM_L3_IMERG_V07#bands
    band = 'precipitation'
    dataset = ee.ImageCollection('NASA/GPM_L3/IMERG_V07').select(band)  # v07 (2000 - 2024)

    # Define point of interest from stn list
    point = ee.Geometry.Point(lon, lat)

    yearly_dfs = []

    # Iterate with plain Python integers: the year bounds are known client-side,
    # so there is no need for a server round trip (getInfo) just to compare them.
    for year in range(int(first_year), int(last_year) + 1):
        # One-day window: Dec 31 00:00 up to (but excluding) Jan 1 00:00
        dec31_start_date = ee.Date.fromYMD(year, 12, 31)
        dec31_end_date = dec31_start_date.advance(1, 'day')

        # Filter dataset to that single day
        filtered = dataset.filterDate(dec31_start_date, dec31_end_date)

        # Extract time-series at the point; first row of the result is the header
        ts = filtered.getRegion(point, scale=scale).getInfo()

        # Keep only the columns of interest; copy so the assignment below
        # operates on an independent frame rather than a slice.
        df = pd.DataFrame(ts[1:], columns=ts[0])[['time', band]].copy()
        df['time'] = pd.to_datetime(df['time'], unit='ms')

        yearly_dfs.append(df)

    if not yearly_dfs:
        # pd.concat raises on an empty list; return a well-formed empty frame instead.
        return pd.DataFrame(columns=['time', band])

    # Concatenate all yearly dataframes into one
    combined_df = pd.concat(yearly_dfs, ignore_index=True)

    return combined_df

# Read input CSV file
# input_file = 'stn.csv'  # full stations (2360 - stations)
input_file = 'stn_tt.csv'     # test - stations
output_folder = 'output_filesv7B/'

# Create the output folder up front. Without it every to_csv call fails after
# the (slow) extraction has already been done, and the broad except below would
# only print the error for each station.
os.makedirs(output_folder, exist_ok=True)

data = pd.read_csv(input_file)

# Process each row in the CSV (expects columns 'ID', 'Lat', 'Lon')
for index, row in data.iterrows():
    # iterrows yields numeric values as floats, hence int() before str().
    # A missing or non-numeric ID skips that row instead of aborting the whole run.
    try:
        unique_id = str(int(row['ID']))  # Convert ID to string
    except (TypeError, ValueError):
        print(f"Skipping row {index}: invalid ID {row['ID']!r}")
        continue

    try:
        lat = row['Lat']
        lon = row['Lon']

        # Extract IMERG time series (December 31 records, per the function defined in this cell)
        ts_df = extract_imerg_time_series(lat, lon)

        # Save output to CSV; the merge step matches files on the ID part of this name
        filename = f"ts_{unique_id}_{lat}_{lon}.csv"
        output_path = os.path.join(output_folder, filename)
        ts_df.to_csv(output_path, index=False)

        print(f"Processed ID: {unique_id}. Saved to {output_path}")

    except Exception as e:
        # Best-effort: report the failure and continue with the next station
        print(f"Error processing ID {unique_id}: {str(e)}")

    # Pause for 5 seconds to prevent memory issues and respect GEE limitations
    time.sleep(5)

print("All IDs processed.")


Processed ID: 930168. Saved to output_filesv7B/ts_930168_34.2016_-119.20700000000001.csv
Processed ID: 790061. Saved to output_filesv7B/ts_790061_29.98_-95.36.csv
Processed ID: 413415. Saved to output_filesv7B/ts_413415_33.6358_-97.1447.csv
Processed ID: 143984. Saved to output_filesv7B/ts_143984_37.9233_-95.4242.csv
Processed ID: 143810. Saved to output_filesv7B/ts_143810_39.6703_-95.5225.csv
Processed ID: 113290. Saved to output_filesv7B/ts_113290_41.8981_-90.1539.csv
Processed ID: 113262. Saved to output_filesv7B/ts_113262_42.2953_-89.6061.csv
Processed ID: 113109. Saved to output_filesv7B/ts_113109_38.7156_-88.5822.csv
Processed ID: 53488. Saved to output_filesv7B/ts_53488_39.1342_-108.5375.csv
Processed ID: 53477. Saved to output_filesv7B/ts_53477_38.0611_-102.3111.csv
Processed ID: 43791. Saved to output_filesv7B/ts_43791_40.3636_-122.965.csv
Processed ID: 43761. Saved to output_filesv7B/ts_43761_41.8042_-123.3758.csv
Processed ID: 15478. Saved to output_filesv7B/ts_15478_30.6883

#### **Merge December 31 into the main DataFrame**

In [12]:
# Function to merge december 31 of every year to the main dataframe
def merge_dataframes(folder_v7A, folder_v7B, output_folder):
    """Merge per-station CSVs from two folders into one time-sorted CSV per station.

    Files are paired on the station ID, taken as the second ``_``-separated
    token of the file name (``ts_<ID>_<lat>_<lon>.csv``). For every ``.csv`` in
    ``folder_v7A`` that has a partner in ``folder_v7B``, the two tables are
    concatenated, de-duplicated on ``time``, sorted by ``time`` and written to
    ``output_folder`` under the ``folder_v7A`` file name.

    Parameters
    ----------
    folder_v7A : str
        Folder holding the main time-series CSVs.
    folder_v7B : str
        Folder holding the supplementary (December 31) CSVs.
    output_folder : str
        Destination folder; created if it does not exist.

    Notes
    -----
    * Files whose name has no ``_`` cannot carry an ID and are skipped with a
      message instead of raising ``IndexError``.
    * Files in ``folder_v7A`` without a partner are skipped with a message.
    * When both inputs contain the same timestamp, the row from ``folder_v7A``
      is kept, so running the merge on overlapping inputs does not duplicate
      records.
    """
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Index folder_v7B by station ID once, so each lookup below is O(1).
    # setdefault keeps the first file seen for an ID.
    paths_v7B_by_id = {}
    for name_v7B in os.listdir(folder_v7B):
        if not name_v7B.endswith('.csv'):
            continue
        parts_v7B = name_v7B.split('_')
        if len(parts_v7B) < 2:
            continue  # no ID token; can never match anything
        paths_v7B_by_id.setdefault(parts_v7B[1], os.path.join(folder_v7B, name_v7B))

    for file_name in os.listdir(folder_v7A):
        if not file_name.endswith('.csv'):
            continue

        # Extract the ID from the file name
        parts_v7A = file_name.split('_')
        if len(parts_v7A) < 2:
            print(f"Skipping invalid file name format: {file_name}")
            continue
        id_v7A = parts_v7A[1]

        # Check if a corresponding file exists in folder_v7B
        path_v7B = paths_v7B_by_id.get(id_v7A)
        if path_v7B is None:
            print(f"No matching file in {folder_v7B} for ID {id_v7A}; skipping {file_name}")
            continue

        # Read the data from both files
        df_v7A = pd.read_csv(os.path.join(folder_v7A, file_name))
        df_v7B = pd.read_csv(path_v7B)

        # Merge the dataframes (A first, so A wins when de-duplicating)
        merged_df = pd.concat([df_v7A, df_v7B], ignore_index=True)
        merged_df['time'] = pd.to_datetime(merged_df['time'])  # Ensure time is datetime

        # Drop repeated timestamps before sorting, then order chronologically
        merged_df = (
            merged_df
            .drop_duplicates(subset='time', keep='first')
            .sort_values(by='time')
        )

        # Write out the merged dataframe to the output folder
        output_path = os.path.join(output_folder, file_name)
        merged_df.to_csv(output_path, index=False)


# Run the merge: folder A and folder B are combined per station into output_folder.
# NOTE(review): no cell in this notebook writes to "output_filesv7A" — confirm
# that folder is produced by a separate run before executing this cell.
folder_v7A, folder_v7B, output_folder = (
    "output_filesv7A",
    "output_filesv7B",
    "output_filesv7",
)

merge_dataframes(
    folder_v7A=folder_v7A,
    folder_v7B=folder_v7B,
    output_folder=output_folder,
)
print("All IDs merged completed.")

All IDs merged completed.


#### **Count Records**

In [16]:
# Function to check df record length
def count_df_records(imerg_folder, output_csv, expected_records=438333):
    """Write a per-station summary of record counts for the CSVs in a folder.

    Every ``ts_<ID>_<lat>_<lon>.csv`` file in ``imerg_folder`` contributes one
    row to ``output_csv`` with columns
    ``ID, lat, lon, record, diff, start_datetime, end_datetime``.

    Parameters
    ----------
    imerg_folder : str
        Folder containing the per-station time-series CSVs.
    output_csv : str
        Path of the summary CSV to write (overwritten if it exists).
    expected_records : int, optional
        Record count a complete file should have; ``diff`` is
        ``expected_records - record``. Defaults to 438333, the value that was
        previously hard-coded.

    Notes
    -----
    * ID, lat and lon are taken from the file name, not from the file content.
    * Files with an unexpected name, or that cannot be read / lack a ``time``
      column, are reported on stdout and left out of the summary.
    * ``start_datetime`` / ``end_datetime`` are the first and last ``time``
      values in file order (empty cells for a file with no rows).
    * Files are processed in sorted name order so the summary is reproducible.
    """
    suffix = '.csv'

    # Define the header for the output CSV file
    output_data = [['ID', 'lat', 'lon', 'record', 'diff', 'start_datetime', 'end_datetime']]

    # Iterate through each file in the folder
    for file_name in sorted(os.listdir(imerg_folder)):
        if not file_name.endswith(suffix):
            continue

        # Strip the extension once, then split into prefix / ID / lat / lon
        parts = file_name[:-len(suffix)].split('_')
        if len(parts) != 4 or not parts[1].isdigit():
            print(f"Skipping invalid file name format: {file_name}")
            continue
        _, station_id, lat, lon = parts

        # Construct the full file path
        file_path = os.path.join(imerg_folder, file_name)

        # Only the 'time' column is needed for counting and for the date range
        try:
            times = pd.read_csv(file_path, usecols=['time'])['time']
        except Exception as e:
            print(f"Error processing file {file_name}: {e}")
            continue

        record_count = len(times)
        start_datetime = times.iloc[0] if record_count else None
        end_datetime = times.iloc[-1] if record_count else None

        # Number of records missing relative to a complete series
        diff = expected_records - record_count

        # Append the data to the output list
        output_data.append([station_id, lat, lon, record_count, diff, start_datetime, end_datetime])

    # Write the output data to the CSV file
    with open(output_csv, mode='w', newline='') as csv_file:
        csv.writer(csv_file).writerows(output_data)

# Summarise record counts for the merged per-station files into one CSV.
Imerg_folder, output_csv = "output_filesv7", "Imerg_data_check.csv"
count_df_records(imerg_folder=Imerg_folder, output_csv=output_csv)
print("File Checks completed.")

File Checks completed.
