In [None]:
################################## KENNETH EKPETERE #################################
################################# KM PERCENTILE STATS ###############################
##################################### (C) 2024  ########################################

In [1]:
import time
import pandas as pd
import numpy as np
import sys
from datetime import date
from datetime import datetime, timedelta
import math
import statistics
from math import exp
from scipy.stats import tmean, tstd

##### **KM STATS (From Annual Max)**

In [3]:
# Compute percentiles of Frequency Factor Across Durations
def compute_FreqFact_percentile(max_file, freq_percentile_file, start_year=2001, end_year=2022):
    """Write percentile, mean and stdev summaries of per-ID frequency factors.

    For each unique ``ID`` and each duration column the frequency factor is

        (largest value - mean of the remaining values) / std of the remaining values

    where "remaining" means every non-missing value except the single largest
    one, and the standard deviation uses ``ddof=1``. The factors of all IDs are
    then summarised per duration and written to ``freq_percentile_file``.

    Missing values are removed *before* the largest value is trimmed. NumPy
    sorts NaN to the end of the array, so trimming the last sorted element
    without dropping NaN first would discard the NaN and leave the maximum
    inside the "trimmed" statistics.

    A factor is recorded as NaN when fewer than three valid values exist or
    when the remaining values have zero spread (the ratio is undefined); NaN
    factors are ignored by the percentile, mean and stdev summaries.

    NOTE(review): this notebook defines a function with this name in more than
    one cell; whichever cell was executed last is the definition in effect.

    Parameters
    ----------
    max_file : str or path-like
        Input CSV containing the columns ``ID``, ``year`` and the nine
        duration columns listed in ``subset_columns``.
    freq_percentile_file : str or path-like
        Output CSV. One row per requested percentile followed by a ``mean``
        and a ``stdev`` row; the first column is ``km_percentile``.
    start_year, end_year : int, optional
        Inclusive range of ``year`` values to keep. The defaults (2001-2022)
        are the range the original code labelled "station"; the original also
        carried a commented-out 2000-2024 variant labelled "IMERG".

    Returns
    -------
    None
        The result is written to ``freq_percentile_file``.
    """
    subset_columns = ['30-min', '1-hour', '2-hour', '3-hour', '6-hour', '12-hour', '24-hour', '48-hour', '72-hour']
    percentiles = [0, 2.5, 25, 50, 75, 97.5, 99, 99.5, 99.8, 100]

    # Step 1: frequency factor per ID and duration
    data = pd.read_csv(max_file, float_precision='round_trip')
    in_range = (data['year'] >= start_year) & (data['year'] <= end_year)
    data_filtered = data[in_range]

    def _frequency_factor(values):
        """Return (max - mean_rest) / std_rest for one column of one ID."""
        ordered = np.sort(values.dropna().to_numpy(dtype=float))
        if ordered.size < 3:
            # Need the maximum plus at least two other values for ddof=1.
            return np.nan
        remaining = ordered[:-1]  # everything except the largest value
        spread = np.std(remaining, ddof=1)
        if spread == 0:
            # Undefined ratio; avoid writing +/-inf into the summary.
            return np.nan
        return (ordered[-1] - np.mean(remaining)) / spread

    records = [
        {'ID': name, **{column: _frequency_factor(group[column]) for column in subset_columns}}
        for name, group in data_filtered.groupby('ID')
    ]
    # Explicit columns keep the frame well-formed even when no rows survive the filter.
    freqfact_data = pd.DataFrame(records, columns=['ID'] + subset_columns)
    freqfact_values = freqfact_data[subset_columns].astype(float)

    # Step 2: percentiles of the factors across IDs
    percentile_data = pd.DataFrame({'km_percentile': percentiles})
    for column in subset_columns:
        valid = freqfact_values[column].dropna().to_numpy(dtype=float)
        percentile_data[column] = np.percentile(valid, percentiles) if valid.size else np.nan

    # Step 3: mean and sample standard deviation of the factors across IDs
    summary_rows = pd.DataFrame({
        'km_percentile': ['mean', 'stdev'],
        **{
            column: [freqfact_values[column].mean(), freqfact_values[column].std(ddof=1)]
            for column in subset_columns
        }
    })
    percentile_data = pd.concat([percentile_data, summary_rows], ignore_index=True)

    # Step 4: write the output CSV file
    percentile_data.to_csv(freq_percentile_file, index=False)

    print("Frequency factor percentile computation complete")

# Annual-maximum run. The active configuration is the IMERG series.
# Station alternative (swap the two file names below to use it):
#   "annual_max_stn.csv" -> "km_percentile_annual_max_stn.csv"
# NOTE(review): the function's active year filter is 2001-2022, which its own
# comment labels as the "station" range - confirm this is intended for IMERG.
input_file, output_file = "annual_max.csv", "km_percentile_annual_max.csv"

compute_FreqFact_percentile(max_file=input_file, freq_percentile_file=output_file)
print("Done")


Frequency factor percentile computation complete
Done


##### **KM STATS (All-Time Max and Partial-Duration Max)**

In [5]:
# Function to calculate frequency factor percentiles from alltime maximums
def compute_FreqFact_percentile(max_file, freq_percentile_file, top_n=22):
    """Summarise per-ID frequency factors computed from each ID's largest values.

    For each unique ``ID`` and each duration column, the ``top_n`` largest
    non-missing values are selected and the frequency factor is

        (largest value - mean of the remaining values) / std of the remaining values

    where "remaining" means the selected values without the single largest
    one, and the standard deviation uses ``ddof=1``. The factors of all IDs are
    then summarised per duration and written to ``freq_percentile_file``.

    Missing values are removed *before* selecting and trimming. NumPy sorts
    NaN to the end of the array, so trimming the last sorted element without
    dropping NaN first would discard the NaN and leave the maximum inside the
    "trimmed" statistics whenever an ID has fewer than ``top_n`` valid values.

    A factor is recorded as NaN when fewer than three valid values exist or
    when the remaining values have zero spread (the ratio is undefined); NaN
    factors are ignored by the percentile, mean and stdev summaries.

    NOTE(review): this notebook defines a function with this name in more than
    one cell; whichever cell was executed last is the definition in effect.

    Parameters
    ----------
    max_file : str or path-like
        Input CSV containing the column ``ID`` and the nine duration columns
        listed in ``subset_columns``.
    freq_percentile_file : str or path-like
        Output CSV. One row per requested percentile followed by a ``mean``
        and a ``stdev`` row; the first column is ``km_percentile``.
    top_n : int, optional
        Number of largest values per ID and duration to use (default 22, the
        value previously hard-coded here).

    Returns
    -------
    None
        The result is written to ``freq_percentile_file``.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 3 (the maximum plus two other values are
        the minimum needed for a sample standard deviation).
    """
    if top_n < 3:
        raise ValueError("top_n must be at least 3")

    subset_columns = ['30-min', '1-hour', '2-hour', '3-hour', '6-hour', '12-hour', '24-hour', '48-hour', '72-hour']
    percentiles = [0, 2.5, 25, 50, 75, 97.5, 99, 99.5, 99.8, 100]

    # Step 1: frequency factor per ID and duration
    data = pd.read_csv(max_file, float_precision='round_trip')

    def _frequency_factor(values):
        """Return (max - mean_rest) / std_rest over the top_n largest values."""
        # Ascending sort, so the last top_n entries are the largest ones.
        ordered = np.sort(values.dropna().to_numpy(dtype=float))[-top_n:]
        if ordered.size < 3:
            return np.nan
        remaining = ordered[:-1]  # selected values except the largest
        spread = np.std(remaining, ddof=1)
        if spread == 0:
            # Undefined ratio; avoid writing +/-inf into the summary.
            return np.nan
        return (ordered[-1] - np.mean(remaining)) / spread

    records = [
        {'ID': name, **{column: _frequency_factor(group[column]) for column in subset_columns}}
        for name, group in data.groupby('ID')
    ]
    # Explicit columns keep the frame well-formed even when the input has no rows.
    freqfact_data = pd.DataFrame(records, columns=['ID'] + subset_columns)
    freqfact_values = freqfact_data[subset_columns].astype(float)

    # Step 2: percentiles of the factors across IDs
    percentile_data = pd.DataFrame({'km_percentile': percentiles})
    for column in subset_columns:
        valid = freqfact_values[column].dropna().to_numpy(dtype=float)
        percentile_data[column] = np.percentile(valid, percentiles) if valid.size else np.nan

    # Step 3: mean and sample standard deviation of the factors across IDs
    summary_rows = pd.DataFrame({
        'km_percentile': ['mean', 'stdev'],
        **{
            column: [freqfact_values[column].mean(), freqfact_values[column].std(ddof=1)]
            for column in subset_columns
        }
    })
    percentile_data = pd.concat([percentile_data, summary_rows], ignore_index=True)

    # Step 4: write the output CSV file
    percentile_data.to_csv(freq_percentile_file, index=False)

    print("Frequency factor percentile computation complete")


# All-time run. The active configuration is the partial-duration maxima.
# All-time maxima alternative (swap the two file names below to use it):
#   "alltime_max.csv" -> "km_percentile_alltime_max.csv"
input_file, output_file = (
    "alltime_partial_duration_max.csv",
    "km_percentile_alltime_partial_duration_max.csv",
)

compute_FreqFact_percentile(max_file=input_file, freq_percentile_file=output_file)
print("Done")


Frequency factor percentile computation complete
Done


##### **Extras**

In [None]:
# Function to calculate frequency factor percentiles from alltime maximums
def compute_FreqFact_percentile(max_file, freq_percentile_file):
    """Write percentile, mean and stdev summaries of per-ID frequency factors.

    For each unique ``ID`` and each duration column the frequency factor is

        (largest value - mean of the remaining values) / std of the remaining values

    where "remaining" means every non-missing value except the single largest
    one, and the standard deviation uses ``ddof=1``. All rows of the input are
    used (no year filter). The factors of all IDs are then summarised per
    duration and written to ``freq_percentile_file``.

    Missing values are removed *before* the largest value is trimmed. NumPy
    sorts NaN to the end of the array, so trimming the last sorted element
    without dropping NaN first would discard the NaN and leave the maximum
    inside the "trimmed" statistics.

    A factor is recorded as NaN when fewer than three valid values exist or
    when the remaining values have zero spread (the ratio is undefined); NaN
    factors are ignored by the percentile, mean and stdev summaries.

    NOTE(review): this notebook defines a function with this name in more than
    one cell; whichever cell was executed last is the definition in effect.

    Parameters
    ----------
    max_file : str or path-like
        Input CSV containing the column ``ID`` and the nine duration columns
        listed in ``subset_columns``.
    freq_percentile_file : str or path-like
        Output CSV. One row per requested percentile followed by a ``mean``
        and a ``stdev`` row; the first column is ``km_percentile``.

    Returns
    -------
    None
        The result is written to ``freq_percentile_file``.
    """
    subset_columns = ['30-min', '1-hour', '2-hour', '3-hour', '6-hour', '12-hour', '24-hour', '48-hour', '72-hour']
    percentiles = [0, 2.5, 25, 50, 75, 97.5, 99, 99.5, 99.8, 100]

    # Step 1: frequency factor per ID and duration
    data = pd.read_csv(max_file, float_precision='round_trip')

    def _frequency_factor(values):
        """Return (max - mean_rest) / std_rest for one column of one ID."""
        ordered = np.sort(values.dropna().to_numpy(dtype=float))
        if ordered.size < 3:
            # Need the maximum plus at least two other values for ddof=1.
            return np.nan
        remaining = ordered[:-1]  # everything except the largest value
        spread = np.std(remaining, ddof=1)
        if spread == 0:
            # Undefined ratio; avoid writing +/-inf into the summary.
            return np.nan
        return (ordered[-1] - np.mean(remaining)) / spread

    records = [
        {'ID': name, **{column: _frequency_factor(group[column]) for column in subset_columns}}
        for name, group in data.groupby('ID')
    ]
    # Explicit columns keep the frame well-formed even when the input has no rows.
    freqfact_data = pd.DataFrame(records, columns=['ID'] + subset_columns)
    freqfact_values = freqfact_data[subset_columns].astype(float)

    # Step 2: percentiles of the factors across IDs
    percentile_data = pd.DataFrame({'km_percentile': percentiles})
    for column in subset_columns:
        valid = freqfact_values[column].dropna().to_numpy(dtype=float)
        percentile_data[column] = np.percentile(valid, percentiles) if valid.size else np.nan

    # Step 3: mean and sample standard deviation of the factors across IDs
    summary_rows = pd.DataFrame({
        'km_percentile': ['mean', 'stdev'],
        **{
            column: [freqfact_values[column].mean(), freqfact_values[column].std(ddof=1)]
            for column in subset_columns
        }
    })
    percentile_data = pd.concat([percentile_data, summary_rows], ignore_index=True)

    # Step 4: write the output CSV file
    percentile_data.to_csv(freq_percentile_file, index=False)

    print("Frequency factor percentile computation complete")

# All-time maxima run: read "alltime_max.csv", write the percentile summary.
input_file, output_file = "alltime_max.csv", "km_percentile_alltime_max.csv"

compute_FreqFact_percentile(max_file=input_file, freq_percentile_file=output_file)
print("Done")
