In [None]:
"""
Code Description:
This script performs the following tasks to consolidate and manage IM (Intensity Measure) data 
from existing and new realizations:

1. **Data Consolidation**:
   - Combines all IM values from existing and new realizations into a newly generated combined directory.
   - Identifies common stations and IM types for each source (fault) across old and new realizations.
   - Filters out uncommon stations and IM types to ensure consistency.

2. **Median Calculation**:
   - Computes the median IM values for each station and IM type from all realizations.
   - Saves the computed median values as a CSV file.

3. **Source File Management**:
   - Copies all "Source" files from old and new realizations into the "Source" directory in the combined folder.
   - Copies `nzvm.cfg` and `vm_params.yaml` files from the old realizations as-is.
   - Copies these files from the new realizations with a "_new" suffix added to the filenames for differentiation.

4. **Summary Reports**:
   - Generates summary reports as CSV files (`summary.csv` and `station_im_summary.csv`) that detail the number of realizations, stations, and IMs processed.
   - Adds a note file providing context and metadata about the combined data.

Author: Morteza
Version History:
- Version 1.0: August 18, 2024
- Version 1.1: December 18, 2024
"""

# Import the dependencies

In [5]:
import os
import shutil
from pathlib import Path
from tqdm import tqdm

import pandas as pd
from datetime import datetime
import xarray as xr

# Path and directory setup

In [2]:
# The data root is made of the leading components of the current working
# directory (for a cwd such as /a/b/c/d/e this yields "/a/b/c").
file_path = os.getcwd()
root_path = "/".join(file_path.split("/")[:4])

# Names of the realization folders to merge. Replace the literals with
# input(...) calls if the folders should be chosen interactively.
Existing_RL_Name = "v20p6"
New_RL_Name = "v24p8"

# Echo the selection so it is recorded in the cell output
print(f"User input source folder name for Existing Realizations is: {Existing_RL_Name}")
print(f"User input source folder name for New Realizations is: {New_RL_Name}")

# All three folders live side by side under <root>/Cybershake_Data
cybershake_root = os.path.join(root_path, "Cybershake_Data")
old_cs_files_path = os.path.join(cybershake_root, Existing_RL_Name)
new_cs_files_path = os.path.join(cybershake_root, New_RL_Name)
combined_cs_files_path = os.path.join(
    cybershake_root, f"combined_{Existing_RL_Name}_{New_RL_Name}"
)

# Prepare an empty output folder; an existing one is only wiped on request
if not Path(combined_cs_files_path).exists():
    os.makedirs(combined_cs_files_path, exist_ok=False)
    print(f"Created the path: \n{combined_cs_files_path}")
else:
    prompt = input(
        f"The following path already exists! Do you want to delete and renew (1) or terminate (2)? \n{combined_cs_files_path}\n"
    )
    if prompt == "1":
        # Start from scratch: drop the old tree, then recreate the folder
        shutil.rmtree(combined_cs_files_path)
        print(f"Deleted and renewed the path: \n{combined_cs_files_path}")
        os.makedirs(combined_cs_files_path, exist_ok=False)
    else:
        if prompt == "2":
            print("Terminating the process.")
        else:
            print("Invalid input. Terminating the process.")
        # Keep this as the last statement on this path: nothing may run after it
        exit()

User input source folder name for Existing Realizations is: v20p6
User input source folder name for New Realizations is: v24p8
Deleted and renewed the path: 
/mnt/hypo_data/mab419/Cybershake_Data/combined_v20p6_v24p8


# Merge Files

In [3]:
def strip_and_convert(value):
    """
    Convert a raw CSV cell to a float, tolerating quotes and whitespace.

    Leading/trailing whitespace and single quotes are removed, in any
    combination, before the conversion is attempted, so both ``"'1.5'"`` and
    ``" '1.5' "`` yield ``1.5``.

    Parameters
    ----------
    value : object
        Raw cell value; it is passed through ``str`` before cleaning.

    Returns
    -------
    float or object
        The parsed float, or ``value`` itself (untouched) when the cleaned
        text is not a valid float literal.
    """
    # Strip quotes *and* whitespace together: stripping only "'" leaves a cell
    # such as " '1.5' " unparseable because the outer characters are spaces.
    cleaned = str(value).strip("' \t\r\n")
    try:
        return float(cleaned)
    except ValueError:
        return value  # Return as-is if it can't be converted


# Accumulators filled while merging: one record per copied realization file
# (summary_data) and one record per processed fault (station_im_data).
summary_data, station_im_data = [], []

# Periods of the pSA columns, kept as text so the column names are reproduced
# exactly (e.g. "1.0" and "10.0" keep their trailing zero).
_PSA_PERIODS = (
    "0.01", "0.02", "0.03", "0.04", "0.05", "0.075",
    "0.1", "0.12", "0.15", "0.17", "0.2", "0.25",
    "0.3", "0.4", "0.5", "0.6", "0.7", "0.75", "0.8", "0.9",
    "1.0", "1.25", "1.5", "2.0", "2.5", "3.0", "4.0", "5.0", "6.0", "7.5", "10.0",
)

# Reference column order used when sorting IM names: scalar IMs first, then
# the pSA columns in the order of the period table above.
im_true_order = ["PGA", "PGV", "CAV", "AI", "Ds575", "Ds595", "MMI"] + [
    f"pSA_{period}" for period in _PSA_PERIODS
]


# Fault (source) names are the sub-directory names of each realization folder.
# - The old list must be read from old_cs_files_path (an empty Path() would
#   list the current working directory instead).
# - `.name` is used rather than `.stem` so a directory name containing a dot is
#   not truncated.
# - The lists are sorted because iterdir() yields entries in arbitrary order;
#   sorting makes the comparison below and the processing order deterministic.
faults_old = sorted(
    cur_dir.name for cur_dir in Path(old_cs_files_path).iterdir() if cur_dir.is_dir()
)
faults_new = sorted(
    cur_dir.name for cur_dir in Path(new_cs_files_path).iterdir() if cur_dir.is_dir()
)

# Check if both fault lists are the same
if faults_old == faults_new:
    print("Both realization directories share the same faults!")
else:
    print("!!!! Caution !!!! >> Faults in the introduced directories are not the same!")
    # Show what differs so the user can make an informed choice
    print(f"Faults only in {Existing_RL_Name}: {sorted(set(faults_old) - set(faults_new))}")
    print(f"Faults only in {New_RL_Name}: {sorted(set(faults_new) - set(faults_old))}")
    prompt = input("Continue (1) or Terminate (2)? ")
    # SystemExit is raised explicitly: inside a Jupyter kernel the exit()
    # helper only schedules a shutdown and returns, so any code following this
    # check in the same cell would still be executed.
    if prompt == "2":
        print("Terminating the process.")
        raise SystemExit
    elif prompt != "1":
        print("Invalid input. Terminating the process.")
        raise SystemExit


def _normalize_im_name(name):
    """
    Return an IM column name with the pSA period written with a dot.

    For names starting with "pSA_", the first "p" found after the fifth
    character is replaced by "." (e.g. "pSA_0p5" -> "pSA_0.5"). Any other
    name, including one already written with a dot, is returned unchanged.
    """
    if name.startswith("pSA_"):
        return name[:5] + name[5:].replace("p", ".", 1)
    return name


def _stations_and_ims(im_file):
    """
    Read one IM csv and describe its content.

    Returns a tuple ``(stations, ims)`` where ``stations`` is the set of
    station values on rows whose component is "geom" and ``ims`` is the set of
    raw column names found after the first two columns.
    """
    im_df = pd.read_csv(im_file)
    stations = set(im_df.loc[im_df["component"] == "geom", "station"])
    return stations, set(im_df.columns[2:])


def _load_filtered_ims(im_file, stations, im_names):
    """
    Load one realization file restricted to the given stations and IMs.

    Only "geom" rows whose station is in ``stations`` are kept; column names
    are normalized with ``_normalize_im_name`` and the result contains
    "station", "component" and then ``im_names`` in that order. Selecting an
    IM that the file does not contain raises ``KeyError``.
    """
    # Size the converter mapping from the file's own header so that every
    # column after station/component goes through strip_and_convert, whatever
    # the number of IM columns in this particular file.
    n_columns = len(pd.read_csv(im_file, nrows=0).columns)
    im_df = pd.read_csv(
        im_file,
        converters={col: strip_and_convert for col in range(2, n_columns)},
    )
    keep = im_df["station"].isin(stations) & (im_df["component"] == "geom")
    geom_df = im_df[keep].rename(columns=_normalize_im_name)
    return geom_df[["station", "component"] + im_names]


# Position of each known IM in the reference order; IMs missing from
# im_true_order are placed after the known ones (alphabetically) rather than
# aborting the run with a ValueError.
_IM_RANK = {im: rank for rank, im in enumerate(im_true_order)}

for cur_fault in tqdm(faults_old):

    ########################### working on IM files ###########################
    cur_old_im_files = list(
        (Path(old_cs_files_path) / cur_fault / "IM").rglob("*REL*.csv")
    )
    cur_new_im_files = list(
        (Path(new_cs_files_path) / cur_fault / "IM").rglob("*REL*.csv")
    )

    if not cur_old_im_files or not cur_new_im_files:
        print(f"No matching files found for fault: {cur_fault}")
        continue

    # The first file of each set is taken as representative of that set's
    # stations and IM columns.
    old_stations, old_ims = _stations_and_ims(cur_old_im_files[0])
    new_stations, new_ims = _stations_and_ims(cur_new_im_files[0])

    corrected_ims_old = {_normalize_im_name(im) for im in old_ims}
    # Built from new_ims: deriving this set from old_ims would make the
    # intersection below always equal to the old IMs.
    corrected_ims_new = {_normalize_im_name(im) for im in new_ims}

    # Find common stations and IMs
    common_stations = old_stations & new_stations
    common_ims = corrected_ims_old & corrected_ims_new
    common_ims_list = sorted(
        common_ims, key=lambda im: (_IM_RANK.get(im, len(_IM_RANK)), im)
    )

    # Old and new realizations go through exactly the same steps; only the
    # label recorded in the summary differs.
    pure_im_data_list = []
    for rl_name, im_files in (
        (Existing_RL_Name, cur_old_im_files),
        (New_RL_Name, cur_new_im_files),
    ):
        for cur_im_file in im_files:
            filtered_im_df = _load_filtered_ims(
                cur_im_file, common_stations, common_ims_list
            )

            # Convert data to xarray format (station x IM), labelled so that
            # files are aligned by station name when concatenated.
            temp_df = filtered_im_df.set_index("station").drop("component", axis=1)
            pure_im_data_list.append(
                xr.DataArray(
                    temp_df.values,
                    dims=("station", "IM"),
                    coords={
                        "station": temp_df.index,
                        "IM": temp_df.columns,
                    },
                )
            )

            # NOTE(review): the target name is only the source file name, so an
            # old and a new realization file sharing a name (or two files in
            # different sub-folders found by rglob) overwrite each other here
            # while both are still counted in the summary -- confirm that the
            # file names are unique across both sets.
            target_im_file = (
                Path(combined_cs_files_path) / cur_fault / "IM" / cur_im_file.name
            )
            target_im_file.parent.mkdir(parents=True, exist_ok=True)

            # Save the filtered data to the target file
            filtered_im_df.to_csv(target_im_file, index=False)

            summary_data.append(
                {
                    "Source": cur_fault,
                    "RL": cur_im_file.stem,
                    "RL_Name": rl_name,
                }
            )

    # Stack all realizations along a new "file_id" dimension
    combined_da = xr.concat(pure_im_data_list, dim="file_id")

    # Create Dataset with labeled dimensions. Stations are given in sorted
    # order so the row order of the median file does not depend on set
    # iteration order, which can change from run to run.
    im_data_ds = xr.Dataset(
        {"IM_values": combined_da},
        coords={
            "station": sorted(common_stations),
            "IM": common_ims_list,
            "file_id": range(len(pure_im_data_list)),
        },
    )

    # Median across realizations, computed on float data
    median_2d_array = im_data_ds["IM_values"].astype(float).median(dim="file_id")

    # Convert to DataFrame and prepare the output format
    median_df = pd.DataFrame(
        median_2d_array.values,
        index=im_data_ds["station"].values,
        columns=im_data_ds["IM"].values,
    )
    median_df.insert(0, "station", median_df.index)  # 'station' as the first column
    median_df.insert(1, "component", "geom")  # constant 'component' column

    # Write the median DataFrame next to the realization files
    median_im_file = (
        Path(combined_cs_files_path) / cur_fault / "IM" / f"{cur_fault}.csv"
    )
    median_df.to_csv(median_im_file, index=False)

    station_im_data.append(
        {
            "Source": cur_fault,
            "N_Old_stations": len(old_stations),
            "N_New_Stations": len(new_stations),
            "N_Comb_Stations": len(common_stations),
            "N_Old_IMs": len(old_ims),
            "N_New_IMs": len(new_ims),
            "N_Comb_IMs": len(common_ims),
        }
    )

    ########################### working on Source files ###########################
    cur_old_source_dir = Path(old_cs_files_path) / cur_fault / "Source"
    cur_comb_source_dir = Path(combined_cs_files_path) / cur_fault / "Source"

    # The whole old Source folder is copied as-is
    shutil.copytree(cur_old_source_dir, cur_comb_source_dir, dirs_exist_ok=True)

    # From the new Source folder: .csv/.info files keep their name (replacing a
    # same-named file copied from the old folder), while .cfg/.yaml files get a
    # "_new" suffix so both versions are kept.
    cur_new_source_dir = Path(new_cs_files_path) / cur_fault / "Source"
    for file in cur_new_source_dir.glob("*.*"):
        if file.suffix in [".csv", ".info"]:
            shutil.copy2(file, cur_comb_source_dir)
        elif file.suffix in [".cfg", ".yaml"]:
            new_file_name = f"{file.stem}_new{file.suffix}"
            shutil.copy2(file, cur_comb_source_dir / new_file_name)

Both realization directories share the same faults!


100%|██████████| 478/478 [1:53:00<00:00, 14.19s/it]  


# Create Report

In [4]:
# --- Realization counts -------------------------------------------------
# One row per source and one column per realization set, followed by a
# "Total" column; an "All" row holding the column sums is placed on top.
summary_df = pd.DataFrame(summary_data).sort_values(by="Source")
final_summary = (
    summary_df.groupby(["Source", "RL_Name"])
    .size()
    .unstack(fill_value=0)
    .assign(Total=lambda counts: counts.sum(axis=1))
)
overall_totals = final_summary.sum().rename("All")
final_summary = pd.concat([overall_totals.to_frame().T, final_summary])

summary_file_path = os.path.join(combined_cs_files_path, "summary.csv")
final_summary.to_csv(summary_file_path)
print(f"Summary data saved to {summary_file_path}")

# --- Station / IM counts per source -------------------------------------
station_im_summary_df = pd.DataFrame(station_im_data).sort_values(by="Source")
station_im_summary_file_path = os.path.join(combined_cs_files_path, "station_im_summary.csv")
station_im_summary_df.to_csv(station_im_summary_file_path)
print(f"Stations and IMs data saved to {station_im_summary_file_path}")

# --- Free-text note describing the combined folder ----------------------
Current_Date_and_Time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
note_file_path = os.path.join(combined_cs_files_path, 'note.txt')

note_content = "".join(
    [
        f"- This folder contains combined IM (Intensity Measure) data from {Existing_RL_Name} and {New_RL_Name} ",
        f"realizations of the Cybershake project.\n",
        f"- Date of generation: {Current_Date_and_Time}\n",
        f"- Key Notes:\n",
        f"    1. Only IM values common to both old and new realizations have been combined.\n",
        f"    2. For each fault (source), only the common stations and IM types between the realizations have been retained.\n",
        f"    3. Median IM values for each station and IM type have been computed and saved as a separate CSV file.\n",
        f"    4. Source files have been copied:\n",
        f"       - 'nzvm.cfg' and 'vm_params.yaml' from the old realizations are included as-is.\n",
        f"       - Corresponding files from the new realizations have been copied with a '_new' suffix.\n",
    ]
)

# Write the note in one go (the file is created or overwritten)
Path(note_file_path).write_text(note_content)

print(f"Note written to {note_file_path}")

Summary data saved to /mnt/hypo_data/mab419/Cybershake_Data/combined_v20p6_v24p8/summary.csv
Stations and IMs data saved to /mnt/hypo_data/mab419/Cybershake_Data/combined_v20p6_v24p8/station_im_summary.csv
Note written to /mnt/hypo_data/mab419/Cybershake_Data/combined_v20p6_v24p8/note.txt
