In [None]:
#import necessary libraries
from pathlib import Path
import pandas as pd
import networkx as nx
import numpy as np
import geopandas as gpd
from matplotlib.colors import LinearSegmentedColormap
import scipy.stats as stats

# Converting Node Observations into Network Observations

In [None]:
PATH_TO_DATA = Path("..") / "Data" / "SurveyData.xlsx"

# Load the raw (unprocessed) tables of every tributary from the Excel workbook.
# Each tributary contributes three sheets: "<trib>_Survey_Data" (surveyed points),
# "<trib>_Ghost_Nodes" (confluence nodes) and "<trib>_Edges" (river stretches).
# The edge table is cast to str in full, so node IDs and lengths are text here.
raw_network_data = {
    trib_name: {
        'nodes': pd.read_excel(PATH_TO_DATA, sheet_name=f"{trib_name}_Survey_Data", header=0),
        'confluences': pd.read_excel(PATH_TO_DATA, sheet_name=f"{trib_name}_Ghost_Nodes"),
        'stretches': pd.read_excel(PATH_TO_DATA, sheet_name=f"{trib_name}_Edges").astype(str),
    }
    for trib_name in ('TribA', 'TribB', 'TribC')
}


In [None]:
# Define how to convert survey values to boolean
def convert_to_bool_nodes(df):
    """Return a copy of ``df`` whose survey columns hold booleans.

    Survey columns are those whose name contains "202". In these columns
    "Yes" becomes True and "No" becomes False; any other value becomes NaN.
    All other columns, and the input frame itself, are left untouched.
    """
    answer_to_flag = {"Yes": True, "No": False}
    converted = df.copy()
    for column in df.columns:
        if "202" in column:  # survey-date column
            converted[column] = df[column].map(answer_to_flag)
    return converted

# Mirror raw_network_data, replacing only the 'nodes' table of each tributary
# with its boolean-converted version; the other tables are shared as-is.
bool_network_data = {}

for trib, data in raw_network_data.items():
    bool_network_data[trib] = {
        **data,
        'nodes': convert_to_bool_nodes(data['nodes']),
    }

In [None]:
# Plot colour associated with each stretch status.
_STRETCH_COLORS = {
    'Active': 'blue',
    'Not Active': 'red',
    'Disconnected': 'orange',
    'Not enough information': 'grey',
}


def _classify_stretch(up_activity, down_activity):
    """Return the status of a stretch from the activity of its two end nodes."""
    if up_activity is True and down_activity is True:
        return 'Active'
    if up_activity is False and down_activity is False:
        return 'Not Active'
    if (up_activity is True and down_activity is False) or \
            (up_activity is False and down_activity is True):
        return 'Disconnected'
    if "disconnected" in (up_activity, down_activity):
        return 'Disconnected'
    return 'Not enough information'


def calculate_network(survey_date, nodes_data_bool, confluence_nodes, stretches, trib_key):
    """Build the directed stream network for one survey date and classify it.

    Surveyed points and confluence nodes become graph nodes; every row of
    ``stretches`` whose two end nodes exist becomes a directed edge
    "Node Start" -> "Node End". Nodes without an observation take the state
    found at the nearest observed nodes upstream and downstream when both
    directions agree, and are marked ``"disconnected"`` when they disagree.

    Parameters
    ----------
    survey_date : column label of ``nodes_data_bool`` holding the observations.
    nodes_data_bool : DataFrame with "Point ID", "Longitude", "Latitude" and
        the survey column (True / False / missing).
    confluence_nodes : DataFrame with "Node", "Longitude", "Latitude".
    stretches : DataFrame with "Node Start" and "Node End".
    trib_key : tributary name; for 'TribB' the node "g6" is forced to True.

    Returns
    -------
    networkx.DiGraph
        Nodes carry ``pos`` and ``node_activity`` (True, False, "disconnected"
        or None); edges carry ``color`` and ``stretch_activity``.
    """
    def as_activity(value):
        # Only genuine booleans count as observations. NaN (which `where` below
        # cannot turn into None in float columns) and anything else is "unknown".
        if isinstance(value, (bool, np.bool_)):
            return bool(value)
        return None

    # Replace missing values by None where the column dtype allows it
    nodes_data_bool = nodes_data_bool.where(nodes_data_bool.notnull(), None)

    # Create directed network
    G = nx.DiGraph()

    # Surveyed points, with the observation of this survey date
    for _, data in nodes_data_bool.iterrows():
        node_ID = str(data["Point ID"])
        G.add_node(node_ID, pos=(data['Longitude'], data['Latitude']),
                   node_activity=as_activity(data[survey_date]))

    # Confluence nodes start without an observation. IDs are cast to str like the
    # survey IDs above so that they match the (string) edge endpoints.
    for _, row in confluence_nodes.iterrows():
        G.add_node(str(row['Node']), pos=(row['Longitude'], row['Latitude']), node_activity=None)

    # Explicitly set activity for a necessary node (e.g., those that correspond to Little Calumet River)
    if trib_key == 'TribB':
        g6_node = "g6"
        if g6_node in G.nodes:
            G.nodes[g6_node]["node_activity"] = True

    # Add network edges // river stretches; rows naming an unknown node are skipped
    for _, row in stretches.iterrows():
        node_start, node_end = str(row["Node Start"]), str(row["Node End"])
        if node_start in G.nodes and node_end in G.nodes:
            G.add_edge(node_start, node_end)

    def find_upstream(node_ID):
        # A node with a known state answers for itself; otherwise OR together
        # the answers of all upstream branches (None when none of them knows).
        if G.nodes[node_ID].get("node_activity", None) is not None:
            return G.nodes[node_ID]["node_activity"]

        is_upstream_wet = None
        for up_index, _ in G.in_edges(node_ID):
            res = find_upstream(up_index)
            if res is not None:
                is_upstream_wet = is_upstream_wet or res
        return is_upstream_wet

    def find_downstream(node_ID):
        # Same search as find_upstream, following outgoing edges.
        if G.nodes[node_ID].get("node_activity", None) is not None:
            return G.nodes[node_ID]["node_activity"]

        is_downstream_wet = None
        for _, down_index in G.out_edges(node_ID):
            res = find_downstream(down_index)
            if res is not None:
                is_downstream_wet = is_downstream_wet or res
        return is_downstream_wet

    # First pass: a node takes the state on which both directions agree
    for node_ID in G.nodes:
        upstream = find_upstream(node_ID)
        downstream = find_downstream(node_ID)
        if upstream == downstream:
            G.nodes[node_ID]["node_activity"] = upstream

    # Second pass: still-unknown nodes whose two directions are both known but
    # differ are "disconnected"; all others stay None.
    for node_ID in G.nodes:
        if G.nodes[node_ID]["node_activity"] is None:
            upstream = find_upstream(node_ID)
            downstream = find_downstream(node_ID)
            if upstream is not None and downstream is not None and upstream != downstream:
                G.nodes[node_ID]["node_activity"] = "disconnected"

    # Classify every stretch once and derive both edge attributes from it
    for up_index, down_index in G.edges:
        status = _classify_stretch(
            G.nodes[up_index].get("node_activity"),
            G.nodes[down_index].get("node_activity"),
        )
        G.edges[up_index, down_index]["color"] = _STRETCH_COLORS[status]
        G.edges[up_index, down_index]["stretch_activity"] = status

    return G


In [None]:
def collect_stretch_activities(trib_key, survey_dates, bool_network_data):
    """Tabulate the status of every stretch of one tributary on every survey date.

    For each date the network is rebuilt with ``calculate_network`` and the
    ``stretch_activity`` of each edge is recorded. The result has one row per
    edge with the columns "Node Start", "Node End", "Stretch Length (km)"
    (left-merged from the tributary's stretch table) followed by one status
    column per survey date.
    """
    trib_tables = bool_network_data[trib_key]
    nodes_data_bool = trib_tables['nodes']
    confluence_nodes = trib_tables['confluences']
    stretches = trib_tables['stretches']

    # (start, end) -> {date label: status}
    status_by_edge = {}
    for date in survey_dates:
        network = calculate_network(date, nodes_data_bool, confluence_nodes, stretches, trib_key)
        for node_start, node_end, attrs in network.edges(data=True):
            date_to_status = status_by_edge.setdefault((node_start, node_end), {})
            date_to_status[f'{date}'] = attrs.get("stretch_activity", 'Not available')

    # One record per edge
    stretches_activity_df = pd.DataFrame([
        {'Node Start': node_start, 'Node End': node_end, **date_to_status}
        for (node_start, node_end), date_to_status in status_by_edge.items()
    ])

    # Attach the stretch lengths
    leading_columns = ['Node Start', 'Node End', 'Stretch Length (km)']
    stretches_data = stretches_activity_df.merge(
        stretches[leading_columns],
        on=['Node Start', 'Node End'],
        how='left',
    )

    # Length goes right after the node columns, the date columns follow
    date_columns = [col for col in stretches_data.columns if col not in leading_columns]
    return stretches_data[leading_columns + date_columns]


In [None]:
# Per-tributary stretch-activity tables, keyed by tributary name
all_stretch_results = {}

# Loop over each tributary in the bool_network_data
for trib in bool_network_data:
    # Survey-date columns are recognised by the substring "202" in their name
    survey_dates = [col for col in bool_network_data[trib]['nodes'].columns if "202" in col]
    
    # Table of stretch status per survey date (plus stretch length)
    stretch_df = collect_stretch_activities(trib, survey_dates, bool_network_data)
    
    # Store result
    all_stretch_results[trib] = stretch_df

    print(f"Processed stretch activity for {trib}")

# NOTE: once the loop finishes, `trib`, `survey_dates` and `stretch_df` remain
# bound to the values of the last tributary processed.


# Computing Persistency Index (P_i)

In [None]:
#Calculate persistency index (P_i) for each stretch
def persistence_index(df):
    """Return a copy of ``df`` with the Persistency Index (P_i) of each stretch.

    Every column whose name contains "202" is read as a survey-date status
    column and scored: 'Active' = 1, 'Disconnected' = 0.5, 'Not Active' = 0.
    Any other value ('Not enough information', 'Not available', missing, ...)
    is treated as "no observation". P_i is the mean score over the dates with
    an observation, and NaN when a stretch has none.

    The input frame is not modified. In the returned frame
    'Stretch Length (km)' is numeric (unparseable values become NaN) and
    'Persistency Index (P_i)' sits immediately after it.
    """
    status_to_score = {'Active': 1.0, 'Not Active': 0.0, 'Disconnected': 0.5}
    activity_columns = [col for col in df.columns if '202' in col]

    # Work on a copy so the caller's frame is left untouched; drop a P_i column
    # left by a previous call so the function can be re-applied to its own output.
    result = df.copy().drop(columns=['Persistency Index (P_i)'], errors='ignore')
    result['Stretch Length (km)'] = pd.to_numeric(result['Stretch Length (km)'], errors='coerce')

    # Series.map turns unlisted statuses into NaN instead of leaving strings behind
    scores = pd.DataFrame(
        {col: result[col].map(status_to_score) for col in activity_columns},
        index=result.index,
    ).astype(float)

    valid_observations = scores.notna().sum(axis=1)
    # sum() skips NaN; 0 / 0 yields NaN for stretches that were never observed
    p_i = scores.sum(axis=1) / valid_observations

    position = result.columns.get_loc('Stretch Length (km)') + 1
    result.insert(position, 'Persistency Index (P_i)', p_i)
    return result

In [None]:
# Replace each tributary's table by its version with the Persistency Index added
for trib in list(all_stretch_results):
    df = all_stretch_results[trib]
    all_stretch_results[trib] = persistence_index(df)
    print(f"Calculated Persistency Index for {trib}")


In [None]:
# Print, per tributary, summary figures derived from the Persistency Index
for trib, stretches_data in all_stretch_results.items():
    print(f"--- Results for {trib} ---")

    pi_values = stretches_data['Persistency Index (P_i)']
    stretch_lengths = stretches_data['Stretch Length (km)']
    always_active = pi_values == 1  # stretches with P_i exactly 1

    # Count stretches with Persistency Index = 1
    persistency_1_count = int(always_active.sum())
    print(f"Number of stretches with persistency = 1: {persistency_1_count}")

    # Total length of stretches with Persistency Index = 1
    total_length_persistency_1 = stretch_lengths[always_active].sum()
    print(f"Total length of stretches with persistency = 1: {total_length_persistency_1:.2f} km")

    # Total length of all stretches
    total_length = stretch_lengths.sum()
    print(f"Total length of all stretches: {total_length:.2f} km")

    # Share of the network length whose P_i is not 1
    percentage_ephemeral = ((total_length - total_length_persistency_1) / total_length) * 100
    print(f"Ephemeral percentage: {percentage_ephemeral:.2f}%")

    # Mean Persistency Index (P_i)
    mean_pi = pi_values.mean()
    print(f"Mean Persistency Index (P_i): {mean_pi:.3f}")

    # Mean Persistency Index (P_i) for stretches where P_i != 1
    mean_pi_not_1 = pi_values[pi_values != 1].mean()
    print(f"Mean Persistency Index (P_i) for stretches where P_i ≠ 1: {mean_pi_not_1:.3f}\n")


# Computing ADNL

In [None]:
def calculate_ADNL(df, survey_date):
    """Compute the active drainage network length (ADNL) for each survey date.

    For every date a directed graph is built from the rows of ``df``
    ("Node Start" -> "Node End"). Starting from each outlet (a node without
    outgoing edges) the network is walked upstream through 'Active' stretches
    only, and the lengths of the stretches crossed are summed. Each stretch is
    counted at most once, even if it can be reached from several outlets.

    Parameters
    ----------
    df : DataFrame with "Node Start", "Node End", "Stretch Length (km)" and one
        status column per entry of ``survey_date``.
    survey_date : iterable of column labels (survey dates) to process.

    Returns
    -------
    DataFrame with one row per date and the columns 'Survey Date',
    'Active Network Drainage Length', 'Percentage of Total Network' and
    'Percentage of Observed Network'. The two "Percentage" columns hold
    fractions (ADNL divided by a length), NaN when the denominator is not > 0.
    """
    observed_statuses = ['Active', 'Not Active', 'Disconnected']

    # Total network length comes from the frame passed in (not from a global)
    # and does not depend on the date.
    total_stretch_length = df['Stretch Length (km)'].sum()

    ADNL = {}
    ADNL_p = {}
    ADNL_p_o = {}

    for date in survey_date:
        # Directed graph of this date: stretch length and status stored on the edges
        G = nx.DiGraph()
        for _, row in df.iterrows():
            G.add_edge(row['Node Start'], row['Node End'],
                       length=row['Stretch Length (km)'],
                       activity=row[date])

        # Outlets: nodes with no outgoing edges
        downstream_nodes = [node for node in G.nodes if G.out_degree(node) == 0]

        # Iterative upstream walk (no recursion limit on long networks). The
        # visited set is shared by all outlets so no stretch is counted twice.
        active_length = 0
        visited = set()
        pending = list(downstream_nodes)
        while pending:
            node = pending.pop()
            if node in visited:
                continue
            visited.add(node)
            for upstream_node, _, attrs in G.in_edges(node, data=True):
                if attrs.get('activity', '') == 'Active':
                    active_length += attrs.get('length', 0)
                    pending.append(upstream_node)

        ADNL[date] = active_length

        # Length of the stretches that have an actual observation on this date
        valid_rows = df[date].isin(observed_statuses)
        total_observed_stretch_length = df.loc[valid_rows, 'Stretch Length (km)'].sum()

        ADNL_p[date] = ADNL[date] / total_stretch_length if total_stretch_length > 0 else np.nan
        ADNL_p_o[date] = (ADNL[date] / total_observed_stretch_length
                          if total_observed_stretch_length > 0 else np.nan)

    return pd.DataFrame({
        'Survey Date': list(ADNL.keys()),
        'Active Network Drainage Length': list(ADNL.values()),
        'Percentage of Total Network': [ADNL_p[date] for date in ADNL],
        'Percentage of Observed Network': [ADNL_p_o[date] for date in ADNL],
    })


In [None]:
# ADNL table of each tributary, keyed by tributary name
ADNL_results = {}

for trib, stretches_data in all_stretch_results.items():
    # Use this tributary's own survey-date columns (names containing "202")
    # rather than a `survey_dates` variable left over from an earlier loop,
    # which only holds the dates of the last tributary processed there.
    trib_survey_dates = [col for col in stretches_data.columns if "202" in col]

    # Store result keyed by tributary
    ADNL_results[trib] = calculate_ADNL(stretches_data, trib_survey_dates)

ADNL_combined = []

for trib, adnl_df in ADNL_results.items():
    # Keep the dates on which the observed-network fraction equals the
    # total-network fraction.
    # NOTE(review): when ADNL is 0 both fractions are 0, so such a date is kept
    # even if part of the network was unobserved — confirm this is acceptable.
    filtered_df = adnl_df[adnl_df['Percentage of Observed Network'] == adnl_df['Percentage of Total Network']].copy()

    # Add a tributary column to keep track of source
    filtered_df['Tributary'] = trib

    ADNL_combined.append(filtered_df)

# Concatenate all filtered ADNL dataframes
ADNL_df = pd.concat(ADNL_combined, ignore_index=True)
ADNL_df



In [None]:
# Per tributary, summarise 'Percentage of Total Network' over the retained
# survey dates: maximum, minimum, mean, median and standard deviation
summary_functions = ['max', 'min', 'mean', 'median', 'std']
drainage_length_stats = (
    ADNL_df
    .groupby('Tributary')['Percentage of Total Network']
    .agg(summary_functions)
)
drainage_length_stats

# Z-score analysis for each tributary

In [None]:
from scipy.stats import ttest_ind, mannwhitneyu
import pandas as pd

In [None]:
# Load the summary-statistics table from sheet "Sheet1" (first row = header).
# The path is relative to the notebook's working directory.
df = pd.read_excel("SummaryStatistics.xlsx", sheet_name="Sheet1", header=0)
# Column labels of the table, kept as a plain list
columns = list(df.columns)
df

In [None]:
# Identify metrics (every column except Catchment ID)
metrics = [col for col in df.columns if col != 'Catchment ID']

# Whole-watershed (LCRW) row, used as the baseline
lcrw = df[df['Catchment ID'] == 'LCRW'].iloc[0]

# Subset only tributaries; na=False excludes rows with a missing Catchment ID
# instead of failing on a mask that contains NaN
tribs = df[df['Catchment ID'].str.contains('Tributary', na=False)].copy()

# Mean and (sample) standard deviation of each metric across tributaries
trib_mean = tribs[metrics].mean()
trib_std = tribs[metrics].std()

# How different is each tributary from the overall watershed?
# z_LCRW = (tributary value - LCRW value) / std across tributaries
for metric in metrics:
    tribs[f'z_LCRW_{metric}'] = (tribs[metric] - lcrw[metric]) / trib_std[metric]

# How different is each tributary from the others?
# z_trib = (tributary value - mean across tributaries) / std across tributaries
for metric in metrics:
    tribs[f'z_trib_{metric}'] = (tribs[metric] - trib_mean[metric]) / trib_std[metric]

tribs

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import textwrap
from matplotlib.patches import Patch

# -------- Set up the metric labels --------
# Metrics: columns of df that are neither the catchment ID nor start with 'z';
# labels are wrapped at 14 characters for use as tick labels
metrics = [col for col in df.columns if not (col.startswith('z') or col == 'Catchment ID')]
wrapped_metrics = [textwrap.fill(m, width=14) for m in metrics]

# -------- Define color codes --------
color_codes = dict(
    both="#3d3e3c",    # dark gray
    lcrw="#B4B4B2",    # light gray
    tribs="#616162",   # medium gray
    none='white',      # no significance
)

# -------- Assign colors --------
def assign_color(row):
    """Return one colour per entry of ``metrics`` for a single tributary row.

    A metric is flagged against LCRW when |z_LCRW_<metric>| > 1 and against
    the other tributaries when |z_trib_<metric>| > 1; the pair of flags selects
    the 'both', 'lcrw', 'tribs' or 'none' entry of ``color_codes``.
    """
    # (differs from LCRW, differs from other tributaries) -> color_codes key
    flags_to_key = {
        (True, True): 'both',
        (True, False): 'lcrw',
        (False, True): 'tribs',
        (False, False): 'none',
    }
    return [
        color_codes[flags_to_key[(
            bool(abs(row[f'z_LCRW_{m}']) > 1),
            bool(abs(row[f'z_trib_{m}']) > 1),
        )]]
        for m in metrics
    ]

# Colour of every (tributary, metric) cell: rows = tributaries, columns = metrics
color_matrix = tribs.apply(assign_color, axis=1, result_type='expand')
color_matrix.columns = metrics
color_matrix.index = tribs['Catchment ID']

# -------- Plot --------
fig, ax = plt.subplots(figsize=(len(metrics)*1.2, len(color_matrix)*1.2))

# An all-white heatmap provides the grid lines and the tick labels
sns.heatmap(
    np.ones_like(color_matrix.values, dtype=float),
    cmap=["white"],
    cbar=False,
    linewidths=1,
    linecolor='gray',
    xticklabels=wrapped_metrics,
    yticklabels=color_matrix.index,
    ax=ax
)

# Paint each cell with its assigned colour
for y, row_colors in enumerate(color_matrix.values):
    for x, cell_color in enumerate(row_colors):
        rect = plt.Rectangle((x, y), 1, 1,
                             facecolor=cell_color,
                             edgecolor='gray')
        ax.add_patch(rect)

# Custom legend: one patch per colour code
legend_labels = [
    ('both', 'Different from both'),
    ('lcrw', 'Different from LCRW'),
    ('tribs', 'Different from other tributaries'),
    ('none', 'Not significantly different'),
]
legend_elements = [
    Patch(facecolor=color_codes[key], edgecolor='gray', label=label)
    for key, label in legend_labels
]
ax.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(1.18, 1), fontsize=9)

# Clean up ticks
ax.tick_params(axis='x', labelrotation=0, labelsize=10)
ax.tick_params(axis='y', labelsize=10)
plt.tight_layout()
plt.show()


# Spatial Analysis: relationships between ephemeral extent and... 

## ...land cover, 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Summary table indexed by catchment; set_index returns a new frame, df is unchanged
df_copy = df.set_index('Catchment ID')
df_copy


In [None]:
# Summary-statistic columns
stats_cols = ['Average Pi ephemeral',
 'Ephemeral Extent',
 'Relative ADNL-max',
 'Relative ADNL-min',
 'Relative ADNL-range',
 'Relative ADNL-mean',
 'Relative ADNL-median',
 'Relative ADNL-std' ]

# Land cover columns: every column of df_copy that is not a summary statistic
landcover_unit_cols = [col for col in df_copy.columns if col not in stats_cols]

# Columns to analyze correlation for: land cover + stats
columns_to_analyze = landcover_unit_cols + stats_cols

# Correlation matrix of all selected columns
corr_matrix = df_copy[columns_to_analyze].corr()

# Keep only land cover (rows) against stats (columns)
corr_landcover_vs_stats = corr_matrix.loc[landcover_unit_cols, stats_cols]

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(
    corr_landcover_vs_stats,
    annot=True,
    cmap='coolwarm_r',   # Blue = positive, red = negative correlations
    center=0,
    fmt=".2f",
    linewidths=0.5,
    cbar_kws={'label': 'Correlation Coefficient'}
)
# Title names land cover, consistent with the y-axis label and this section
plt.title("Correlation between Land Cover Percentages and ADNL Stats")
plt.xlabel("ADNL Statistics")
plt.ylabel("Land Cover")
plt.tight_layout()
plt.show()


## ...soil type,

In [None]:
import pickle
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle once the table is read.
with open("soil_type_distribution.pkl", 'rb') as soil_file:
    soil_type = pickle.load(soil_file)

# Transpose, then relabel the index entries to the names used in the summary table
soil_type_T = soil_type.T
soil_type_T = soil_type_T.rename(index={
    'IBP': 'Tributary A',
    'CalUnion': 'Tributary B',
    'CherryCreek': 'Tributary C'
})

soil_type_T

In [None]:
# Merge on the index: the left join keeps every row of df_copy; index labels with
# no match in soil_type_T get NaN in the soil columns
combined_soil_df = df_copy.join(soil_type_T, how='left')
combined_soil_df

In [None]:
from scipy.stats import pearsonr

# Candidate explanatory columns: every column of the merged table that is not a
# summary statistic.
# NOTE(review): this also includes the non-statistic columns that were already in
# df_copy before the join, not only the soil columns — confirm that is intended.
soil_unit_cols = [col for col in combined_soil_df.columns if col not in stats_cols]

# Correlation coefficients and p-values; entries stay NaN for pairs that are skipped
corr_values = pd.DataFrame(index=soil_unit_cols, columns=stats_cols, dtype=float)
p_values = pd.DataFrame(index=soil_unit_cols, columns=stats_cols, dtype=float)

for soil_col in soil_unit_cols:
    for stat_col in stats_cols:
        soil_data = combined_soil_df[soil_col]
        stat_data = combined_soil_df[stat_col]

        # Skip pairs containing NaNs (pearsonr would only return NaN for them)
        if soil_data.isna().any() or stat_data.isna().any():
            continue

        # Skip constant columns: the correlation is undefined
        if soil_data.nunique() <= 1 or stat_data.nunique() <= 1:
            continue

        r, p = pearsonr(soil_data, stat_data)
        corr_values.loc[soil_col, stat_col] = r
        p_values.loc[soil_col, stat_col] = p

# Mask for statistical significance (p < 0.05); NaN p-values compare as False
significance_mask = p_values < 0.05

# Keep only rows and columns with at least one significant p-value
rows_to_keep = significance_mask.any(axis=1)
cols_to_keep = significance_mask.any(axis=0)

corr_values_sig = corr_values.loc[rows_to_keep, cols_to_keep]
p_values_sig = p_values.loc[rows_to_keep, cols_to_keep]
significance_mask_sig = significance_mask.loc[rows_to_keep, cols_to_keep]

# Plot the filtered matrix; an empty matrix cannot be drawn as a heatmap
if corr_values_sig.empty:
    print("No statistically significant correlations (p < 0.05) to plot.")
else:
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        corr_values_sig,
        annot=True,
        mask=~significance_mask_sig,   # hide the non-significant cells
        cmap='coolwarm_r',
        center=0,
        fmt=".2f",
        linewidths=0.5,
        cbar_kws={'label': 'Correlation Coefficient'}
    )
    plt.title("Significant Correlations between Soil Units and ADNL Statistics (p < 0.05)")
    plt.xlabel("ADNL Statistics")
    plt.ylabel("Soil Unit Types")
    plt.tight_layout()
    plt.show()

## ...and surficial geology

In [None]:
import pickle
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle once the table is read.
with open("geological_distribution.pkl", 'rb') as geo_file:
    geo_type = pickle.load(geo_file)

# Transpose, then relabel the index entries to the names used in the summary table
geo_type_T = geo_type.T
geo_type_T = geo_type_T.rename(index={
    'IBP': 'Tributary A',
    'CalUnion': 'Tributary B',
    'CherryCreek': 'Tributary C'
})

# Merge on the index: the left join keeps every row of df_copy; index labels with
# no match in geo_type_T get NaN in the geology columns
combined_geo_df = df_copy.join(geo_type_T, how='left')
combined_geo_df


In [None]:
from scipy.stats import pearsonr

# Candidate explanatory columns: every column of the merged table that is not a
# summary statistic.
# NOTE(review): this also includes the non-statistic columns that were already in
# df_copy before the join, not only the geology columns — confirm that is intended.
geo_unit_cols = [col for col in combined_geo_df.columns if col not in stats_cols]

# Correlation coefficients and p-values; entries stay NaN for pairs that are skipped
corr_values = pd.DataFrame(index=geo_unit_cols, columns=stats_cols, dtype=float)
p_values = pd.DataFrame(index=geo_unit_cols, columns=stats_cols, dtype=float)

for geo_col in geo_unit_cols:
    for stat_col in stats_cols:
        geo_data = combined_geo_df[geo_col]
        stat_data = combined_geo_df[stat_col]

        # Skip pairs containing NaNs (pearsonr would only return NaN for them)
        if geo_data.isna().any() or stat_data.isna().any():
            continue

        # Skip constant columns: the correlation is undefined
        if geo_data.nunique() <= 1 or stat_data.nunique() <= 1:
            continue

        r, p = pearsonr(geo_data, stat_data)
        corr_values.loc[geo_col, stat_col] = r
        p_values.loc[geo_col, stat_col] = p

# Mask for statistical significance (p < 0.05); NaN p-values compare as False
significance_mask = p_values < 0.05

# Keep only rows and columns with at least one significant p-value
rows_to_keep = significance_mask.any(axis=1)
cols_to_keep = significance_mask.any(axis=0)

corr_values_sig = corr_values.loc[rows_to_keep, cols_to_keep]
p_values_sig = p_values.loc[rows_to_keep, cols_to_keep]
significance_mask_sig = significance_mask.loc[rows_to_keep, cols_to_keep]

# Plot the filtered matrix; an empty matrix cannot be drawn as a heatmap
if corr_values_sig.empty:
    print("No statistically significant correlations (p < 0.05) to plot.")
else:
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        corr_values_sig,
        annot=True,
        mask=~significance_mask_sig,   # hide the non-significant cells
        cmap='coolwarm_r',
        center=0,
        fmt=".2f",
        linewidths=0.5,
        cbar_kws={'label': 'Correlation Coefficient'}
    )
    # Labels name surficial geology, the subject of this section
    plt.title("Significant Correlations between Surficial Geology Units and ADNL Statistics (p < 0.05)")
    plt.xlabel("ADNL Statistics")
    plt.ylabel("Surficial Geology Units")
    plt.tight_layout()
    plt.show()