In [None]:
# ==============================================================================
# GPU-Accelerated Multivariate Hybrid Predictor Selection (v4 - Physical)
# ==============================================================================

# ------------------------------------------------------------------------------
# 1. Objective
# ------------------------------------------------------------------------------
# This notebook implements a scientifically robust, two-step hybrid methodology
# to select an optimal subset of 5 offshore predictor locations. This version
# is optimized for GPU execution and incorporates additional physical variables
# (Sea Surface Height 'ssh' and 'depth') to enhance the selection process.
#
# The goal is to identify the strongest causal drivers for four key wave and
# water level parameters (Hm0, Tp, Mdir, ssh) at two nearshore buoy locations.
#
# ------------------------------------------------------------------------------
# 2. Methodology
# ------------------------------------------------------------------------------
# The selection process is performed in two main stages:
#
# 1.  Physically-Weighted Correlational Filtering (GPU-Accelerated): We first
#     reduce the initial pool of offshore points to the top CANDIDATE_POOL_SIZE
#     candidates. This is done by calculating a physically-weighted rank, which
#     combines a composite correlation rank (based on Hm0, Tp, Mdir_u/v, and,
#     when the target buoy has usable ssh data, ssh) with a depth-based rank
#     that prioritizes shallower locations.
#
# 2.  Multivariate Causal Selection (CPU-Bound): The candidate points from
#     stage 1, along with both buoys, are then analyzed using the PCMCI+
#     algorithm, with offshore 'ssh' included. This step identifies offshore
#     points that are direct causal parents of the conditions at *either* buoy.
#     The final FINAL_PREDICTOR_COUNT predictors are selected based on the
#     strength of these causal links.
# ==============================================================================

# =============================================================================
# STEP A: SETUP AND ENVIRONMENT CONFIGURATION
# =============================================================================
# This step installs and imports all necessary libraries and defines the
# project's file structure and global constants.

# A.1: Install necessary libraries
# NOTE: every install command below is commented out, so this section only
# prints the banner. Un-comment the three `!` lines on a fresh runtime that
# does not yet have RAPIDS, tigramite, geopandas and contextily installed.
print("Installing required libraries...")
# The !git clone command may produce a 'fatal' error if the directory already exists. This is expected and can be ignored.
#!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
#!python rapidsai-csp-utils/colab/pip-install.py
#!pip install tigramite geopandas contextily -q
#print("Installation complete.")

# A.2: Import libraries
print("Importing core libraries...")
import os
import cudf
import cupy as cp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import contextily as cx
from google.colab import drive
from tqdm.notebook import tqdm
from sklearn.preprocessing import minmax_scale

# Import Tigramite for causal discovery
from tigramite import data_processing as pp
from tigramite.pcmci import PCMCI
from tigramite.independence_tests.parcorr import ParCorr
from tigramite.plotting import plot_graph

# A.3: Mount Google Drive
# All input and output paths below live under the mounted drive.
print("Mounting Google Drive...")
drive.mount('/content/drive')

# A.4: Define file paths and global constants
print("Defining file paths and global constants...")
BASE_PROJECT_DIR = '/content/drive/MyDrive/Paper_3_New'
# Directory scanned for yearly offshore CSV files (every *.csv in it is loaded).
OFFSHORE_DATA_DIR = os.path.join(BASE_PROJECT_DIR, 'Data', 'catboost_corrected')
BUOY_DATA_DIR = os.path.join(BASE_PROJECT_DIR, 'Data')
# NOTE(review): the output folder is tagged "v5" while the checkpoint file
# names below are tagged "v4" -- confirm this mix is intentional.
OUTPUT_DIR = os.path.join(BASE_PROJECT_DIR, 'Outputs', 'Predictor_Selection_v5_Physical')

# Buoy used as the correlation target in Step C ('buoy_main') and the second
# buoy ('buoy_oos'); both are included as targets in the causal step.
MAIN_BUOY_FILE = os.path.join(BUOY_DATA_DIR, 'buoy_072_3hourly.csv')
OOS_BUOY_FILE = os.path.join(BUOY_DATA_DIR, 'buoy_103_3hourly.csv')

# --- Checkpoint File Definitions ---
# Each step loads its checkpoint instead of recomputing when the file exists,
# so delete a checkpoint to force that step to run again.
CHECKPOINT_WIDE_DF = os.path.join(OUTPUT_DIR, 'checkpoint_wide_df_v4.parquet')
CHECKPOINT_CORR_CANDIDATES = os.path.join(OUTPUT_DIR, 'checkpoint_correlation_candidates_v4_physical.csv')
CHECKPOINT_PCMCI_RESULTS = os.path.join(OUTPUT_DIR, 'checkpoint_pcmci_results_v4.npz')

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output will be saved to: {OUTPUT_DIR}")

# --- Global Constants ---
# Variable suffixes used to build "<point_id>_<var>" column names for the
# correlation ranking (Step C) and the causal analysis (Step D).
ANALYSIS_VARS = ['hm0', 'tp', 'mdir_u', 'mdir_v', 'ssh']

# Columns retained from the offshore CSVs. Names are matched case-insensitively
# and the '_catboost' suffix is stripped by load_offshore_data_gpu.
OFFSHORE_COLS_TO_KEEP = [
    'time', 'lat', 'lon', 'depth', 'slope_ns', 'slope_ew', 'ssh',
    'Hm0_catboost', 'Tp_catboost', 'u_wind_catboost', 'v_wind_catboost',
    'u_curr_catboost', 'v_curr_catboost', 'u_mdir_catboost', 'v_mdir_catboost',
    'WindSpeed_catboost', 'WindDirection_catboost', 'CurrSpd_catboost',
    'CurrDir_catboost', 'VMDR_catboost'
]

# Maximum lag explored, expressed in hours and converted to time steps
# (24 // 3 = 8 steps at the 3-hourly resolution).
MAX_LAG_HOURS = 24
TIME_RESOLUTION_HOURS = 3
MAX_LAG_STEPS = MAX_LAG_HOURS // TIME_RESOLUTION_HOURS
# Number of offshore points kept after the correlation filter (Step C) and
# passed on to the causal analysis (Step D).
CANDIDATE_POOL_SIZE = 20
# Number of predictors kept after the causal analysis.
FINAL_PREDICTOR_COUNT = 5
# Significance level passed to PCMCI+ as pc_alpha.
PCMCI_ALPHA = 0.05
# Static depth assigned to both buoys (they have no depth column of their own).
BUOY_DEPTH_METERS = 18.0

print("\nSTEP A COMPLETE.")


# ==============================================================================
# STEP B: ROBUST DATA LOADING & PREPARATION (GPU-ACCELERATED) - UPGRADED
# ==============================================================================
def convert_circular_to_vectors(df, direction_col, speed_col=None):
    """Turn a direction column (degrees) into u/v components.

    The components are ``u = -sin(theta)`` and ``v = -cos(theta)``; when
    ``speed_col`` is given, both are scaled in place by that column.

    Args:
        df: Frame holding ``direction_col`` (and ``speed_col`` if used).
        direction_col: Name of the column with directions in degrees.
            Non-numeric entries are coerced to missing values.
        speed_col: Optional name of a magnitude column.

    Returns:
        A ``(u, v)`` tuple of the two components.
    """
    # Coerce first so stray strings become missing values instead of raising.
    degrees = cudf.to_numeric(df[direction_col], errors='coerce')
    radians = cp.deg2rad(degrees)
    east = -cp.sin(radians)
    north = -cp.cos(radians)

    if not speed_col:
        return east, north

    magnitude = df[speed_col]
    east *= magnitude
    north *= magnitude
    return east, north

def load_offshore_data_gpu(data_dir, cols_to_keep):
    """Load every offshore CSV in ``data_dir`` into one long cuDF frame.

    Column names are lower-cased, filtered to ``cols_to_keep`` (matched
    case-insensitively), stripped of the ``_catboost`` suffix, and ``vmdr`` is
    renamed to ``mdir``. Direction columns are expanded into u/v components and
    each distinct ``(lat, lon)`` pair receives a ``point_id`` of the form
    ``offshore_<n>``.

    Args:
        data_dir: Directory containing the ``*.csv`` files.
        cols_to_keep: Column names to retain; names absent from the data are
            ignored.

    Returns:
        The concatenated, standardized cuDF DataFrame.

    Raises:
        FileNotFoundError: If ``data_dir`` contains no CSV file.
        KeyError: If ``time``, ``lat``, ``lon`` or ``mdir`` is missing after
            column standardization.
    """
    print("    - Loading and concatenating yearly offshore data on GPU...")
    # os.listdir returns entries in arbitrary order; sort so the load order
    # (and therefore the row order of the result) is reproducible.
    all_files = sorted(
        os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv')
    )
    if not all_files:
        raise FileNotFoundError(f"No CSV files found in {data_dir}")
    gdf_list = [cudf.read_csv(f) for f in tqdm(all_files, desc="Loading offshore files")]
    master_df = cudf.concat(gdf_list, ignore_index=True)

    print("    - Standardizing offshore data...")
    master_df.columns = master_df.columns.str.lower()
    cols_to_keep_lower = [c.lower() for c in cols_to_keep]
    # Only select columns that actually exist in the data.
    existing_cols = [c for c in cols_to_keep_lower if c in master_df.columns]
    master_df = master_df[existing_cols]
    master_df.columns = master_df.columns.str.replace('_catboost', '')
    master_df = master_df.rename(columns={'vmdr': 'mdir'})

    # Everything below needs these columns; fail early with a clear message
    # instead of an opaque KeyError from deep inside a later call.
    missing_cols = [c for c in ('time', 'lat', 'lon', 'mdir') if c not in master_df.columns]
    if missing_cols:
        raise KeyError(
            f"Offshore data in {data_dir} is missing required column(s): {missing_cols}"
        )

    master_df['time'] = cudf.to_datetime(master_df['time'])
    print("    - Converting circular variables to vectors...")
    master_df['mdir_u'], master_df['mdir_v'] = convert_circular_to_vectors(master_df, 'mdir')
    if 'winddirection' in master_df.columns and 'windspeed' in master_df.columns:
        master_df['winddir_u'], master_df['winddir_v'] = convert_circular_to_vectors(master_df, 'winddirection', 'windspeed')
    # One id per distinct (lat, lon) pair, numbered by group.
    master_df['point_id'] = 'offshore_' + master_df.groupby(['lat', 'lon']).ngroup().astype(str)
    print(f"    - Loaded {master_df['point_id'].nunique()} unique offshore points.")
    return master_df

def hampel_filter_pandas(series, window_size=11, n_sigmas=3):
    """Flag outliers in ``series`` with a centered rolling Hampel filter.

    A point is flagged when its absolute deviation from the rolling median
    exceeds ``n_sigmas * 1.4826 * MAD``, where MAD is the median absolute
    deviation of the same window and 1.4826 is the usual MAD-to-sigma
    consistency factor.

    Missing values inside a window are ignored when computing the window
    statistics, so a NaN no longer disables detection for its neighbours.
    Missing values themselves are never flagged.

    Args:
        series: pandas Series of numeric values.
        window_size: Width of the centered rolling window, in samples.
        n_sigmas: Threshold multiplier.

    Returns:
        Boolean pandas Series aligned with ``series``; True marks an outlier.
    """
    rolling = series.rolling(window=window_size, center=True, min_periods=1)
    rolling_median = rolling.median()

    def _window_mad(values):
        # With raw=True the window still contains NaNs, so use the NaN-aware
        # median; on NaN-free windows this is identical to np.median.
        return np.nanmedian(np.abs(values - np.nanmedian(values)))

    rolling_mad = rolling.apply(_window_mad, raw=True)
    # NOTE: when a window's MAD is 0 (e.g. a run of identical values) the
    # threshold is 0 and any deviation is flagged; this matches prior behavior.
    threshold = n_sigmas * 1.4826 * rolling_mad
    outliers = np.abs(series - rolling_median) > threshold
    return outliers

def _interpolate_direction_gaps(direction_deg):
    """Fill gaps in a direction series (degrees) without crossing the wrap.

    Linear interpolation of raw degrees is wrong across the 0/360 boundary
    (350 -> 10 would pass through 180). Instead the sine and cosine are
    interpolated and the angle is rebuilt with arctan2. Observed values are
    returned unchanged; only missing entries are replaced.
    """
    radians = np.deg2rad(direction_deg)
    # Interpolate on the GPU with the same cuDF call used for hm0/tp so that
    # gap handling is the same for all three variables.
    sin_filled = cudf.from_pandas(np.sin(radians)).interpolate(method='linear').to_pandas()
    cos_filled = cudf.from_pandas(np.cos(radians)).interpolate(method='linear').to_pandas()
    reconstructed = np.rad2deg(np.arctan2(sin_filled, cos_filled)) % 360.0
    return direction_deg.where(direction_deg.notna(), reconstructed)


def load_and_clean_buoy_data(filepath, buoy_id):
    """Load one buoy CSV, clean it, and return a wide, prefixed cuDF frame.

    Steps: keep ``time``/``hm0``/``tp``/``mdir``; reindex onto a regular
    ``TIME_RESOLUTION_HOURS`` grid; add a constant ``depth`` and an all-NaN
    ``ssh`` placeholder; treat zeros in ``hm0``/``tp`` as missing; remove
    Hampel-filter outliers; fill gaps (linear for ``hm0``/``tp``, circular for
    ``mdir``); add ``mdir_u``/``mdir_v``; prefix every column except ``time``
    with ``"<buoy_id>_"``.

    Args:
        filepath: Path to the buoy CSV file.
        buoy_id: Prefix applied to the output columns.

    Returns:
        cuDF DataFrame with a ``time`` column and ``<buoy_id>_*`` columns.
    """
    print(f"    - Loading and cleaning buoy data for: {buoy_id}")
    df = cudf.read_csv(filepath)
    df.columns = df.columns.str.lower()
    df = df[['time', 'hm0', 'tp', 'mdir']].copy()
    df['time'] = cudf.to_datetime(df['time'])
    df = df.set_index('time').sort_index()

    # Put the record on a gap-free time grid; missing steps become empty rows.
    full_index = cudf.date_range(start=df.index.min(), end=df.index.max(), freq=f'{TIME_RESOLUTION_HOURS}h')
    df = df.reindex(full_index)
    df.index.name = 'time'

    # Placeholder physical columns so buoys share the offshore schema.
    df['depth'] = BUOY_DEPTH_METERS
    df['ssh'] = np.nan  # No SSH data for buoys, explicitly set to NaN

    for col in ['hm0', 'tp', 'mdir']:
        # Outlier detection runs on the CPU because the Hampel filter is pandas-based.
        series_cpu = df[col].to_pandas()
        if col in ['hm0', 'tp']:
            # A zero height/period is treated as a missing reading.
            series_cpu.replace(0, np.nan, inplace=True)
        series_no_na = series_cpu.dropna()
        if not series_no_na.empty:
            outliers_mask = hampel_filter_pandas(series_no_na)
            series_cpu.loc[outliers_mask[outliers_mask].index] = np.nan

        if col == 'mdir':
            # NOTE(review): the Hampel filter above still treats mdir as a
            # linear quantity; only the gap filling is wrap-aware.
            series_cpu = _interpolate_direction_gaps(series_cpu)
            df[col] = cudf.from_pandas(series_cpu)
        else:
            df[col] = cudf.from_pandas(series_cpu)
            df[col] = df[col].interpolate(method='linear')

    print("    - Converting circular variables to vectors...")
    df['mdir_u'], df['mdir_v'] = convert_circular_to_vectors(df, 'mdir')

    df_final = df.reset_index()
    cols_to_rename = {col: f"{buoy_id}_{col}" for col in df_final.columns if col != 'time'}
    df_final = df_final.rename(columns=cols_to_rename)

    return df_final

print("STEP B: GPU-ACCELERATED DATA LOADING AND PREPARATION")

# Build (or reload) the wide analysis frame: one row per time step, one column
# per "<point_id>_<variable>" for offshore points plus "<buoy_id>_<variable>"
# columns for the two buoys.
if os.path.exists(CHECKPOINT_WIDE_DF):
    # NOTE: on this path offshore_long_df is NOT defined; Step E reloads it
    # when it is missing.
    print(f"  - Checkpoint found! Loading pre-processed wide DataFrame from {CHECKPOINT_WIDE_DF}")
    analysis_df = cudf.read_parquet(CHECKPOINT_WIDE_DF)
else:
    print("  - No checkpoint found. Running full data loading and pre-processing pipeline...")
    offshore_long_df = load_offshore_data_gpu(OFFSHORE_DATA_DIR, OFFSHORE_COLS_TO_KEEP)
    main_buoy_df = load_and_clean_buoy_data(MAIN_BUOY_FILE, 'buoy_main')
    oos_buoy_df = load_and_clean_buoy_data(OOS_BUOY_FILE, 'buoy_oos')

    print("    - Pivoting offshore data to wide format on GPU...")
    # Pivot the raw variables only; the mdir u/v components are recomputed on
    # the wide frame below rather than pivoted.
    pivot_vars = ['hm0', 'tp', 'mdir', 'ssh', 'depth']
    analysis_df = offshore_long_df.pivot_table(
        index='time',
        columns='point_id',
        values=pivot_vars
    )
    # Flatten the (variable, point_id) column pairs into "<point_id>_<variable>".
    analysis_df.columns = [f"{col[1]}_{col[0]}" for col in analysis_df.columns]
    analysis_df = analysis_df.reset_index()

    # Re-calculate vector components on the wide dataframe
    print("    - Creating vector components on wide DataFrame...")
    # A point id is the first two underscore-separated tokens, e.g. "offshore_12".
    offshore_point_ids = sorted(list(set(['_'.join(c.split('_')[:2]) for c in analysis_df.columns if c.startswith('offshore_')])))
    for pid in offshore_point_ids:
        if f'{pid}_mdir' in analysis_df.columns:
            analysis_df[f'{pid}_mdir_u'], analysis_df[f'{pid}_mdir_v'] = convert_circular_to_vectors(analysis_df, f'{pid}_mdir')

    print("    - Merging offshore and buoy data...")
    # Inner joins keep only time steps present in the offshore data and in
    # both buoy records.
    analysis_df = analysis_df.merge(main_buoy_df, on='time', how='inner')
    analysis_df = analysis_df.merge(oos_buoy_df, on='time', how='inner')
    analysis_df = analysis_df.set_index('time').sort_index()

    # Drop rows where every column whose name contains 'buoy_main_hm0' is missing.
    main_buoy_hm0 = [c for c in analysis_df.columns if 'buoy_main_hm0' in c]
    analysis_df = analysis_df.dropna(how='all', subset=main_buoy_hm0)

    print(f"    - Saving analysis DataFrame checkpoint to {CHECKPOINT_WIDE_DF}")
    analysis_df.to_parquet(CHECKPOINT_WIDE_DF)

print(f"  - Final analysis cuDF ready with shape: {analysis_df.shape}")
print("\nSTEP B COMPLETE.")

# ==============================================================================
# STEP C: PHYSICALLY-WEIGHTED CORRELATIONAL FILTERING (GPU-ACCELERATED)
# ==============================================================================
# This step identifies the top CANDIDATE_POOL_SIZE candidate predictors. It
# calculates lagged cross-correlations for `Hm0`, `Tp`, `Mdir_u/v`, and `ssh`
# between each offshore point and the main buoy (a variable is skipped when no
# point has enough valid pairs with the buoy). A composite rank is then created.
# NOVELTY: This rank is combined with a physically-based depth score to create a
# final weighted rank, prioritizing points that are both highly correlated and
# in physically relevant (shallower) locations.

def calculate_lagged_corr_gpu(df, target_col, predictor_col, max_lag_steps):
    """Return the largest absolute lagged correlation between two columns.

    For every lag from 0 to ``max_lag_steps`` (inclusive) the target column is
    shifted ``lag`` rows earlier and correlated with the predictor column. Lags
    whose correlation is undefined (NaN) are ignored.

    Returns:
        The maximum absolute correlation found, or 0.0 when no lag produced a
        defined correlation.
    """
    predictor = df[predictor_col]
    target = df[target_col]

    strongest = -1.0  # sentinel: below any possible |correlation|
    for step in range(max_lag_steps + 1):
        # shift(-step) aligns the target `step` rows in the future with the
        # predictor's current row.
        coefficient = predictor.corr(target.shift(-step))
        if pd.isna(coefficient):
            continue
        magnitude = abs(coefficient)
        if magnitude > strongest:
            strongest = magnitude

    if strongest == -1.0:
        return 0.0
    return strongest

def get_physically_weighted_ranks_gpu(df, offshore_ids, target_buoy_id, analysis_vars, max_lag_steps):
    """Rank offshore points by lagged correlation with a buoy, plus depth.

    For each variable in ``analysis_vars`` the best absolute lagged correlation
    between ``"<point_id>_<var>"`` and ``"<target_buoy_id>_<var>"`` is computed
    and ranked (1 = strongest). The per-variable ranks are summed into
    ``total_rank``; a ``depth_rank`` (1 = smallest median depth value) is added
    to give ``physically_weighted_rank``. Lower is better throughout.

    Variables whose target column is absent, or for which no point has more
    than ``2 * max_lag_steps`` valid pairs, are skipped with a warning. Points
    missing from any ranked variable or from the depth table drop out of the
    result because the merges are inner joins.

    Args:
        df: Wide frame exposing ``to_pandas()`` (a cuDF DataFrame in this file).
        offshore_ids: Offshore point ids to evaluate.
        target_buoy_id: Column prefix of the target buoy.
        analysis_vars: Variable suffixes to correlate.
        max_lag_steps: Largest lag, in rows, to test.

    Returns:
        pandas DataFrame sorted by ``physically_weighted_rank`` ascending.

    Raises:
        ValueError: If no variable produced any correlation, or if no offshore
            point has a usable depth value.
    """
    # Ranking and merging are done in pandas, so move to the CPU once.
    df_pd = df.to_pandas()
    min_valid_pairs = max_lag_steps * 2
    all_results = []

    for var in tqdm(analysis_vars, desc="Processing Variables"):
        target_col = f"{target_buoy_id}_{var}"
        if target_col not in df_pd.columns:
            print(f"Warning: Target column {target_col} not found. Skipping variable {var}.")
            continue

        var_results = []
        for point_id in tqdm(offshore_ids, desc=f"Correlations for {var}", leave=False):
            predictor_col = f"{point_id}_{var}"
            if predictor_col not in df_pd.columns:
                continue
            pair_df = df_pd[[predictor_col, target_col]]
            # Require enough jointly valid samples, but keep the NaN rows in
            # the frame passed on: the lag is applied as a row shift, so
            # dropping rows first would pair samples at the wrong time offset.
            # Series.corr already ignores incomplete pairs.
            if len(pair_df.dropna()) <= min_valid_pairs:
                continue
            corr = calculate_lagged_corr_gpu(pair_df, target_col, predictor_col, max_lag_steps)
            var_results.append({'point_id': point_id, 'correlation': corr})

        if not var_results:
            # Previously silent: e.g. a target column that is entirely NaN.
            print(f"Warning: No offshore point has enough valid data for {var}. Skipping variable {var}.")
            continue

        var_df = pd.DataFrame(var_results)
        var_df[f'rank_{var}'] = var_df['correlation'].rank(ascending=False, method='min')
        all_results.append(var_df[['point_id', f'rank_{var}']])

    if not all_results:
        raise ValueError("Correlation analysis yielded no results. Check input data and column names.")

    final_ranks_df = all_results[0]
    for per_var_ranks in all_results[1:]:
        final_ranks_df = pd.merge(final_ranks_df, per_var_ranks, on='point_id')

    rank_cols = [col for col in final_ranks_df.columns if 'rank_' in col]
    final_ranks_df['total_rank'] = final_ranks_df[rank_cols].sum(axis=1)

    # --- Incorporate physical weighting by depth ---
    print("    - Incorporating physical weighting based on depth...")
    depth_data = []
    for pid in offshore_ids:
        depth_col = f"{pid}_depth"
        if depth_col in df_pd.columns:
            # Depth is carried as a time series; the median collapses it to one value.
            depth_val = df_pd[depth_col].median()
            if not pd.isna(depth_val):
                depth_data.append({'point_id': pid, 'depth': depth_val})

    if not depth_data:
        raise ValueError("No usable depth values found for any offshore point. Check the '<point_id>_depth' columns.")

    depth_df = pd.DataFrame(depth_data)
    # Smaller depth values get the better (lower) rank.
    depth_df['depth_rank'] = depth_df['depth'].rank(ascending=True, method='min')

    final_ranks_df = pd.merge(final_ranks_df, depth_df[['point_id', 'depth', 'depth_rank']], on='point_id')
    final_ranks_df['physically_weighted_rank'] = final_ranks_df['total_rank'] + final_ranks_df['depth_rank']

    return final_ranks_df.sort_values('physically_weighted_rank').reset_index(drop=True)

print("STEP C: PHYSICALLY-WEIGHTED CORRELATIONAL FILTERING")

# Obtain the ranked table of offshore points (from the checkpoint if present,
# otherwise by computing it) and keep the first CANDIDATE_POOL_SIZE point ids.
if os.path.exists(CHECKPOINT_CORR_CANDIDATES):
    print(f"  - Checkpoint found! Loading correlation candidates from {CHECKPOINT_CORR_CANDIDATES}")
    candidates_df = pd.read_csv(CHECKPOINT_CORR_CANDIDATES)
    # The checkpoint stores the full ranked table, already sorted best-first.
    candidate_predictors = candidates_df['point_id'].head(CANDIDATE_POOL_SIZE).tolist()
else:
    print("  - No checkpoint found. Running GPU-accelerated multivariate correlation ranking...")
    # A point id is the first two underscore-separated tokens of a column name.
    offshore_point_ids = sorted(list(set(['_'.join(c.split('_')[:2]) for c in analysis_df.columns if c.startswith('offshore_')])))

    ranked_df = get_physically_weighted_ranks_gpu(
        analysis_df,
        offshore_ids=offshore_point_ids,
        target_buoy_id='buoy_main',
        analysis_vars=ANALYSIS_VARS,
        max_lag_steps=MAX_LAG_STEPS
    )

    print(f"    - Saving correlation candidates checkpoint to {CHECKPOINT_CORR_CANDIDATES}")
    ranked_df.to_csv(CHECKPOINT_CORR_CANDIDATES, index=False)
    candidate_predictors = ranked_df.head(CANDIDATE_POOL_SIZE)['point_id'].tolist()

print(f"  - Identified top {CANDIDATE_POOL_SIZE} candidate predictors based on physically-weighted rank.")
print("  - Candidates:", candidate_predictors)

print("\nTop 5 Candidates from physically-weighted ranking:")
# Re-read the checkpoint so the preview is the same on both branches above.
# `display` is not imported in this file; presumably the notebook built-in.
display(pd.read_csv(CHECKPOINT_CORR_CANDIDATES).head())
print("\nSTEP C COMPLETE.")


# ==============================================================================
# STEP D: MULTIVARIATE CAUSAL SELECTION (CPU-BOUND) - UPGRADED
# ==============================================================================
# This step constructs a large multivariate system including the candidate
# predictors from Step C and both buoys. The PCMCI+ algorithm, with offshore
# 'ssh' included, is run to discover direct causal links. The final
# FINAL_PREDICTOR_COUNT predictors are selected based on the strength of these
# links to *either* the main or OOS buoy.

print("STEP D: MULTIVARIATE CAUSAL SELECTION (CPU-BOUND)")

# --- Build the list of columns for the causal analysis ---

# Buoys have no ssh data (their ssh column is all-NaN), so it is excluded for them.
buoy_analysis_vars = [v for v in ANALYSIS_VARS if v != 'ssh']

causal_candidate_cols = []
# Variables of the candidate predictors from Step C (including their ssh).
for p_id in candidate_predictors:
    for var in ANALYSIS_VARS:
        col_name = f"{p_id}_{var}"
        if col_name in analysis_df.columns:
            causal_candidate_cols.append(col_name)

# Variables of both buoys, using the list that excludes ssh.
for b_id in ['buoy_main', 'buoy_oos']:
    for var in buoy_analysis_vars:
        col_name = f"{b_id}_{var}"
        if col_name in analysis_df.columns:
            causal_candidate_cols.append(col_name)

# Move to the CPU and fill every gap: interior gaps linearly, then leading and
# trailing gaps by back/forward fill. `.bfill()/.ffill()` replace the
# deprecated `fillna(method=...)` form and behave identically.
causal_df_pandas = (
    analysis_df[causal_candidate_cols]
    .to_pandas()
    .interpolate(method='linear')
    .bfill()
    .ffill()
)
var_names = causal_df_pandas.columns.tolist()

# Tigramite cannot handle NaNs here, so fail loudly if any survived (e.g. an
# all-NaN column).
if causal_df_pandas.isnull().values.any():
    raise ValueError("NaNs still exist in the data before passing to Tigramite. Check data cleaning steps.")

if os.path.exists(CHECKPOINT_PCMCI_RESULTS):
    print(f"  - Checkpoint found! Loading PCMCI+ results from {CHECKPOINT_PCMCI_RESULTS}")
    # SECURITY NOTE: allow_pickle=True executes pickled content; only load
    # checkpoints written by this notebook.
    results_data = np.load(CHECKPOINT_PCMCI_RESULTS, allow_pickle=True)
    results = results_data['results'].item()

    # The result matrices are indexed by position in var_names. If the
    # candidate set changed since the checkpoint was written, the indices
    # would silently point at the wrong variables, so verify before reuse.
    # Checkpoints written before var_names was stored are checked by size only.
    saved_var_names = results_data['var_names'].tolist() if 'var_names' in results_data.files else None
    names_mismatch = saved_var_names is not None and saved_var_names != var_names
    size_mismatch = results['graph'].shape[0] != len(var_names)
    if names_mismatch or size_mismatch:
        raise ValueError(
            f"PCMCI+ checkpoint {CHECKPOINT_PCMCI_RESULTS} does not match the current variable list. "
            "Delete the checkpoint file and re-run this step."
        )
else:
    print("  - No checkpoint found. Running PCMCI+ algorithm on CPU... (this may take several hours)")
    dataframe = pp.DataFrame(causal_df_pandas.values, var_names=var_names)
    parcorr = ParCorr(significance='analytic')
    pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1)

    results = pcmci.run_pcmciplus(tau_min=1, tau_max=MAX_LAG_STEPS, pc_alpha=PCMCI_ALPHA)

    print(f"    - Saving PCMCI+ results checkpoint to {CHECKPOINT_PCMCI_RESULTS}")
    # Store the variable order next to the results so a later run can detect
    # a stale checkpoint.
    np.savez_compressed(CHECKPOINT_PCMCI_RESULTS, results=results, var_names=np.array(var_names))

# --- Extract and Rank Causal Parents ---
print("  - Extracting causal parents of BOTH buoys from the graph...")
causal_graph = results['graph']
val_matrix = results['val_matrix']
causal_links = []

target_indices = [i for i, name in enumerate(var_names) if name.startswith('buoy_')]

for predictor_idx, predictor_name in enumerate(var_names):
    if predictor_name.startswith('offshore_'):
        for target_idx in target_indices:
            # Only lagged links (lag >= 1) directed from predictor to target count.
            for lag in range(1, causal_graph.shape[2]):
                if causal_graph[predictor_idx, target_idx, lag] == '-->':
                    causal_links.append({
                        # Collapse "<point_id>_<var>" to the point id.
                        'predictor_id': '_'.join(predictor_name.split('_')[:2]),
                        'link_strength': abs(val_matrix[predictor_idx, target_idx, lag])
                    })

if not causal_links:
    print("  - WARNING: No causal links found. Falling back to physically-weighted correlation candidates.")
    final_predictor_ids = candidate_predictors[:FINAL_PREDICTOR_COUNT]
else:
    # Rank predictors by their strongest causal link to any buoy variable.
    # NOTE: fewer than FINAL_PREDICTOR_COUNT ids are returned when fewer
    # points have at least one causal link.
    causal_df = pd.DataFrame(causal_links)
    ranked_causal_predictors = causal_df.groupby('predictor_id')['link_strength'].max().sort_values(ascending=False)
    final_predictor_ids = ranked_causal_predictors.head(FINAL_PREDICTOR_COUNT).index.tolist()

print(f"  - Identified top {len(final_predictor_ids)} final predictors with direct causal links.")
print("  - Final Predictors:", final_predictor_ids)
print("\nSTEP D COMPLETE.")

# ==============================================================================
# STEP E: REPORTING, VISUALIZATION, AND FINAL DATASET CREATION - UPGRADED
# ==============================================================================
import networkx as nx

def plot_summary_causal_graph(results, var_names, final_predictors, candidate_predictors, save_path):
    """Draw and save a point-level summary of offshore -> buoy causal links.

    Variable-level links in ``results['graph']`` are collapsed to one edge per
    (offshore point, buoy) pair, weighted by the largest absolute
    ``results['val_matrix']`` entry over all lags >= 1 and all variable pairs
    where the graph entry is ``'-->'``.

    Args:
        results: Mapping with ``'graph'`` and ``'val_matrix'`` arrays indexed
            ``[source, target, lag]`` in the order of ``var_names``.
        var_names: Variable names of the form ``"<point_id>_<variable>"``.
        final_predictors: Point ids drawn as selected (green) nodes.
        candidate_predictors: Point ids drawn as nodes; those not in
            ``final_predictors`` are grey.
        save_path: Output image path.

    Returns:
        None. The figure is written to ``save_path`` and closed.
    """
    print("  - Generating enhanced summary causal graph...")
    buoy_nodes = ['buoy_main', 'buoy_oos']
    all_nodes = sorted(set(candidate_predictors)) + buoy_nodes
    G = nx.DiGraph()
    G.add_nodes_from(all_nodes)

    causal_links = {}
    graph_matrix, val_matrix = results['graph'], results['val_matrix']

    for i, p_name in enumerate(var_names):
        if not p_name.startswith('offshore_'):
            continue
        from_node = '_'.join(p_name.split('_')[:2])
        for j, t_name in enumerate(var_names):
            if not t_name.startswith('buoy_'):
                continue
            to_node = '_'.join(t_name.split('_')[:2])

            # Strongest directed link over all lags for this variable pair.
            max_strength = 0
            for lag in range(1, graph_matrix.shape[2]):
                if graph_matrix[i, j, lag] == '-->':
                    max_strength = max(max_strength, abs(val_matrix[i, j, lag]))

            # Keep it only if it beats the links already seen between these two points.
            if max_strength > 0 and max_strength > causal_links.get((from_node, to_node), 0):
                causal_links[(from_node, to_node)] = max_strength

    for (u, v), w in causal_links.items():
        G.add_edge(u, v, weight=w)

    # Apply the style only while this figure is built and saved, so the
    # process-wide matplotlib style is left untouched for other plots.
    with plt.style.context('seaborn-v0_8-talk'):
        fig, ax = plt.subplots(figsize=(16, 16))
        pos = nx.circular_layout(G)

        # Colour and size encode the node's role.
        node_colors = []
        node_sizes = []
        for node in G.nodes():
            if node in buoy_nodes:
                node_colors.append('gold' if node == 'buoy_main' else 'skyblue')
                node_sizes.append(8000)
            elif node in final_predictors:
                node_colors.append('limegreen')
                node_sizes.append(6000)
            else:  # Candidate but not selected
                node_colors.append('lightgrey')
                node_sizes.append(4000)

        # Line width is proportional to link strength.
        edge_weights = [d['weight'] * 15 for u, v, d in G.edges(data=True)]

        nx.draw_networkx_nodes(G, pos, ax=ax, node_color=node_colors, node_size=node_sizes, edgecolors='black')
        nx.draw_networkx_labels(G, pos, ax=ax, font_size=12, font_weight='bold')
        nx.draw_networkx_edges(G, pos, ax=ax, width=edge_weights, edge_color='darkred', alpha=0.7, arrows=True, arrowstyle='-|>', arrowsize=30, connectionstyle='arc3,rad=0.05')
        ax.set_title("Summary of Causal Links from Offshore Points to Buoys (SSH Included)", fontsize=24, pad=20)
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
    print(f"  - Enhanced causal graph saved to {save_path}")

print("STEP E: REPORTING, VISUALIZATION, AND FINAL DATASET CREATION")

# E.1: Generate Predictor Map
print("  - Generating predictor map...")
# offshore_long_df only exists if Step B ran its full pipeline; when Step B
# loaded its checkpoint instead, reload the long-format data here.
if 'offshore_long_df' not in locals():
     offshore_long_df = load_offshore_data_gpu(OFFSHORE_DATA_DIR, OFFSHORE_COLS_TO_KEEP)

# One row per offshore point, moved to pandas for geopandas.
locations_df = offshore_long_df[['point_id', 'lat', 'lon']].drop_duplicates().to_pandas()
gdf = gpd.GeoDataFrame(locations_df, geometry=gpd.points_from_xy(locations_df.lon, locations_df.lat), crs="EPSG:4326")
# Buoy positions are hard-coded here (and again in E.4).
buoy_locations = {'point_id': ['buoy_main', 'buoy_oos'], 'lat': [24.513, 24.535], 'lon': [56.647, 56.629]}
buoy_gdf = gpd.GeoDataFrame(pd.DataFrame(buoy_locations), geometry=gpd.points_from_xy(buoy_locations['lon'], buoy_locations['lat']), crs="EPSG:4326")
full_gdf = pd.concat([gdf, buoy_gdf], ignore_index=True)
# Category precedence: buoys, then final predictors, then remaining candidates.
full_gdf['category'] = full_gdf['point_id'].apply(lambda pid: 'Main Target Buoy' if 'buoy_main' in pid else 'OOS Target Buoy' if 'buoy_oos' in pid else 'Final Selected Predictor' if pid in final_predictor_ids else 'Candidate Predictor' if pid in candidate_predictors else 'Not Selected')
# Reproject to EPSG:3857 before adding the basemap.
gdf_mercator = full_gdf.to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(12, 12))
# Plot keyword arguments per category.
cat_style = {
    'Main Target Buoy': {'color': 'gold', 'markersize': 400, 'marker': '*', 'edgecolor': 'black', 'zorder': 10},
    'OOS Target Buoy': {'color': 'skyblue', 'markersize': 400, 'marker': '*', 'edgecolor': 'black', 'zorder': 10},
    'Final Selected Predictor': {'color': 'limegreen', 'markersize': 200, 'marker': 'o', 'edgecolor': 'black', 'zorder': 9},
    'Candidate Predictor': {'color': 'grey', 'markersize': 50, 'marker': 'o', 'alpha': 0.7, 'zorder': 8},
    'Not Selected': {'color': 'tomato', 'markersize': 15, 'marker': '.', 'alpha': 0.5, 'zorder': 7}
}
for cat, data in gdf_mercator.groupby('category'):
    if cat != 'Not Selected': # Optionally hide non-candidate points to reduce clutter
      data.plot(ax=ax, label=cat, **cat_style[cat])

cx.add_basemap(ax, source=cx.providers.Esri.OceanBasemap, zoom=9)
ax.set_title(f'Final {FINAL_PREDICTOR_COUNT} Physically & Causally Selected Predictors', fontsize=18)
ax.set_axis_off()
ax.legend(title='Point Category', loc='lower right', frameon=True, facecolor='lightgray')
map_path = os.path.join(OUTPUT_DIR, 'predictor_map_v4.png')
plt.savefig(map_path, dpi=300, bbox_inches='tight')
plt.close(fig)
print(f"  - Predictor map saved to {map_path}")

# E.2: Generate Enhanced Causal Graph Plot
# Uses `results` and `var_names` produced in Step D.
graph_path = os.path.join(OUTPUT_DIR, 'causal_graph_summary_v4.png')
plot_summary_causal_graph(results, var_names, final_predictor_ids, candidate_predictors, graph_path)

# E.3: Save final predictor IDs
print("  - Saving final predictor IDs...")
pd.DataFrame({'point_id': final_predictor_ids}).to_csv(os.path.join(OUTPUT_DIR, 'final_predictor_ids_v4.csv'), index=False)
print(f"  - Final predictor IDs saved.")

# E.4: Create and save the final master dataset
# Long-format table holding the selected offshore points plus both buoys.
print("  - Creating and saving the final, optimized master dataset...")
final_offshore_df = offshore_long_df[offshore_long_df['point_id'].isin(final_predictor_ids)]

# NOTE(review): the buoy rows are re-read from the raw CSVs here, so they do
# not carry the zero-removal, outlier filtering, or gap filling applied by
# load_and_clean_buoy_data -- confirm that raw buoy data is intended.
buoys_list = []
for file, pid, lat, lon in [(MAIN_BUOY_FILE, 'buoy_main', 24.513, 56.647), (OOS_BUOY_FILE, 'buoy_oos', 24.535, 56.629)]:
    df = cudf.read_csv(file)
    df.columns = df.columns.str.lower()
    # Parse timestamps now: the offshore 'time' column is already datetime,
    # and concatenating it with a raw text column would mix dtypes.
    df['time'] = cudf.to_datetime(df['time'])
    df['point_id'], df['lat'], df['lon'] = pid, lat, lon
    df['depth'] = BUOY_DEPTH_METERS # Add static depth
    df['ssh'] = cudf.Series(cp.full(len(df), np.nan), dtype='float64') # Add placeholder ssh
    buoys_list.append(df)

all_buoys_df = cudf.concat(buoys_list)

# Columns of the master dataset; each source contributes the ones it has.
final_cols_to_select = ['time', 'point_id', 'lat', 'lon', 'hm0', 'tp', 'mdir', 'windspeed', 'winddirection', 'depth', 'ssh']

# Ensure columns exist before trying to select them
final_offshore_cols = [c for c in final_cols_to_select if c in final_offshore_df.columns]
all_buoys_cols = [c for c in final_cols_to_select if c in all_buoys_df.columns]
final_offshore_subset = final_offshore_df[final_offshore_cols]
all_buoys_subset = all_buoys_df[all_buoys_cols]

final_master_dataset = cudf.concat([final_offshore_subset, all_buoys_subset], ignore_index=True)
# Kept as a safeguard; both inputs are already datetime at this point.
final_master_dataset['time'] = cudf.to_datetime(final_master_dataset['time'])
final_master_dataset = final_master_dataset.sort_values(by=['point_id', 'time']).reset_index(drop=True)

master_path = os.path.join(OUTPUT_DIR, 'master_dataset_causally_optimized_v2.csv')
final_master_dataset.to_pandas().to_csv(master_path, index=False, date_format='%Y-%m-%d %H:%M:%S')
print(f"  - Final master dataset saved to {master_path}. Shape: {final_master_dataset.shape}")

print("\nSTEP E COMPLETE.")
print("\n========================================================")
print(" SCRIPT COMPLETE")
print("========================================================")

Installing required libraries...
fatal: destination path 'rapidsai-csp-utils' already exists and is not an empty directory.
Installing RAPIDS remaining 25.04 libraries
Using Python 3.11.13 environment at: /usr
Resolved 175 packages in 9.38s
Downloading cucim-cu12 (5.6MiB)
Downloading rmm-cu12 (1.5MiB)
Downloading ucx-py-cu12 (2.2MiB)
Downloading libcuspatial-cu12 (31.1MiB)
Downloading libcuvs-cu12 (1.1GiB)
Downloading pylibcudf-cu12 (26.4MiB)
Downloading nvidia-nvcomp-cu12 (44.1MiB)
Downloading librmm-cu12 (2.9MiB)
Downloading libcudf-cu12 (538.8MiB)
Downloading cudf-cu12 (1.7MiB)
Downloading libcugraph-cu12 (1.4GiB)
Downloading cuspatial-cu12 (4.1MiB)
Downloading cuml-cu12 (9.4MiB)
Downloading raft-dask-cu12 (274.9MiB)
Downloading pylibcugraph-cu12 (2.0MiB)
Downloading cuproj-cu12 (1.1MiB)
Downloading libkvikio-cu12 (2.0MiB)
Downloading libcuml-cu12 (404.9MiB)
Downloading libraft-cu12 (20.8MiB)
Downloading cugraph-cu12 (3.0MiB)
 Downloaded cuproj-cu12
 Downloaded ucx-py-cu12
 Download

Loading offshore files:   0%|          | 0/7 [00:00<?, ?it/s]

    - Standardizing offshore data...
    - Converting circular variables to vectors...
    - Loaded 101 unique offshore points.
    - Loading and cleaning buoy data for: buoy_main
    - Converting circular variables to vectors...
    - Loading and cleaning buoy data for: buoy_oos
    - Converting circular variables to vectors...
    - Pivoting offshore data to wide format on GPU...
    - Creating vector components on wide DataFrame...
    - Merging offshore and buoy data...
    - Saving analysis DataFrame checkpoint to /content/drive/MyDrive/Paper_3_New/Outputs/Predictor_Selection_v5_Physical/checkpoint_wide_df_v4.parquet
  - Final analysis cuDF ready with shape: (5601, 721)

STEP B COMPLETE.
STEP C: PHYSICALLY-WEIGHTED CORRELATIONAL FILTERING
  - No checkpoint found. Running GPU-accelerated multivariate correlation ranking...


Processing Variables:   0%|          | 0/5 [00:00<?, ?it/s]

Correlations for hm0:   0%|          | 0/101 [00:00<?, ?it/s]

Correlations for tp:   0%|          | 0/101 [00:00<?, ?it/s]

Correlations for mdir_u:   0%|          | 0/101 [00:00<?, ?it/s]

Correlations for mdir_v:   0%|          | 0/101 [00:00<?, ?it/s]

Correlations for ssh:   0%|          | 0/101 [00:00<?, ?it/s]

    - Incorporating physical weighting based on depth...
    - Saving correlation candidates checkpoint to /content/drive/MyDrive/Paper_3_New/Outputs/Predictor_Selection_v5_Physical/checkpoint_correlation_candidates_v4_physical.csv
  - Identified top 20 candidate predictors based on physically-weighted rank.
  - Candidates: ['offshore_24', 'offshore_34', 'offshore_56', 'offshore_45', 'offshore_35', 'offshore_25', 'offshore_16', 'offshore_57', 'offshore_26', 'offshore_46', 'offshore_36', 'offshore_68', 'offshore_17', 'offshore_79', 'offshore_58', 'offshore_27', 'offshore_69', 'offshore_80', 'offshore_81', 'offshore_18']

Top 5 Candidates from physically-weighted ranking:


Unnamed: 0,point_id,rank_hm0,rank_tp,rank_mdir_u,rank_mdir_v,total_rank,depth,depth_rank,physically_weighted_rank
0,offshore_24,2.0,3.0,2.0,2.0,9.0,21.0,8.0,17.0
1,offshore_34,1.0,4.0,1.0,1.0,7.0,28.0,11.0,18.0
2,offshore_56,9.0,14.0,3.0,3.0,29.0,14.0,5.0,34.0
3,offshore_45,3.0,15.0,4.0,4.0,26.0,91.0,22.0,48.0
4,offshore_35,5.0,10.0,6.0,10.0,31.0,71.0,18.0,49.0



STEP C COMPLETE.
STEP D: MULTIVARIATE CAUSAL SELECTION (CPU-BOUND)
  - No checkpoint found. Running PCMCI+ algorithm on CPU... (this may take several hours)


  causal_df_pandas = analysis_df[causal_candidate_cols].to_pandas().interpolate(method='linear').fillna(method='bfill').fillna(method='ffill')



##
## Step 1: PC1 algorithm for selecting lagged conditions
##

Parameters:
independence test = par_corr
tau_min = 1
tau_max = 8
pc_alpha = [0.05]
max_conds_dim = None
max_combinations = 1


