In [3]:
# -*- coding: utf-8 -*-
"""
Phase 1: Time-Series Feature Engineering for Coastal Wave Forecasting (Corrected v4.2)

This script performs comprehensive feature engineering on a causally-optimized,
multivariate time series of coastal data. It prepares the dataset for a
subsequent predictive modeling phase.

The pipeline includes:
1.  Setup and Data Loading from Google Drive, including a pivot from long to wide format.
2.  Efficient creation of lag, rolling window, cyclical, and interaction features.
3.  A corrected data splitting and cleaning workflow to prevent data leakage.
4.  A robust, multi-stage feature selection process optimized for speed and accuracy:
    a. Feature selection is performed *only* on the training/validation data.
    b. Importance-based pruning with LightGBM.
    c. Aggressive fast correlation-based pruning (threshold=0.90).
    d. Extremely Fast Final Selection using Scikit-learn's SelectFromModel.
5.  Creation and saving of the final engineered dataset.

Author: Your Name/Gemini
Date: 2025-07-26
"""

# =============================================================================
# Step 1: Setup and Environment Configuration
# =============================================================================
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.feature_selection import SelectFromModel
from google.colab import drive

def setup_environment(project_root):
    """
    Mounts Google Drive and sets up project paths.
    """
    print("Mounting Google Drive...")
    try:
        drive.mount('/content/drive', force_remount=True)
        print("Google Drive mounted successfully.")
    except Exception as e:
        print(f"Error mounting Google Drive: {e}")
        return None, None

    input_data_path = os.path.join(project_root, 'Outputs/Predictor_Selection_v5_Physical/')
    # Update output folder to v3 to reflect the corrected methodology
    output_data_path = os.path.join(project_root, 'Outputs/Feature_Engineering_v1/')

    os.makedirs(output_data_path, exist_ok=True)
    print(f"Output directory created at: {output_data_path}")

    return input_data_path, output_data_path

def load_data(input_path, static_path):
    """
    Loads master time-series and static features. Reshapes the master dataset
    from long to wide format and engineers a slope feature.
    """
    print("\nLoading and reshaping datasets...")
    try:
        df_long = pd.read_csv(input_path, parse_dates=['time'], index_col='time')

        print("Pivoting master dataset from long to wide format...")
        variables_to_pivot = ['hm0', 'tp', 'mdir', 'windspeed', 'winddirection', 'ssh']
        existing_vars = [var for var in variables_to_pivot if var in df_long.columns]

        df_master = df_long.pivot_table(
            index='time', columns='point_id', values=existing_vars
        )
        df_master.columns = [f"{col[1]}_{col[0]}" for col in df_master.columns.values]
        df_master.columns = [col.replace('hm0', 'hs') for col in df_master.columns]

        df_static = pd.read_csv(static_path)

        if 'slope_ns' in df_static.columns and 'slope_ew' in df_static.columns:
            print("Calculating 'seabed_slope_to_target' from slope components.")
            df_static['seabed_slope_to_target'] = np.sqrt(
                df_static['slope_ns'].fillna(0)**2 + df_static['slope_ew'].fillna(0)**2
            )
        else:
            print("Warning: Slope components not found. Setting 'seabed_slope_to_target' to 0.")
            df_static['seabed_slope_to_target'] = 0

        print("Datasets loaded and reshaped successfully.")
        return df_master, df_static
    except FileNotFoundError as e:
        print(f"Error loading data: {e}. Please check your file paths.")
        return None, None

# =============================================================================
# Step 2: Feature Engineering Pipeline
# =============================================================================

def create_lag_features(df, lag_range=10):
    """Creates lag features for all predictor columns efficiently."""
    print("\nCreating lag features...")
    predictor_cols = [col for col in df.columns if col.startswith('offshore_')]
    lag_features = pd.concat(
        [df[col].shift(i).rename(f'{col}_lag_{i}h') for col in predictor_cols for i in range(1, lag_range + 1)],
        axis=1
    )
    return pd.concat([df, lag_features], axis=1)

def create_rolling_features(df, windows=[3, 6, 12, 24]):
    """Creates rolling window statistical features efficiently."""
    print("Creating rolling window features...")
    primary_predictors = [col for col in df.columns if '_hs' in col or '_tp' in col]
    rolling_features = []
    for col in primary_predictors:
        for window in windows:
            rolling_window = df[col].rolling(window=window, min_periods=1)
            rolling_features.append(rolling_window.mean().rename(f'{col}_roll_mean_{window}h'))
            rolling_features.append(rolling_window.std().rename(f'{col}_roll_std_{window}h'))
            rolling_features.append(rolling_window.min().rename(f'{col}_roll_min_{window}h'))
            rolling_features.append(rolling_window.max().rename(f'{col}_roll_max_{window}h'))
    return pd.concat([df] + rolling_features, axis=1)

def create_cyclical_features(df):
    """Creates cyclical features from the datetime index."""
    print("Creating cyclical time features...")
    df_cyclical = df.copy()
    df_cyclical['hour_sin'] = np.sin(2 * np.pi * df.index.hour / 23.0)
    df_cyclical['hour_cos'] = np.cos(2 * np.pi * df.index.hour / 23.0)
    df_cyclical['dayofyear_sin'] = np.sin(2 * np.pi * df.index.dayofyear / 365.0)
    df_cyclical['dayofyear_cos'] = np.cos(2 * np.pi * df.index.dayofyear / 365.0)
    return df_cyclical

def create_interaction_features(df, df_static):
    """Creates physics-informed interaction features."""
    print("Creating physics-informed interaction features...")
    df_interact = df.copy()
    offshore_locations = [pid for pid in df_static['point_id'].unique() if 'offshore' in pid]

    for loc_id in offshore_locations:
        static_row = df_static[df_static['point_id'] == loc_id]
        if not static_row.empty:
            depth = static_row['depth'].iloc[0]
            slope = static_row['seabed_slope_to_target'].iloc[0]
            hs_col, tp_col = f'{loc_id}_hs', f'{loc_id}_tp'
            if hs_col in df_interact.columns:
                df_interact[f'{loc_id}_hs_x_depth'] = df_interact[hs_col] * depth
                df_interact[f'{loc_id}_hs_x_seabed_slope'] = df_interact[hs_col] * slope
            if tp_col in df_interact.columns:
                df_interact[f'{loc_id}_tp_x_depth'] = df_interact[tp_col] * depth
                df_interact[f'{loc_id}_tp_x_seabed_slope'] = df_interact[tp_col] * slope
    return df_interact

def feature_engineering_pipeline(df_master, df_static):
    """Main function to run the entire feature engineering pipeline."""
    print("\n--- Starting Feature Engineering Pipeline ---")
    df_eng = create_lag_features(df_master)
    df_eng = create_rolling_features(df_eng)
    df_eng = create_cyclical_features(df_eng)
    df_eng = create_interaction_features(df_eng, df_static)
    print("--- Feature Engineering Pipeline Complete ---")
    return df_eng

# =============================================================================
# Step 3: Final Feature Selection (Optimized v4 - High Speed)
# =============================================================================

def select_features_by_correlation(X, threshold=0.90):
    """Removes highly correlated features as a fast pre-filtering step."""
    print(f"\n--- Starting Correlation-Based Feature Selection (Threshold={threshold}) ---")
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    print(f"Found and removed {len(to_drop)} features based on high correlation.")
    return X.drop(columns=to_drop).columns.tolist()

def feature_selection_pipeline(X, y, correlation_threshold=0.90):
    """
    Runs the full, optimized feature selection pipeline using a fast, model-based final step.
    """
    print("\n--- Starting Full Feature Selection Pipeline ---")

    # 1. Importance-Based Pruning (Quick First Pass)
    lgbm = lgb.LGBMRegressor(random_state=42, force_col_wise=True)
    lgbm.fit(X, y)
    importances = pd.Series(lgbm.feature_importances_, index=X.columns)
    important_features = importances[importances > 0].index.tolist()
    print(f"Stage 1: Importance pruning removed {X.shape[1] - len(important_features)} features.")
    X_important = X[important_features]

    # 2. Fast Correlation-Based Pruning (Aggressive Bulk Removal)
    correlated_pruned_features = select_features_by_correlation(X_important, threshold=correlation_threshold)
    X_correlated_pruned = X_important[correlated_pruned_features]

    # 3. Final Selection using SelectFromModel (Extremely Fast & Robust)
    print(f"\nStage 3: Starting final model-based selection with {X_correlated_pruned.shape[1]} features remaining.")
    selector_model = lgb.LGBMRegressor(random_state=42, force_col_wise=True)
    selector = SelectFromModel(selector_model, prefit=False, threshold='median')
    selector.fit(X_correlated_pruned, y)
    final_features = X_correlated_pruned.columns[selector.get_support()].tolist()

    print(f"\nFinal number of selected features: {len(final_features)}")
    print("--- Full Feature Selection Pipeline Complete ---")
    return final_features

# =============================================================================
# Main Execution Block (Corrected Workflow)
# =============================================================================
if __name__ == '__main__':
    PROJECT_ROOT = '/content/drive/My Drive/Paper_3_New/'
    SPLIT_DATE = '2022-01-01'

    input_dir, output_dir = setup_environment(PROJECT_ROOT)

    if input_dir and output_dir:
        master_file_path = os.path.join(input_dir, 'master_dataset_causally_optimized_v2.csv')
        static_file_path = os.path.join(input_dir, 'static_features.csv')

        df_master, df_static = load_data(master_file_path, static_file_path)

        if df_master is not None and df_static is not None:
            # 1. Create all features (introduces NaNs)
            df_engineered = feature_engineering_pipeline(df_master, df_static)

            # 2. Add the split column BEFORE any data is dropped
            df_engineered['split'] = np.where(df_engineered.index >= pd.to_datetime(SPLIT_DATE), 'OOS', 'Train_Val')

            # 3. Prepare data for feature selection using ONLY Train_Val data
            df_train_val = df_engineered[df_engineered['split'] == 'Train_Val'].copy()

            # 4. Smart NaN Removal: Only drop rows in the training set where the target is missing
            # This is the essential cleaning step before training.
            initial_train_rows = df_train_val.shape[0]
            df_train_val.dropna(subset=['buoy_main_hs'], inplace=True)
            print(f"\nCleaning training data: Removed {initial_train_rows - df_train_val.shape[0]} rows with missing target.")

            target_cols = [col for col in df_train_val.columns if col.startswith('buoy_main_')]
            feature_cols = [col for col in df_train_val.columns if col not in target_cols and col != 'split']

            X_train = df_train_val[feature_cols]
            y_train = df_train_val['buoy_main_hs']

            # 5. Run the selection pipeline on the CLEANED training data only
            final_feature_list = feature_selection_pipeline(X_train, y_train)

            # 6. Create the final dataset using the selected features on the ENTIRE dataframe
            final_cols = final_feature_list + target_cols + ['split']
            df_final = df_engineered[final_cols].copy()
            # 7. Final Cleaning: Drop any remaining NaNs from the whole dataset
            # This primarily removes the initial rows in the OOS set affected by lags/rolling windows.
            initial_total_rows = df_final.shape[0]
            df_final.dropna(inplace=True)
            print(f"\nFinal cleaning: Removed {initial_total_rows - df_final.shape[0]} rows from the combined dataset.")

            print("\n--- Final Dataset Created ---")
            print(f"Shape of final dataset: {df_final.shape}")
            print("Value counts for the final 'split' column:")
            print(df_final['split'].value_counts())

            # 8. Save the final dataset
            output_file_path = os.path.join(output_dir, 'final_engineered_features_v3.csv')
            df_final.to_csv(output_file_path)

            print(f"\n✅ Success! Final engineered dataset saved to:\n{output_file_path}")


Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.
Output directory created at: /content/drive/My Drive/Paper_3_New/Outputs/Feature_Engineering_v1/

Loading and reshaping datasets...
Pivoting master dataset from long to wide format...
Calculating 'seabed_slope_to_target' from slope components.
Datasets loaded and reshaped successfully.

--- Starting Feature Engineering Pipeline ---

Creating lag features...
Creating rolling window features...
Creating cyclical time features...
Creating physics-informed interaction features...
--- Feature Engineering Pipeline Complete ---

Cleaning training data: Removed 1149 rows with missing target.

--- Starting Full Feature Selection Pipeline ---
[LightGBM] [Info] Total Bins 536557
[LightGBM] [Info] Number of data points in the train set: 10539, number of used features: 2114
[LightGBM] [Info] Start training from score 0.396068
Stage 1: Importance pruning removed 829 features.

--- Starting Correlation-Based Feature

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.dropna(inplace=True)



✅ Success! Final engineered dataset saved to:
/content/drive/My Drive/Paper_3_New/Outputs/Feature_Engineering_v1/final_engineered_features_v3.csv
