In [1]:
import xarray as xr
import pandas as pd
import glob
import os
import numpy as np
import time
import logging
from shapely.geometry import Polygon, Point
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Set up logging for better debugging and monitoring progress
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Configuration (Your specified path) ---
folder_path = r"D:\Projects\hackaton\nasa\final_2 pm data\SWOT_SSH"
# ----------------------------------------

# --- Polygon Check Function ---
def is_point_in_polygon(lat, lon, polygon_coords):
    """Check if a given point (lat, lon) is inside the polygon."""
    polygon = Polygon(polygon_coords)
    point = Point(lon, lat)
    return polygon.contains(point)

# --- Season Filter Function ---
def filter_by_season(df, season):
    """Filter the DataFrame based on the season."""
    if season == 'summer':
        return df[df['time'].dt.month.isin([6, 7, 8])]
    elif season == 'winter':
        return df[df['time'].dt.month.isin([12, 1, 2])]
    else:
        logging.warning(f"Season {season} not recognized. Returning full dataset.")
        return df

# --- Validation and File Processing ---
def is_netcdf_file_valid(file_path):
    """Checks if a single NetCDF file is readable without errors."""
    try:
        with xr.open_dataset(file_path, engine='netcdf4', decode_timedelta=False) as ds:
            ds.close()
        return True
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {e}")
        return False

def get_valid_files(folder_path):
    all_nc_files = glob.glob(os.path.join(folder_path, "*.nc"))
    logging.info(f"Starting validation for {len(all_nc_files)} files...")

    valid_nc_files = []
    start_time = time.time()
    for i, file in enumerate(all_nc_files):
        if is_netcdf_file_valid(file):
            valid_nc_files.append(file)
        if i % 100 == 0:  # Print progress every 100 files
            logging.info(f"Validated {i+1} files...")

    logging.info(f"Validation took {time.time() - start_time:.2f} seconds")
    logging.info(f"âœ… Found {len(valid_nc_files)} valid files.")
    return valid_nc_files

# --- STEP 1: Load Valid Files Safely ---
# Run the validation to get the file list
valid_nc_files = get_valid_files(folder_path) 
valid_nc_files.sort()  # Ensure chronological order

logging.info("\n1. Combining valid files using 'by_coords' (safe method)...")

# --- Safely open NetCDF files with error handling ---
try:
    # Use the more general and safer 'by_coords' method for merging time series data
    shark_data_ds = xr.open_mfdataset(
        valid_nc_files, 
        combine='by_coords',        # Recommended method for geospatial time series
        chunks={'time': 10},         # Increase chunk size slightly to speed up computation
        decode_timedelta=False,      # Suppress warnings
        parallel=False               # Disable parallelism for simplicity
    )
    logging.info("âœ… Datasets successfully loaded into a virtual Dask array.")
except Exception as e:
    logging.error(f"ðŸ›‘ FATAL ERROR during open_mfdataset: {e}")
    shark_data_ds = None

# Check if the loading was successful
if shark_data_ds is None:
    logging.error("\nðŸ›‘ Cannot proceed to ML training without the combined dataset.")
else:
    # ----------------------------------------------------------------------
    # --- STEP 2: ML DATA PREPARATION (Using the same simulation structure) ---
    # ----------------------------------------------------------------------

    logging.info("\n2. Simulating shark observation data (REPLACE WITH YOUR REAL DATA)...")
    
    # Simulate a small dataset of shark observations
    num_obs = 1000
    shark_observations_df = pd.DataFrame({
        'time': pd.to_datetime('2023-01-01') + pd.to_timedelta(np.random.randint(0, 365, num_obs), unit='D'),
        'lat': np.random.uniform(30, 35, num_obs),  # Target region from preview slice
        'lon': np.random.uniform(-80, -75, num_obs),
        'shark_present': np.random.randint(0, 2, num_obs) 
    })
    
    # --- Step 2.1: Filter by Polygon Input ---
    polygon_coords = [(30, -80), (35, -80), (35, -75), (30, -75)]  # Example polygon (lat, lon)
    logging.info(f"Filtering data using polygon with coordinates: {polygon_coords}")

    shark_observations_df = shark_observations_df[
        shark_observations_df.apply(lambda row: is_point_in_polygon(row['lat'], row['lon'], polygon_coords), axis=1)
    ]
    logging.info(f"Filtered data to {len(shark_observations_df)} points inside the polygon.")
    
    # --- Step 2.2: Filter by Season Input ---
    season = 'summer'  # Example season input
    shark_observations_df = filter_by_season(shark_observations_df, season)
    logging.info(f"Filtered data to {len(shark_observations_df)} points for season: {season}")
    
    # 2.1 Extract Features by Merging
    logging.info("3. Extracting features at observation points...")
    
    # Round coordinates for nearest neighbor matching
    obs_df = shark_observations_df.copy()
    obs_df['lat_rounded'] = obs_df['lat'].round(2)
    obs_df['lon_rounded'] = obs_df['lon'].round(2)

    # Use xarray's nearest neighbor selection
    merged_features_ds = shark_data_ds.sel(
        time=obs_df.time, 
        lat=obs_df.lat_rounded, 
        lon=obs_df.lon_rounded, 
        method='nearest'
    )
    
    # Convert the selected features to a DataFrame
    training_df = merged_features_ds.to_dataframe().reset_index()
    training_df = training_df.merge(
        obs_df.drop(columns=['lat_rounded', 'lon_rounded']),
        on=['time'], how='inner'
    )
    training_df = training_df.dropna(subset=['analyzed_sst'])  # Drop rows with missing 'analyzed_sst'

    logging.info("4. Final Training Data Head:")
    logging.info(f"{training_df.head()}")
    
    # ----------------------------------------------------------------------
    # --- STEP 3: TRAIN XGBOOST CLASSIFIER ---
    # ----------------------------------------------------------------------
    
    logging.info("\n5. Training XGBoost Classifier...")

    # Feature Engineering
    training_df['day_of_year'] = training_df['time'].dt.dayofyear
    training_df['month'] = training_df['time'].dt.month

    feature_cols = ['lat', 'lon', 'day_of_year', 'month', 'analyzed_sst', 'sea_ice_fraction']
    target_col = 'shark_present'

    # Check if 'sea_ice_fraction' column exists, if not, create it
    if 'sea_ice_fraction' not in training_df.columns:
        training_df['sea_ice_fraction'] = 0  # or np.nan, depending on how you'd handle missing data

    X = training_df[feature_cols]
    Y = training_df[target_col]

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    xgb_model = XGBClassifier(
        objective='binary:logistic',
        n_estimators=100,
        learning_rate=0.1,
        use_label_encoder=False, 
        eval_metric='logloss',
        random_state=42
    )

    xgb_model.fit(X_train, Y_train)

    # ----------------------------------------------------------------------
    # --- STEP 4: EVALUATE MODEL ---
    # ----------------------------------------------------------------------

    Y_pred = xgb_model.predict(X_test)
   


2025-10-05 02:46:26,154 - INFO - Starting validation for 500 files...
2025-10-05 02:46:28,058 - INFO - Validated 1 files...
2025-10-05 02:48:08,680 - INFO - Validated 101 files...
2025-10-05 02:49:41,282 - INFO - Validated 201 files...
2025-10-05 02:51:09,664 - INFO - Validated 301 files...
2025-10-05 02:52:27,454 - INFO - Validated 401 files...
2025-10-05 02:53:46,309 - INFO - Validation took 440.15 seconds
2025-10-05 02:53:46,312 - INFO - âœ… Found 500 valid files.
2025-10-05 02:53:46,314 - INFO - 
1. Combining valid files using 'by_coords' (safe method)...
2025-10-05 03:03:00,006 - ERROR - ðŸ›‘ FATAL ERROR during open_mfdataset: Could not find any dimension coordinates to use to order the datasets for concatenation
2025-10-05 03:03:01,112 - ERROR - 
ðŸ›‘ Cannot proceed to ML training without the combined dataset.
