In [27]:
import xarray as xr
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.ndimage import label

# 1. Downloading the ERA5 sea surface temperature (SST) daily statistics data from the ECMWF CDS.
> **Note** The code below ran successfully, but because each request is issued inside a `for` loop, the next request is not submitted until the previous one has returned its data. Because I got stuck in the CDS request queue, I manually submitted the data requests for roughly 1960-2025 by adjusting the dates and re-running the code block.

```python
import cdsapi
import time
import os

# Client used to talk to the CDS API.
client = cdsapi.Client(wait_until_complete=False)

dataset = "derived-era5-single-levels-daily-statistics"

# Issue one request per year, 1940 through 2024 (range() excludes its upper bound).
for year in range(1940, 2025):
    print(f"Requesting data for {year}...")

    request = {
        "product_type": "reanalysis",
        "variable": ["sea_surface_temperature"],
        "year": str(year),
        # All twelve months and all 31 possible days, as zero-padded strings.
        "month": [f"{m:02d}" for m in range(1, 13)],
        "day": [f"{d:02d}" for d in range(1, 32)],
        "daily_statistic": "daily_mean",
        "time_zone": "utc-07:00",
        "frequency": "1_hourly",
        # Bounding box; presumably [north, west, south, east] in degrees — confirm against the CDS docs.
        "area": [5, -170, -5, -120],
    }

    # Each year is saved to its own NetCDF file.
    filename = f"/Users/julianesler/Documents/Columbia/CWC_Research/data_download/Reanalysis/ERA5_sst_dat{year}.nc"
    client.retrieve(dataset, request).download(filename)


```python
# Define dataset and output path
dataset = "derived-era5-single-levels-daily-statistics"
outdir = "/Users/julianesler/Documents/Columbia/CWC_Research/data_download/Reanalysis/"
os.makedirs(outdir, exist_ok=True)  # create the output directory if it does not exist yet

# Submit one request per year for 1969–2024 and keep the object returned by
# client.retrieve() together with the file path it should be saved to.
# NOTE(review): this block relies on `client` and `os` from the previous code block.
requests = {}  # year -> {"request": <object returned by client.retrieve>, "filename": <target path>}
for year in range(1969, 2025):  # Skip 1940–1968
    print(f"Submitting request for {year}...")

    request_params = {
        "product_type": "reanalysis",
        "variable": ["sea_surface_temperature"],
        "year": str(year),
        "month": [f"{m:02d}" for m in range(1, 13)],  # "01" .. "12"
        "day": [f"{d:02d}" for d in range(1, 32)],  # "01" .. "31"
        "daily_statistic": "daily_mean",
        "time_zone": "utc-07:00",
        "frequency": "1_hourly",
        "area": [5, -170, -5, -120],
        "format": "netcdf",
    }

    filename = os.path.join(outdir, f"ERA5_sst_dat{year}.nc")
    result = client.retrieve(dataset, request_params)
    requests[year] = {"request": result, "filename": filename}

print(f"\n⏳ Tracking {len(requests)} asynchronous requests...")
# NOTE(review): `sleep_interval` and `remaining` are set up for a polling loop,
# but no such loop appears in this block, so nothing here waits for or downloads
# the submitted requests — TODO confirm whether the polling/download code was lost.
sleep_interval = 60  # seconds between polling
remaining = set(requests.keys())  # years whose requests have not been handled yet

# 2. Concatenating the downloaded netcdfs into a single `.nc` file
> **Note** I will use the CDO command line tool for concatenating these files because it is fast and efficient.

```bash
# Build the list of yearly input files (1940.nc ... 2025.nc inside ERA5_sst_dat/) with seq + sed,
# then merge them with `cdo mergetime` into sst_day-stat.nc (relative path, so it lands in the working directory).
!cdo mergetime $(seq 1940 2025 | sed 's|^|/Users/julianesler/Documents/Columbia/CWC_Research/signal-extraction/signal-extraction-data/Reanalysis/ERA5_sst_dat/|;s|$|.nc|') sst_day-stat.nc

# 3. Creating monthly mean dataset

In [28]:
# Open the merged daily-statistics file and display the dataset summary.
sstdat = xr.open_dataset("../signal-extraction-data/Reanalysis/sst_day-stat.nc")
sstdat

In [None]:
# Label every time step with its calendar month and year so the data can be
# grouped on both at once.
sstdat = sstdat.assign_coords(
    {
        "month": sstdat.valid_time.dt.month,
        "year": sstdat.valid_time.dt.year,
    }
)

# Average over `valid_time` within each (month, year) pair; the result carries
# separate `month` and `year` dimensions in place of `valid_time`.
sstmean = sstdat.groupby(["month", "year"]).mean(dim="valid_time")

In [40]:
# Dimension sizes of the monthly-mean SST array.
sstmean['sst'].shape

(41, 201, 12, 86)

```python
# Write the monthly-mean dataset to disk as NetCDF.
sstmean.to_netcdf("../signal-extraction-data/Reanalysis/sst_mon-mean.nc")

In [32]:
# Text summary (dimensions, coordinates, variables, attributes) of the monthly-mean dataset.
print(sstmean)

<xarray.Dataset> Size: 34MB
Dimensions:    (latitude: 41, longitude: 201, month: 12, year: 86)
Coordinates:
  * longitude  (longitude) float64 2kB -170.0 -169.8 -169.5 ... -120.2 -120.0
  * latitude   (latitude) float64 328B 5.0 4.75 4.5 4.25 ... -4.5 -4.75 -5.0
  * month      (month) int64 96B 1 2 3 4 5 6 7 8 9 10 11 12
  * year       (year) int64 688B 1940 1941 1942 1943 ... 2022 2023 2024 2025
Data variables:
    sst        (latitude, longitude, month, year) float32 34MB 302.0 ... nan
Attributes:
    CDI:                     Climate Data Interface version 2.5.1 (https://mp...
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    history:                 Fri Jul 18 12:46:02 2025: cdo mergetime /Users/j...
    CDO:                     Climate Data Operators version 2.5.1 (https://mp...


In [42]:
# Collapse the separate `year` and `month` dimensions of the monthly-mean SST
# into a single stacked `time` dimension.
sstmean_flat = sstmean["sst"].stack(time=("year", "month"))

# `time` now holds 86 years * 12 months = 1032 steps.
print(sstmean_flat)

# Expected dimension order after stacking: (latitude, longitude, time).
print(sstmean_flat.shape)

<xarray.DataArray 'sst' (latitude: 41, longitude: 201, time: 1032)> Size: 34MB
array([[[301.96796, 301.76306, 301.8292 , ...,       nan,       nan,
               nan],
        [301.80792, 301.59283, 301.6504 , ...,       nan,       nan,
               nan],
        [301.63962, 301.4226 , 301.46548, ...,       nan,       nan,
               nan],
        ...,
        [300.57486, 300.90677, 301.7575 , ...,       nan,       nan,
               nan],
        [300.44327, 300.8384 , 301.66257, ...,       nan,       nan,
               nan],
        [300.29755, 300.75476, 301.55093, ...,       nan,       nan,
               nan]],

       [[301.86102, 301.62265, 301.6198 , ...,       nan,       nan,
               nan],
        [301.70673, 301.4561 , 301.4453 , ...,       nan,       nan,
               nan],
        [301.60437, 301.3393 , 301.35086, ...,       nan,       nan,
               nan],
...
        [299.05078, 299.89743, 301.23816, ...,       nan,       nan,
               nan],
  

In [44]:
# Reset the index on the stacked `time` dimension (created from `year` and `month`).
# NOTE(review): presumably needed so the array can be written with to_netcdf — confirm.
# NOTE(review): this rebinds `sstmean_flat` to its own result, so re-running the cell
# operates on the already-reset array.
sstmean_flat = sstmean_flat.reset_index('time')

In [46]:
# Inspect the stacked array after the index reset.
print(sstmean_flat)

<xarray.DataArray 'sst' (latitude: 41, longitude: 201, time: 1032)> Size: 34MB
array([[[301.96796, 301.76306, 301.8292 , ...,       nan,       nan,
               nan],
        [301.80792, 301.59283, 301.6504 , ...,       nan,       nan,
               nan],
        [301.63962, 301.4226 , 301.46548, ...,       nan,       nan,
               nan],
        ...,
        [300.57486, 300.90677, 301.7575 , ...,       nan,       nan,
               nan],
        [300.44327, 300.8384 , 301.66257, ...,       nan,       nan,
               nan],
        [300.29755, 300.75476, 301.55093, ...,       nan,       nan,
               nan]],

       [[301.86102, 301.62265, 301.6198 , ...,       nan,       nan,
               nan],
        [301.70673, 301.4561 , 301.4453 , ...,       nan,       nan,
               nan],
        [301.60437, 301.3393 , 301.35086, ...,       nan,       nan,
               nan],
...
        [299.05078, 299.89743, 301.23816, ...,       nan,       nan,
               nan],
  

```python
# Write the stacked (latitude, longitude, time) SST array to disk as NetCDF.
sstmean_flat.to_netcdf("../signal-extraction-data/Reanalysis/sst_flat.nc")

In [22]:
# Open the merged daily-statistics file and display it.
# NOTE(review): this is the same file that is loaded as `sstdat` elsewhere in the notebook;
# the name `sstat` may be a typo — confirm which name is intended.
sstat = xr.open_dataset("../signal-extraction-data/Reanalysis/sst_day-stat.nc")
sstat

In [33]:
# Load the saved monthly-mean file from disk (rebinding `sstmean`) and display it.
sstmean = xr.open_dataset("../signal-extraction-data/Reanalysis/sst_mon-mean.nc")
sstmean

In [25]:
# Largest value of the `sst` variable in the monthly-mean dataset.
sstmean['sst'].max()