Authors: Negin Sobhani, Jakidxav

## Introduction

This notebook shows how to read sea surface temperature (SST) anomaly data from NetCDF files, subset it by time and by latitude/longitude, replace missing values, and save the resulting arrays to disk, all in Python.

In [1]:
import netCDF4 as nc
import pickle

import numpy as np
import datetime as dt
import pandas as pd

import matplotlib.pylab as plt
import matplotlib.cm as cm
%matplotlib inline
import cmocean

In [2]:
# ---- data location ----
# directory that holds the yearly files named sst.day.anom.<year>.nc
sst_dir = '/Users/jakidxav/Desktop/Testing/'

# ---- temporal selection ----
# first and last year to load (both are included by the loading loop)
start_year, end_year = 1982, 2015

# day-of-year window we care about; specified in mckinnon's paper
start_doy, end_doy = 175, 234

# lead time, in days; the day-of-year window is shifted earlier by this amount
lead_time = 30

# ---- spatial selection ----
# region of interest as [min, max] latitude and [min, max] longitude
lat_lims, lon_lims = [20., 50.], [145., 230.]

# ---- labelling ----
# cutoff for choosing whether the day was anomalously hot or not
cut_off = 6.5

# ---- containers for the train, dev, and val sets (one array per year) ----
sst_train_list, sst_dev_list, sst_val_list = [], [], []

# years held out for the dev and val sets; every other year goes to train
# El Nino reference: https://www.esrl.noaa.gov/psd/enso/past_events.html
# NOTE(review): the original comment said each held-out set includes
# "2 el nino years, 1 non-el nino years", but each list holds four years --
# confirm the intended composition.
dev_nino_list = [1983, 1990, 1995, 2008]
val_nino_list = [1988, 1994, 1999, 2003]

In [3]:
def process_sst_data(sst_dir, yr, start_doy, end_doy, lead_time, lat_lims, lon_lims):
    '''
    Load one year of daily SST anomalies and subset it in time and space.

    * Reads the file ``sst_dir + "sst.day.anom.<yr>.nc"``
    * Keeps the days whose day-of-year lies in
      [start_doy - lead_time, end_doy - lead_time] (both ends inclusive)
    * Keeps the grid points whose latitude and longitude lie inside
      lat_lims and lon_lims (both ends inclusive)

    Parameters
    ----------
        sst_dir   --- directory prefix; it is concatenated with the file name
                      as-is, so it must end with a path separator
        yr        --- year to load (used to build the file name)
        start_doy --- first day of year of the target window
        end_doy   --- last day of year of the target window
        lead_time --- number of days by which the window is shifted earlier
        lat_lims  --- [lat_min, lat_max]
        lon_lims  --- [lon_min, lon_max]

    Returns
    -------
        sst_year --- the 'anom' variable restricted to the selected days,
                     latitudes and longitudes, indexed as (time, lat, lon)

    Example
    -------
    lat_lims = [20.,50.]
    lon_lims = [145.,230.]
    sst_year = process_sst_data(sst_dir, 1982, 175, 234, 30, lat_lims, lon_lims)
    '''
    sst_name = "sst.day.anom." + str(yr) + ".nc"

    # Read everything we need inside a `with` block so the file is closed
    # even if a read fails; `[:]` copies the data into memory, so the arrays
    # stay usable after the file is closed.
    with nc.Dataset(sst_dir + sst_name) as f:
        anom = f.variables['anom'][:]
        lon = f.variables['lon'][:]
        lat = f.variables['lat'][:]
        dumb_time = f.variables['time'][:]

    # the time axis is interpreted as days since 1800-01-01
    time = pd.to_datetime(dumb_time, unit='D',
                          origin=pd.Timestamp('1800-01-01'))

    # day of the year (1-based) for every time step
    jday = time.dayofyear

    # boolean masks for the latitude/longitude box
    latidx1 = (lat >= lat_lims[0]) & (lat <= lat_lims[1])
    lonidx1 = (lon >= lon_lims[0]) & (lon <= lon_lims[1])

    # boolean mask for the day-of-year window, shifted earlier by lead_time
    timidx1 = (jday >= start_doy - lead_time) & (jday <= end_doy - lead_time)

    # subset latitude (axis 1) and longitude (last axis) in two steps so each
    # boolean mask is applied to its own axis
    ocean_anom = anom[:, latidx1][..., lonidx1]

    # subset time (axis 0)
    sst_year = ocean_anom[timidx1, :, :]

    return sst_year

In [None]:
# Read SST anomalies for the selected years, one file per year, and route each
# year's array into the train, dev, or val list.

# Empty the lists first so that re-running this cell does not append a second
# copy of every year.
for split_list in (sst_train_list, sst_dev_list, sst_val_list):
    split_list.clear()

for yr in range(start_year, end_year + 1):

    print('------------------------------------------------------------------')
    print("Processing SST data for Year : " + str(yr))

    sst_year = process_sst_data(sst_dir, yr, start_doy, end_doy, lead_time, lat_lims, lon_lims)

    # A year listed in dev_nino_list goes to dev (dev wins if a year is in
    # both lists); a year listed in val_nino_list goes to val; every other
    # year goes to train.
    if yr in dev_nino_list:
        sst_dev_list.append(sst_year)
    elif yr in val_nino_list:
        sst_val_list.append(sst_year)
    else:
        sst_train_list.append(sst_year)

In [None]:
# # let's look at what one of our arrays looks like
# plt.imshow(sst_train_list[0][0,:,:], cmap=cmocean.cm.balance)
# plt.colorbar()
# plt.show()

In [8]:
#stack the per-year arrays of each split into one array and replace the
#missing values with 0; done for each of the train, dev, and val sets
def _stack_and_zero_missing(year_arrays):
    '''Stack a list of per-year arrays and set every value below -1e+35 to 0.'''
    stacked = np.array(year_arrays)
    #NOTE(review): values below -1e+35 are treated as missing -- presumably
    #the NetCDF fill value (about -9.97e+36); confirm against the data files
    is_missing = stacked < -1e+35
    stacked[is_missing] = 0
    return stacked

X_train = _stack_and_zero_missing(sst_train_list)
X_dev = _stack_and_zero_missing(sst_dev_list)
X_val = _stack_and_zero_missing(sst_val_list)

-9.96921e+36
-7.7


In [78]:
#save arrays to disk with pickle
#the lead time is part of the directory name, so each lead time gets its own folder
#note: the files are pickles even though the names end in .txt; the names are
#kept unchanged so existing loaders keep working
#uses lead_time from the configuration cell (this cell previously referenced
#an undefined name, leadtime, which raises NameError on a fresh kernel)
X_train_filename = '/glade/work/jakidxav/IPython/X/{}_lead/X_train/X_train.txt'.format(lead_time)
X_dev_filename = '/glade/work/jakidxav/IPython/X/{}_lead/X_dev/X_dev.txt'.format(lead_time)
X_val_filename = '/glade/work/jakidxav/IPython/X/{}_lead/X_val/X_val.txt'.format(lead_time)

#the target directories must already exist; open() does not create them
for out_path, split_array in (
    (X_train_filename, X_train),
    (X_dev_filename, X_dev),
    (X_val_filename, X_val),
):
    with open(out_path, 'wb') as f:
        pickle.dump(split_array, f)