Authors: Jakidxav, Negin Sobhani
    
This notebook creates the data labels for a single station. First, we read in the data and subset it to the correct years and calendar days. Then we create a temperature anomaly column and label each day according to whether its anomaly is greater than or equal to a cutoff value that we specify. The labels for the training, development, and validation sets are all produced in one go.

In [0]:
import netCDF4 as nc  
import numpy as np
import datetime
import pandas as pd
import datetime as dt

import matplotlib.pylab as plt
%matplotlib inline

import pickle

In [0]:
#folder containing one processed GHCND csv file per station
ghcnd_csv_dir = '/glade/work/ddvento/ML/McKinnon_data/ghcnd/ghcnd_all_csv/'

#first and last year (both inclusive) covered by our analysis
start_year, end_year = 1982, 2015

#first and last day of year (both inclusive); the days mckinnon used in her paper
start_doy, end_doy = 175, 234

#a day is classified as hot when its temperature anomaly is >= this value,
#and as not-hot otherwise
cut_off = 6.5

#station identifier, found at: https://www.ncdc.noaa.gov/
station_id = 'USC00391621' #station10 chamberlain 5 s, south dakota (136)

#years held out for the development and validation sets;
#every other year goes into the training set
dev_nino_list = [1983, 1990, 1995, 2008]
val_nino_list = [1988, 1994, 1999, 2003]

In [0]:
def get_ghcnd_stn(ghcnd_csv_dir, stn_id, start_year, end_year, start_doy, end_doy):
    '''
    get_ghcnd_stn:
        *Opens the csv file for the GHCND station as a DataFrame
        *Adds a datetime column ('date') built from the YYYY, MM and DD columns
        *Adds a day-of-year column ('jday')
        *Keeps only the rows whose year lies in [start_year, end_year]
        *Keeps only the rows whose day of year lies in [start_doy, end_doy]
         (e.g. 60 days of summer)

    ----------
    Parameters:
        ghcnd_csv_dir --- path to GHCN processed csv files
                          (with or without a trailing separator).
        stn_id --- GHCN station ID based on GHCN readme.txt
            (ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt);
            the file read is <ghcnd_csv_dir>/<stn_id>.csv

        start_year --- first year of the period to slice (inclusive).
        end_year   --- last year of the period to slice (inclusive).

        start_doy  --- first day of year of the period (inclusive).
        end_doy    --- last day of year of the period (inclusive).
    -------
    Returns:
        stn_data --- a new DataFrame (an independent copy, so that columns
                     can be added to it later without pandas warnings)
                     holding the selected rows of the csv file plus the
                     'date' and 'jday' columns. Values of -9999 in the
                     csv file are read as NaN.

    -------
    Example:
        stn_data = get_ghcnd_stn(ghcnd_csv_dir,stn_id,1982,
                    2015, 30, 90)

    '''
    #local import: os is not imported at the top of this notebook
    import os

    #build the file name so that we can read in the file using pandas
    stn_csv = os.path.join(ghcnd_csv_dir, stn_id+'.csv')

    #read in file, treating -9999 as a missing value
    stn_raw = pd.read_csv(stn_csv, na_values=-9999)

    #assemble datetimes directly from the year/month/day columns
    #so that we can extract the day of the year
    ymd = stn_raw[['YYYY', 'MM', 'DD']].rename(
        columns={'YYYY': 'year', 'MM': 'month', 'DD': 'day'})
    stn_raw['date'] = pd.to_datetime(ymd)
    stn_raw['jday'] = stn_raw['date'].dt.dayofyear.astype(int)

    #subset data based on years and calendar days (all bounds inclusive)
    in_years = (stn_raw['YYYY'] >= start_year) & (stn_raw['YYYY'] <= end_year)
    in_days = (stn_raw['jday'] >= start_doy) & (stn_raw['jday'] <= end_doy)

    #return a copy rather than a slice of stn_raw
    return stn_raw[in_years & in_days].copy()

In [0]:
def calc_stn_anom(stn_data, var):
    '''
    calc_stn_anom :
        *Calculates the anomalies of the selected variable for the station.
         The anomaly of a row is its value minus the mean of that variable
         over every row of stn_data with the same calendar day (MM, DD).
    ----------
    Parameters:
        stn_data --- DataFrame with 'MM' and 'DD' columns and a column
                     named after var.
        var --- Name of the variable to calculate anomalies on :
                e.g. TMAX, TMIN, PRCP
    -------
    Returns:
        stn_data --- a new DataFrame equal to the input plus one extra
                     column called var+"ANOM" (e.g. TMAXANOM). The input
                     DataFrame itself is left unchanged.

    -------
    Example:
        stn_data = calc_stn_anom (stn_data, 'TMAX')
    '''
    #name of the anomaly column that is added to the dataframe
    var_anom = var+"ANOM"

    #per-calendar-day mean, broadcast back onto every row by transform
    means = stn_data.groupby(['MM', 'DD'])[var].transform('mean')

    #assign builds a new frame, so a sliced input is never written to
    return stn_data.assign(**{var_anom: stn_data[var] - means})

In [0]:
def find_hot_days(stn_data, cut_off, anom_col='TMAXANOM'):
    '''
    find_hot_days:
        * Find the hot days (or extreme events) based on a cut off value.
        * Store this flag ('HOT') as a column of the returned DataFrame.
    ----------
    Parameters:
        stn_data --- DataFrame holding the anomaly column.
        cut_off --- cut off value for the extreme events; a day is hot
                    when its anomaly is greater than or equal to it.
        anom_col --- name of the anomaly column to compare against the
                     cut off (defaults to 'TMAXANOM').
    -------
    Returns:
        stn_data --- a new DataFrame equal to the input plus a 'HOT'
                     column holding 1 for hot days and 0 otherwise. The
                     input DataFrame itself is left unchanged.

    -------
    Example:
        stn_data = find_hot_days (stn_data, cut_off)

    '''
    #label of 1 if the anomaly reaches the cutoff, 0 if it is below it;
    #a missing (NaN) anomaly compares False and therefore also gets 0
    hot = np.where(stn_data[anom_col] >= cut_off, 1, 0)

    #assign builds a new frame, so a sliced input is never written to
    return stn_data.assign(HOT=hot)

In [0]:
#read the station file and keep only the
#years and calendar days that we configured
station_data = get_ghcnd_stn(
    ghcnd_csv_dir=ghcnd_csv_dir,
    stn_id=station_id,
    start_year=start_year,
    end_year=end_year,
    start_doy=start_doy,
    end_doy=end_doy,
)

#preview the first rows
station_data.head()

Unnamed: 0,YYYY,MM,DD,TMAX,TMIN,PRCP,SNOW,SNWD,date,jday
631,1982,6,24,32.2,15.6,0.0,0.0,0.0,1982-06-24,175
632,1982,6,25,20.6,9.4,1.3,0.0,0.0,1982-06-25,176
633,1982,6,26,23.3,,0.0,0.0,0.0,1982-06-26,177
634,1982,6,27,27.2,12.8,0.0,0.0,0.0,1982-06-27,178
635,1982,6,28,31.7,15.6,0.0,0.0,0.0,1982-06-28,179


In [0]:
#add the anomaly column for the maximum temperature variable
station_data2 = calc_stn_anom(stn_data=station_data, var='TMAX')

#preview the first rows, now including the anomaly column
station_data2.head()

Unnamed: 0,YYYY,MM,DD,TMAX,TMIN,PRCP,SNOW,SNWD,date,jday,TMAXANOM
631,1982,6,24,32.2,15.6,0.0,0.0,0.0,1982-06-24,175,2.938235
632,1982,6,25,20.6,9.4,1.3,0.0,0.0,1982-06-25,176,-8.567647
633,1982,6,26,23.3,,0.0,0.0,0.0,1982-06-26,177,-6.014706
634,1982,6,27,27.2,12.8,0.0,0.0,0.0,1982-06-27,178,-1.920588
635,1982,6,28,31.7,15.6,0.0,0.0,0.0,1982-06-28,179,2.338235


In [0]:
#label every day as hot or not-hot by comparing its anomaly to our cutoff value
station_data3 = find_hot_days(stn_data=station_data2, cut_off=cut_off)

#list the distinct label values that occur
print(station_data3['HOT'].unique())

[0 1]


In [0]:
#number of days whose HOT label is nonzero (i.e. days labelled hot)
print(np.count_nonzero(station_data3.HOT))

136


In [0]:
#total number of labelled days (one label per row)
print(station_data3.HOT.shape)

(2040,)


In [0]:
#subset for development set: take the rows of each year in dev_nino_list
#(rather than repeating the years here) and concatenate them in list order
hot_dev = pd.concat([station_data3[station_data3.YYYY == year] for year in dev_nino_list])

print(hot_dev)

       YYYY  MM  DD  TMAX  TMIN  PRCP  SNOW  SNWD       date  jday   TMAXANOM  \
965    1983   6  24  32.2  17.2   0.0   0.0   0.0 1983-06-24   175   2.938235   
966    1983   6  25  33.3  18.9   0.0   0.0   0.0 1983-06-25   176   4.132353   
967    1983   6  26  32.8  18.9   0.0   0.0   0.0 1983-06-26   177   3.485294   
968    1983   6  27  27.2  17.2   0.0   0.0   0.0 1983-06-27   178  -1.920588   
969    1983   6  28  22.8  13.9  21.8   0.0   0.0 1983-06-28   179  -6.561765   
970    1983   6  29  16.7  13.9   8.9   0.0   0.0 1983-06-29   180 -13.011765   
971    1983   6  30  25.6  15.0   0.8   0.0   0.0 1983-06-30   181  -3.138235   
972    1983   7   1  30.6  13.9  44.2   0.0   0.0 1983-07-01   182   1.324242   
973    1983   7   2  30.0  14.4   0.3   0.0   0.0 1983-07-02   183   1.030303   
974    1983   7   3  32.8  18.9   3.8   0.0   0.0 1983-07-03   184   2.640625   
975    1983   7   4  21.1  11.1   1.3   0.0   0.0 1983-07-04   185  -9.515152   
976    1983   7   5  23.9  1

In [0]:
#pull the development set target labels out as a numpy array
Y_dev = hot_dev['HOT'].values

#one label per development-set day
print(Y_dev.shape)

(240,)


In [0]:
#subset for validation set: take the rows of each year in val_nino_list
#(rather than repeating the years here) and concatenate them in list order
hot_val = pd.concat([station_data3[station_data3.YYYY == year] for year in val_nino_list])

#get labels so that we can save them
Y_val = hot_val.HOT.values

print(Y_val.shape)

(240,)


In [0]:
#years that are present in the labelled data
print(station_data3['YYYY'].unique())

#first rows of the labelled data, including the HOT column
print(station_data3.head())

[1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
 2010 2011 2012 2013 2014 2015]
     YYYY  MM  DD  TMAX  TMIN  PRCP  SNOW  SNWD       date  jday  TMAXANOM  \
631  1982   6  24  32.2  15.6   0.0   0.0   0.0 1982-06-24   175  2.938235   
632  1982   6  25  20.6   9.4   1.3   0.0   0.0 1982-06-25   176 -8.567647   
633  1982   6  26  23.3   NaN   0.0   0.0   0.0 1982-06-26   177 -6.014706   
634  1982   6  27  27.2  12.8   0.0   0.0   0.0 1982-06-27   178 -1.920588   
635  1982   6  28  31.7  15.6   0.0   0.0   0.0 1982-06-28   179  2.338235   

     HOT  
631    0  
632    0  
633    0  
634    0  
635    0  


In [0]:
#the years in the dev and val sets, taken from the configured lists
list_years = sorted(dev_nino_list + val_nino_list)

#every year not in the dev or val sets should be in the training set;
#the result gets its own name so that station_data3 keeps every year
#and the dev/val cells can still be re-run after this one
hot_train = station_data3[~station_data3.YYYY.isin(list_years)]

Y_train = hot_train.HOT.values

print(Y_train.shape)

(1560,)


In [0]:
#os is needed to create the output directories and is not imported
#at the top of this notebook
import os

#NOTE(review): these paths say station11 while the comment next to
#station_id says station10 -- confirm which station number is intended

#file that holds the train labels for a particular lead time
Y_train_filename = '/glade/work/jakidxav/IPython/20_lead/Y_train/station11/Y_train.txt'

#file that holds the dev labels for a particular lead time
Y_dev_filename = '/glade/work/jakidxav/IPython/20_lead/Y_dev/station11/Y_dev.txt'

#file that holds the validation labels for a particular lead time
Y_val_filename = '/glade/work/jakidxav/IPython/20_lead/Y_val/station11/Y_val.txt'

label_files = (
    (Y_train_filename, Y_train),
    (Y_dev_filename, Y_dev),
    (Y_val_filename, Y_val),
)

for label_filename, labels in label_files:
    #create the directory (and any missing parents) to save output to
    #if it doesn't already exist; an existing directory is not an error
    os.makedirs(os.path.dirname(label_filename), exist_ok=True)

    #the labels are pickled, so the file is binary despite the .txt name
    with open(label_filename, 'wb') as f:
        pickle.dump(labels, f)

In [0]:
#show the validation labels that were saved (1 = hot, 0 = not-hot)
print(Y_val)

[0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0]


In [0]:
#number of validation labels
print(Y_val.shape)

(240,)
