In [1]:
# Import Packages
import os
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
from datetime import datetime, date
import boto3

In [2]:
# Change Directory
# `os` is already imported in the first cell. The guard keeps this cell
# re-runnable: a second relative chdir('data/') from inside data/ would look
# for data/data/ and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('data/')
os.listdir()

['gage_discharge_lat_lon.csv',
 'SN_SWE_WY2015.h5',
 'SN_SWE_WY1985.h5',
 '.ipynb_checkpoints']

## Objective: Join Time Series Discharge & SWE
1. Test the join on a single gage

2. Run the table-joining pipeline on all gages

### Connect to S3

In [3]:
# SECURITY: credentials must never be hardcoded in a notebook. They are read
# from the standard AWS environment variables instead. The key pair that was
# previously written in this cell should be treated as leaked and rotated.
# When both variables are unset, None is passed for both arguments and boto3
# falls back to its default credential lookup (shared credentials file, role, ...).
access_key = os.environ.get('AWS_ACCESS_KEY_ID')
secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

bucket_name = "w210-snow-fate"
s3 = boto3.client("s3",
                  region_name="us-east-1",
                  aws_access_key_id = access_key,
                  aws_secret_access_key = secret_access_key
                  )

### Download File
s3.download_file(Bucket=bucket_name, Key="snowpack_paper.pdf", Filename="snowpack_paper.pdf")

### Upload File
s3.upload_file("snowpack_paper.pdf",bucket_name,"snowpack_paper.pdf")

In [5]:
## Test Download File Works or not
# s3.download_file(Bucket=bucket_name, Key="snowpack_paper.pdf", Filename="snowpack_paper.pdf")

## 1. Test Joining on Single Gage

In [6]:
### Function for Extracting Index
def index_finder(lon,lat):
    '''
    Convert a longitude / latitude pair into a pair of integer grid indices.

    The longitude index is the rounded value of (lon + 123.3) * 1000 and the
    latitude index is the rounded value of (lat - 35.4) * 1000.

    Returns:
        (lon_idx, lat_idx) as a tuple of ints, or None (after printing a
        message) when lon is outside [-123.3, -117.6] or lat is outside
        [35.4, 42].
    '''
    # Longitude is validated first; latitude is only looked at when it passes
    lon_out_of_range = lon < -123.3 or lon > -117.6
    if lon_out_of_range:
        print('Longitude of Input is out of range! lon:',lon)
        return None

    lat_out_of_range = lat < 35.4 or lat > 42
    if lat_out_of_range:
        print('Latitude of Input is out of range! lat:',lat)
        return None

    # Both coordinates are within range: scale the offsets and round to ints
    lon_offset = (lon + 123.3) * 1000
    lat_offset = (lat - 35.4) * 1000
    return int(round(lon_offset)), int(round(lat_offset))

### Read Data

In [7]:
# Load the gage discharge table and append the two SWE summary columns,
# both filled with the placeholder value -1.
gage = pd.read_csv('gage_discharge_lat_lon.csv').assign(swe_avg=-1, swe_max=-1)

In [8]:
# Preview the first rows of the gage table
gage.head()

Unnamed: 0,time,ft,m3,gage,ll_lon,ll_lat,tr_lon,tr_lat,swe_avg,swe_max
0,1984-10-01,54.0,1.52911,11402000,-121.157674,39.855478,-120.690823,40.049659,-1,-1
1,1984-10-02,52.0,1.472476,11402000,-121.157674,39.855478,-120.690823,40.049659,-1,-1
2,1984-10-03,49.0,1.387525,11402000,-121.157674,39.855478,-120.690823,40.049659,-1,-1
3,1984-10-04,49.0,1.387525,11402000,-121.157674,39.855478,-120.690823,40.049659,-1,-1
4,1984-10-05,48.0,1.359209,11402000,-121.157674,39.855478,-120.690823,40.049659,-1,-1


In [9]:
# Year of the previously processed row. It is compared against the current
# row's year to decide whether a SWE file has to be loaded; 0 never matches
# a real year, so the first comparison always triggers a load.
prev_year = 0

In [10]:
### Select the first 2015 record of gage 11189500 (the single-gage test case)
# Vectorised lookup instead of a Python loop over every row. It also fails
# loudly when nothing matches, rather than silently leaving the last row of
# the table in row_data / row_time.
is_target = gage['time'].str[0:4].eq('2015') & gage['gage'].eq(11189500)
if not is_target.any():
    raise LookupError('No 2015 rows found for gage 11189500')
# Positional index of the first matching row (plain int, as the loop produced)
row_num = int(np.flatnonzero(is_target.to_numpy())[0])
row_data = gage.iloc[row_num,:]
row_time = row_data['time']

In [11]:
# Inspect the gage record selected in the previous cell
row_data

time       2015-01-01
ft               9.96
m3           0.282036
gage         11189500
ll_lon    -118.383732
ll_lat      35.728555
tr_lon    -118.003533
tr_lat      36.437843
swe_avg            -1
swe_max            -1
Name: 81905, dtype: object

In [12]:
### Parse the gage record's date so the matching SWE day can be looked up
date_format = "%Y-%m-%d"
d_date = datetime.strptime(row_time, date_format)

# Calendar year of the record
d_year = d_date.year
# Whole days elapsed since 1 January of that year (0 on 1 January itself)
num_days = (d_date - d_date.replace(month=1, day=1)).days
# NOTE(review): the SWE files are named SN_SWE_WY<year>. If "WY" denotes a
# water year whose first stored day is not 1 January, this calendar-based
# offset selects the wrong day - confirm against the dataset documentation.

print(f'Year: {d_year}  Day: {num_days}')


Year: 2015  Day: 0


In [13]:
### Obtain swe data
# Only (re)load the HDF5 file when the year differs from the previous row's year
if d_year != prev_year:
    h5_file = f"SN_SWE_WY{d_year}.h5"
    # 1. Download h5 file from S3 bucket, but only when no local copy exists
    #    (the files are large and may already be in the working directory).
    #    The S3 key is written with an explicit '/' because os.path.join
    #    would use the OS path separator, which is not '/' on every platform.
    if not os.path.exists(h5_file):
        print(f'\n-- Downloading file {h5_file} from s3 --\n')
        s3.download_file(Bucket=bucket_name, Key=f"swe_data/{h5_file}", Filename=h5_file)
    # 2. Read Data
    swe = h5py.File(h5_file, 'r')
prev_year = d_year

# Read the 2-D grid stored at index num_days of the 'SWE' dataset
swe_data = swe['SWE'][num_days]
# flip over yaxis as lats are in a descending order --> need to change to ascending order
swe_data_flip = swe_data[:,::-1]

#### Find the SWE Region of Interest Using Lat/Lon

In [14]:
### 4. Translate the lower-left and top-right corners into grid indices
ll_lon_idx,ll_lat_idx = index_finder(lon=row_data['ll_lon'], lat=row_data['ll_lat'])
tr_lon_idx,tr_lat_idx = index_finder(lon=row_data['tr_lon'], lat=row_data['tr_lat'])
# Crop the flipped grid: first axis by the longitude indices, second axis by
# the latitude indices
region = swe_data_flip[ll_lon_idx:tr_lon_idx, :][:, ll_lat_idx:tr_lat_idx]


In [15]:
# Print the shape of the cropped region array
print('Region Shape:',region.shape)
# Print the distinct values found in the region and how often each occurs
print('Unique Value:',np.unique(region,return_counts=True))
# Display the array itself as the cell output
region

Region Shape: (380, 709)
Unique Value: (array([-32768,      0], dtype=int16), array([ 39327, 230093]))


array([[-32768, -32768, -32768, ...,      0,      0,      0],
       [-32768, -32768, -32768, ...,      0,      0,      0],
       [-32768, -32768, -32768, ...,      0,      0,      0],
       ...,
       [     0,      0,      0, ..., -32768, -32768, -32768],
       [     0,      0,      0, ..., -32768, -32768, -32768],
       [-32768, -32768,      0, ..., -32768, -32768, -32768]], dtype=int16)

In [None]:
'''
 ---- OLD COORDINATES ---
Gage with swe values: 11266500 
Gage with no swe values: 11402000, 11318500,11208000
Gage with certain swe values: 11185500
Gage with limited swe values: 11189500, 11202710
Note: Gage 11202710 starts from 1988 while others start from 1985

 ---- SWE on NEW COORDINATES (10/15) ----
 11202710: 40,904 valid & 6616 n/a values from region of 198x240 array
 11266500: 126,687 valid & 2784 n/a values from region of 419x309 array
 11402000: 36,414 valid & 54,651 n/a values from region of 467x195 array
 11318500: 9510 valid & 29072 n/a values from region of 382x101 array
 11208000: 29,403 valid & 3461 n/a values from region of 208x158 array
 11185500: 348,392 valid & 21336 n/a values from region of 436x848 array
 11189500: 230,093 valid & 39327 n/a values from region of 380x709 array
 --------------------------------------------------
'''

### Convert -32768 to NA and save as numpy array

In [16]:
# Convert to float (NaN cannot be stored in an integer array) and replace the
# -32768 sentinel with NaN
region = np.where(region == -32768, np.nan, region.astype('float'))
region


array([[nan, nan, nan, ...,  0.,  0.,  0.],
       [nan, nan, nan, ...,  0.,  0.,  0.],
       [nan, nan, nan, ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ..., nan, nan, nan],
       [ 0.,  0.,  0., ..., nan, nan, nan],
       [nan, nan,  0., ..., nan, nan, nan]])

In [17]:
# Show the parsed datetime of the selected record
d_date

datetime.datetime(2015, 1, 1, 0, 0)

In [18]:
### Build the file name under which the region array is stored:
### swe__<gage id>__<YYYY_MM_DD>.npy
npy_name = f"swe__{row_data['gage']}__{d_date:%Y_%m_%d}.npy"
#np.save(os.path.join('../swe_region_data',npy_name),region)
npy_name

'swe__11189500__2015_01_01.npy'

# 2. Pipeline: Joining Gage Time Series Data with SWE Values
- Match SWE data to the gage time series data based on the lat & lon of the gage
- **main code**

In [13]:
class gage_swe_join():
    '''
    Join the gage discharge time series with SWE data.

    For every gage record dated 1985-2016, the SWE grid of the same date is
    cropped to the record's bounding box (lower-left / top-right corners) and
    saved as "swe__<gage>__<YYYY_MM_DD>.npy".

    Downloads use the notebook-level names `s3` and `bucket_name`.
    '''

    def __init__(self):
        # Year whose SWE file is currently open; 0 means "nothing loaded yet"
        self.prev_year = 0
        self.date_format =  "%Y-%m-%d"
        # Open h5py.File for `prev_year` (None while no file is open)
        self.swe = None
        # Parsed datetime of the most recently processed record
        self.d_date = None

    def close(self):
        '''
        Close the currently open SWE file, if any, and forget its year so the
        next lookup reopens a file. Safe to call when nothing is open.
        '''
        if self.swe is not None:
            self.swe.close()
        self.swe = None
        self.prev_year = 0

    def extract_swe_day_and_data(self,row_time):
        '''
        Objective:
            1. Obtain the year of the date of interest
            2. Obtain the day of the year (0 on 1 January)
            3. Extract the SWE grid stored for that day

        Input:
            - row_time: date string in the format "%Y-%m-%d"
        Output:
            - the 2-D SWE array of that day, reversed along its second axis

        Side effects: sets self.d_date, and (re)opens self.swe when the year
        differs from the previously processed one.
        '''
        ### Extract Date from Gage Data to match SWE
        self.d_date = datetime.strptime(row_time, self.date_format)
        # Extract year of date
        d_year = self.d_date.year
        # Number of whole days since 1 January of that year
        # NOTE(review): the files are named SN_SWE_WY<year>. If "WY" denotes a
        # water year whose first stored day is not 1 January, indexing by
        # calendar year and calendar day selects the wrong day - confirm
        # against the dataset documentation.
        num_days = (self.d_date - datetime(d_year, 1, 1)).days

        ## Obtain swe data
        # Only switch files when the year differs from the previous row's year
        if d_year != self.prev_year:
            h5_file = f"SN_SWE_WY{d_year}.h5"
            # 1. Download h5 file from S3 bucket if it is not in the local folder.
            #    The key uses an explicit '/': os.path.join would insert the OS
            #    path separator, which is not '/' on every platform.
            if not os.path.exists(h5_file):
                print(f'\n-- Downloading file {h5_file} from s3 --\n')
                s3.download_file(Bucket=bucket_name, Key=f"swe_data/{h5_file}", Filename=h5_file)
            # 2. Release the previous year's handle before opening the new
            #    file, so handles do not accumulate across years
            self.close()
            self.swe = h5py.File(h5_file, 'r')
            self.prev_year = d_year

        # extract the grid stored at this day index
        swe_data = self.swe['SWE'][num_days]

        # flip over yaxis as lats are in a descending order --> need to change to ascending order
        swe_data_flip = swe_data[:,::-1]

        return swe_data_flip

    ### 2. Function for Extracting Index
    def index_finder(self,lon,lat):
        '''
        Convert a longitude / latitude pair into integer grid indices.

        Output:
            - (lon_idx, lat_idx), computed as round((lon + 123.3) * 1000) and
              round((lat - 35.4) * 1000)
            - None (after printing a message) when lon is outside
              [-123.3, -117.6] or lat is outside [35.4, 42]
        '''
        if lon < -123.3 or lon > -117.6:
            print('Longitude of Input is out of range! lon:',lon)
            return None
        if lat < 35.4 or lat > 42:
            print('Latitude of Input is out of range! lat:',lat)
            return None

        # longitude and latitude are within range
        lon_idx = round((lon + 123.3) * 1000)
        lat_idx = round((lat - 35.4) * 1000)
        return int(lon_idx),int(lat_idx)

    def pipeline(self, gage_csv='gage_discharge_lat_lon.csv', out_dir='../swe_region_data'):
        '''
            Objective: Pipeline for joining the entire gage time series table with swe data
            Input:
                - gage_csv: path of the gage discharge data
                  (date, discharge, gage lat & lon)
                - out_dir: folder that receives the region files
                  (created when it does not exist)
                - swe data (date, lat & lon, swe), read per year from
                  "SN_SWE_WY<year>.h5"
            Output:
                - Region data saved to npy files in the format
                     "swe__gage__yy_mm_dd.npy"
                - returns the gage DataFrame as read from gage_csv
        '''
        # read gage csv file
        gage = pd.read_csv(gage_csv)
        # np.save does not create missing folders
        os.makedirs(out_dir, exist_ok=True)
        n_rows = len(gage)

        try:
            ### Run through all data
            for row_num in range(n_rows):
                if row_num % 100 == 0:
                    print(f'-------- Processing Row Number {row_num} out of {n_rows} ---------')

                row_data = gage.iloc[row_num,:]
                row_time = row_data['time']

                # NOTE: SWE has only data files from 1984 - 2016
                if not (1984 < int(row_time[0:4]) < 2017):
                    continue

                ### Grid indices of the bounding box corners
                ll_idx = self.index_finder(row_data['ll_lon'],row_data['ll_lat'])
                tr_idx = self.index_finder(row_data['tr_lon'],row_data['tr_lat'])
                if ll_idx is None or tr_idx is None:
                    # index_finder already printed which coordinate is invalid;
                    # skip the record instead of failing while unpacking None
                    print(f'Skipping row {row_num}: bounding box is out of range')
                    continue
                ll_lon_idx,ll_lat_idx = ll_idx
                tr_lon_idx,tr_lat_idx = tr_idx

                # Obtain SWE data of Same Date
                swe_data = self.extract_swe_day_and_data(row_time)

                # get SWE values of surrounding region
                region = swe_data[ll_lon_idx:tr_lon_idx,ll_lat_idx:tr_lat_idx]

                # change -32768 (null values) to NaN; float is required to hold NaN
                region = region.astype('float')
                region[region == -32768] = np.nan

                ### Save Region Array as NPY to folder
                npy_name = 'swe__'+str(row_data['gage'])+'__'+self.d_date.strftime('%Y_%m_%d')+'.npy'
                np.save(os.path.join(out_dir,npy_name),region)
        finally:
            # Always release the last opened SWE file, even after an error
            self.close()

        return gage


In [15]:
# Instantiate the joiner and run the full pipeline over the gage table.
# pipeline() returns the gage DataFrame it read, which is kept in `final`.
gsj = gage_swe_join()
final = gsj.pipeline()

-------- Processing Row Number 0 out of 83274 ---------
1985-01-01 00:00:00
1985-01-02 00:00:00
1985-01-03 00:00:00
1985-01-04 00:00:00
1985-01-05 00:00:00
1985-01-06 00:00:00
1985-01-07 00:00:00
1985-01-08 00:00:00
-------- Processing Row Number 100 out of 83274 ---------
1985-01-09 00:00:00


In [None]:
#################################################################