# Build Model to Predict Discharge Based on SWE

In [6]:
!pip install pandas
!pip install scikit-learn
!pip install scikit-image

[0mCollecting scikit-image
  Downloading scikit_image-0.19.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting networkx>=2.2
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting PyWavelets>=1.1.1
  Downloading PyWavelets-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting tifffile>=2019.7.26
  Downloading tifffile-2022.10.10-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.3/210.3 kB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
Collecting imageio>=2.4.1
  Downloading imag

##### Author: Kevin
##### Date: 10/22/2022
#### Objective: Construct a model to predict discharge from SWE regional data

In [7]:
# Import packages
import os
import h5py
import numpy as np
import pandas as pd
import copy
import sklearn
import time
import skimage

## Data Processing

In [3]:
# Load the combined gage / SWE table and report how many rows it holds
data = pd.read_csv('data/gage_with_swe1019.csv')
print('Total Number of rows:', data.shape[0])

Total Number of rows: 86926


In [4]:
# Preview the first five rows to check the columns that were loaded
data.head(5)

Unnamed: 0,gage,time,ft,m3,ll_lon,ll_lat,tr_lon,tr_lat,swe_avg,swe_max
0,11402000,1984-10-01,54.0,1.52911,-121.157674,39.855478,-120.690823,40.049659,-1.0,-1
1,11402000,1984-10-02,52.0,1.472476,-121.157674,39.855478,-120.690823,40.049659,-1.0,-1
2,11402000,1984-10-03,49.0,1.387525,-121.157674,39.855478,-120.690823,40.049659,-1.0,-1
3,11402000,1984-10-04,49.0,1.387525,-121.157674,39.855478,-120.690823,40.049659,-1.0,-1
4,11402000,1984-10-05,48.0,1.359209,-121.157674,39.855478,-120.690823,40.049659,-1.0,-1


### Data Quality Check & Cleaning

In [5]:
def filter_dates(data,gage,date,s_or_g):
    '''
        Objective: Restrict one gage to the dates on one side of a cutoff date,
                   leaving the rows of every other gage untouched.
        Input:
            - data: gage_with_swe.csv {pandas dataframe}; must have 'gage' and 'time' columns
            - gage: gage id (ex: 11208000) {int}
            - date: cutoff date, inclusive (ex: 'YYYY-mm-dd') {str}
            - s_or_g: 's' keeps rows with time <= date, 'g' keeps rows with time >= date.
                      Please only input s or g.
        Output:
            - new dataframe: the rows of all other gages first, followed by the
              retained rows of `gage`. Index labels are kept as they were.
        Raises:
            - ValueError: if s_or_g is neither 's' nor 'g'
    '''
    # Fail fast on a bad flag. Previously this only printed a message and the
    # function then crashed because the selection was never assigned.
    if s_or_g not in ('s', 'g'):
        raise ValueError("ERROR in Code. Please specify either s or g for the 4th parameter")

    # 1. Separate the rows of the requested gage from everything else
    is_gage = data['gage'] == gage
    gage_rows = data[is_gage]

    # 2. Get Interested Dates (parse once, and only for the gage being filtered)
    times = pd.to_datetime(gage_rows['time'])
    cutoff = pd.to_datetime(date)
    keep = (times <= cutoff) if s_or_g == 's' else (times >= cutoff)
    # Positional mask, so duplicated index labels cannot affect the selection
    add = gage_rows[keep.to_numpy()]

    # 3. Other gages first, then the filtered gage appended at the end
    new = pd.concat([data[~is_gage], add], axis=0)

    return new
    

In [6]:
### Data Processing
# A swe_avg of -1 marks a missing reading: convert it to NaN and drop those rows
data = (
    data
    .assign(swe_avg=data['swe_avg'].replace(-1, np.nan))
    .dropna(subset=['swe_avg'])
)
# Trim individual gages to the date ranges specified by Zixi
data = filter_dates(data, gage=11208000, date='2002-07-01', s_or_g='s')
data = filter_dates(data, gage=11202710, date='1992-01-01', s_or_g='g')
data = filter_dates(data, gage=11185500, date='2013-01-01', s_or_g='s')
data = filter_dates(data, gage=11189500, date='1997-01-01', s_or_g='g')
# Fill the single missing discharge value (row label 52593)
data.loc[52593, 'm3'] = 0.368119
# The ft column is not used further; renumber the rows from 0
data = data.drop(columns=['ft']).reset_index(drop=True)

In [7]:
# Count the NaNs remaining in each column after cleaning
data.isna().sum()

gage       0
time       0
m3         0
ll_lon     0
ll_lat     0
tr_lon     0
tr_lat     0
swe_avg    0
swe_max    0
dtype: int64

--------------------------------------------
### Focus on Gage 11402000
Create ML algorithm based model

- Input: SWE regional data on gage 11402000
- Output: gage discharge rate

In [8]:
# Keep only the records of the gage being modelled (11402000)
gage_id = 11402000
gage_data = data.loc[data['gage'] == gage_id]
gage_cols = gage_data.columns

In [9]:
# Time (1), m3 (2)
# Convert the per-gage frame to a NumPy array. The region-loading loop below
# indexes it by column position: column 1 is the date string, column 2 is m3.
gage_arr = np.array(gage_data)
# Display (rows, columns) as a quick sanity check
gage_arr.shape

(11688, 9)

#### Get X (axb Regional data) & Y (Discharge)
**NOTE**: This section only needs to be run once. After the arrays have been saved, skip it and start from the Data Modeling section, which loads the saved data.

In [10]:
# Build the model inputs: one SWE region grid and one discharge value per record
X = [] # region data
Y = [] # label

start = time.time()
for record in gage_arr:
    # Column 1 holds the date; the file names use underscores, e.g. '1985_01_06'
    gage_time = record[1].replace('-','_')
    # File naming pattern (ex: swe__11189500__2008_12_31.npy)
    npy_name = f'swe__{str(gage_id)}__{gage_time}.npy'
    arr = np.load(os.path.join('swe_region',npy_name))

    # Missing cells (NaN) are treated as 0
    arr[np.isnan(arr)] = 0

    # Optional down-sampling, currently disabled (467x195 --> 234,98):
    # arr = skimage.measure.block_reduce(arr, (2,2), np.max)

    # Collect the grid together with the discharge value from column 2
    X.append(arr)
    Y.append(record[2])

print('Time to Run the Program:',time.time()-start)



Time to Run the Program: 58.492607831954956


In [16]:
# Confirm the dimensions of the stacked samples and labels
print('X shape', np.shape(X))
print('Y shape', np.shape(Y))

X shape (11688, 467, 195)
Y shape (11688,)


In [15]:
# Cache the stacked arrays so the slow region-loading loop only has to run once.
# Create the target folder first: np.save does not create missing directories.
os.makedirs('11402000', exist_ok=True)
np.save('11402000/X_original.npy',np.array(X))
np.save('11402000/Y_original.npy',np.array(Y))

## Data Modeling

In [8]:
# Load Data
# Reload the arrays cached by the np.save cell, so modelling can start here
# without repeating the region-loading loop.
X = np.load('11402000/X_original.npy')
Y = np.load('11402000/Y_original.npy')

In [9]:
# Offset X and Y by one row: X loses its first entry and Y loses its last, so
# X[i] (originally row i+1) is paired with Y[i] (originally row i).
# NOTE(review): confirm this is the intended direction of the one-day shift.
# Re-running this cell trims one more row from each array; reload the data first.
X = X[1:]
Y = Y[:-1]
assert len(X) == len(Y), "Size of X & Y does not match"

In [10]:
# Split data: the first 80% of the samples (in stored order) train the model,
# the remaining 20% are held out for testing
train_size = int(len(X) *0.8)
print(f'Training Size: {train_size} (~0.8)')
print(f'Test Size: {len(X)-train_size} (~0.2)')
train_x, test_x = X[:train_size], X[train_size:]
train_y, test_y = Y[:train_size], Y[train_size:]

Training Size: 9349 (~0.8)
Test Size: 2338 (~0.2)


In [11]:
# Display the training input dimensions before the channel axis is added
train_x.shape

(9349, 467, 195)

In [12]:
# Reshape: insert a channel axis at position 3 so each sample matches the
# (467, 195, 1) input declared for the CNN
train_x, test_x = (np.expand_dims(split, axis=3) for split in (train_x, test_x))

### CNN Algorithm via TF

#### Evaluation Metric

In [13]:
### Function for Model Evaluation
from sklearn.metrics import mean_squared_error
import math

def mape(actual, pred): 
    """
    Mean Absolute Percentage Error, returned in percent.

    Input:
        - actual: observed values (array-like)
        - pred: predicted values (array-like) with the same number of elements
    Output:
        - mean of |actual - pred| / |actual|, multiplied by 100 {float}

    Both inputs are flattened before comparison. Without this, an (n,) target
    and an (n, 1) prediction (the shape returned by model.predict) would be
    broadcast into an (n, n) matrix and produce a meaningless value.
    Entries where actual is 0 give inf/nan, as the division is not guarded.
    """
    actual = np.asarray(actual).ravel()
    pred = np.asarray(pred).ravel()
    return np.mean(np.abs((actual - pred) / actual)) * 100

def relative_root_mean_squared_error(true, pred):
    """
    Relative root mean squared error: sqrt(sum((true - pred)^2) / sum(pred^2)).

    Input:
        - true: observed values (array-like)
        - pred: predicted values (array-like) with the same number of elements
    Output:
        - the ratio described above {float}; note that it is normalised by
          the squared predictions, not by the squared observations

    Inputs are converted to flat float arrays first, so plain lists work and
    an (n,) target is never broadcast against an (n, 1) prediction.
    """
    true = np.asarray(true, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()
    num = np.sum(np.square(true - pred))
    den = np.sum(np.square(pred))
    squared_error = num/den
    rrmse_loss = np.sqrt(squared_error)
    return rrmse_loss

def evaluation(model,train_x,train_y,test_x,_test_y):
    """
    Print RMSE, MAPE and RRMSE of a fitted model on the train and test data.

    Input:
        - model: object with a predict(x) method
        - train_x, train_y: training inputs and targets
        - test_x, _test_y: test inputs and targets
    Output:
        - test predictions with negative values replaced by 0, in the shape
          returned by model.predict
    """
    # Use the targets that were passed in. The body used to ignore `_test_y`
    # and read a global named `test_y` instead.
    test_y = _test_y

    # Negative predictions are replaced by 0 before scoring
    train_pred = model.predict(train_x)
    train_pred = np.where(train_pred<0,0,train_pred)
    test_pred = model.predict(test_x)
    test_pred = np.where(test_pred<0,0,test_pred)

    # Score on flat vectors so an (n,) target is compared element by element
    # with an (n, 1) prediction instead of being broadcast to (n, n)
    train_true = np.asarray(train_y).ravel()
    test_true = np.asarray(test_y).ravel()
    train_flat = np.ravel(train_pred)
    test_flat = np.ravel(test_pred)

    mse_train = mean_squared_error(train_true,train_flat)
    mse_test = mean_squared_error(test_true,test_flat)
    rrmse_train = relative_root_mean_squared_error(train_true, train_flat)
    rrmse_test = relative_root_mean_squared_error(test_true,test_flat)

    print('Root Mean Squared Error on Train:',math.sqrt(mse_train))
    print('MApe on Train:',mape(train_true,train_flat))
    print('RRMSE on Train:',rrmse_train)
    print("Root Mean Squared Error on Test:",math.sqrt(mse_test))
    print('MApe on Test:',mape(test_true,test_flat))
    print('RRMSE on Test:',rrmse_test)

    return test_pred

#### 1. Simple CNN Architecture

In [7]:
from keras.models import Sequential, Model
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Input

In [8]:
# Model Architecture
# Small CNN regressor: two convolution layers with one 2x2 max-pool between
# them, then dense layers ending in a single linear output unit.
input_layer = Input(shape=(467,195,1))
x = Conv2D(filters=64, kernel_size=(3,3), padding="same", activation="relu")(input_layer)
# Pooling halves the spatial size (467x195 -> 233x97 in the summary below)
x = MaxPool2D(pool_size=(2,2))(x)
x = Conv2D(filters=32, kernel_size=(3,3), padding="same", activation="relu")(x)
# Flattening yields 723,232 features, so the first Dense layer alone holds
# about 60.7M parameters (see the summary below)
x = Flatten()(x)
x = Dense(84, activation="relu")(x)
x = Dense(32, activation="relu")(x)

# Single output unit with no activation (regression target)
x = Dense(1)(x)
 
model = Model(inputs=input_layer, outputs=x)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 467, 195, 1)]     0         
                                                                 
 conv2d (Conv2D)             (None, 467, 195, 64)      640       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 233, 97, 64)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 233, 97, 32)       18464     
                                                                 
 flatten (Flatten)           (None, 723232)            0         
                                                                 
 dense (Dense)               (None, 84)                60751572  
                                                             

In [9]:
# Compile the model with mean squared error as the loss and the Adam optimizer
# (no accuracy metric is tracked; this is a regression model)
model.compile(loss= "mean_squared_error" , optimizer="adam")

In [None]:
# Train the model for 3 epochs, shuffling the training samples.
# Note: the held-out test split is also passed as validation_data here.
model.fit(train_x, train_y, validation_data=(test_x, test_y), epochs=3,shuffle=True)

In [None]:
# Print train/test error metrics for the simple CNN (the cell displays the
# returned test predictions)
evaluation(model,train_x,train_y,test_x,test_y)

#### 2. Pretrained CNN Network

In [15]:
!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.6.0.66-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (60.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Installing collected packages: opencv-python
Successfully installed opencv-python-4.6.0.66
[0m

In [16]:
from keras.applications.vgg16 import VGG16, preprocess_input
import cv2
from keras.models import Sequential, Model
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Input, Concatenate, Dropout

ImportError: libGL.so.1: cannot open shared object file: No such file or directory

In [None]:
### Change Data Size to VGG accepted dimension input ###
def resize_arr(arr, size=(224, 224)): # arr: [batch_size, xshape,yshape,1]
    """
    Resize every grid in a batch to a common spatial size.

    Input:
        - arr: batch of grids, indexed as [batch_size, xshape, yshape, 1]
        - size: target (height, width); defaults to (224, 224), the input
                size used by the VGG-based model in this notebook
    Output:
        - array of zeros-initialised default dtype with shape
          (batch_size, height, width, 1) holding the resized grids
    """
    height, width = size

    # Initialize an array: final
    final = np.zeros((arr.shape[0], height, width, 1))

    # Go Through Each Data
    for i in range(arr.shape[0]):
        # cv2.resize expects the target as (width, height)
        img_resize = cv2.resize(arr[i], (width, height), interpolation = cv2.INTER_AREA)
        # Add the channel axis back before storing the resized grid
        final[i] = np.expand_dims(img_resize,axis=-1)

    return final

In [None]:
### Bring both splits to the spatial size accepted by the VGG-based model
train_xT, test_xT = (resize_arr(split) for split in (train_x, test_x))

In [11]:
######## VGG MODEL ##############
# Single-channel 224x224 input, repeated three times along the channel axis so
# that the tensor handed to VGG16 has 3 channels.
inputs = Input(shape=(224, 224, 1))
conc = Concatenate()([inputs, inputs, inputs]) 
# Convolutional base only (include_top=False), initialised with ImageNet weights
conv_base = VGG16(include_top=False,
                     weights='imagenet', input_tensor=conc)
                     #input_shape=input_shape) # input shape: 224x224x3

    
# Number of layers at the END of conv_base that stay trainable; every layer
# before them is frozen. A value of 0 freezes the whole base.
fine_tune = 2

if fine_tune > 0:
    for layer in conv_base.layers[:-fine_tune]:
        layer.trainable = False
else:
    for layer in conv_base.layers:
        layer.trainable = False

# Create a new 'top' of the model (i.e. fully-connected layers).
# This is 'bootstrapping' a new top_model onto the pretrained layers.
top_model = conv_base.output
top_model = Flatten(name="flatten")(top_model)
top_model = Dense(4096, activation='relu')(top_model)
top_model = Dense(1072, activation='relu')(top_model)
top_model = Dropout(0.2)(top_model)
# Single output unit with no activation (regression target)
output = Dense(1)(top_model)
 
vgg_model = Model(inputs=inputs, outputs=output)


# Compiles the model for training (mean squared error loss, Adam optimizer).
vgg_model.compile(loss= "mean_squared_error" , optimizer="adam")


In [12]:
# Print the layer-by-layer structure and parameter counts of the VGG-based model
vgg_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 224, 224, 1  0           []                               
                                )]                                                                
                                                                                                  
 concatenate (Concatenate)      (None, 224, 224, 3)  0           ['input_1[0][0]',                
                                                                  'input_1[0][0]',                
                                                                  'input_1[0][0]']                
                                                                                                  
 block1_conv1 (Conv2D)          (None, 224, 224, 64  1792        ['concatenate[0][0]']        

In [15]:
# Train VGG Model
# One epoch with batches of 8 on the resized inputs; the resized test split is
# also passed as validation_data.
vgg_model.fit(train_xT, train_y, validation_data=(test_xT, test_y), epochs=1,batch_size = 8,shuffle=True)



<keras.callbacks.History at 0x7ff55c34bb20>

In [16]:
# Evaluate VGG Model
# Requires the cell that defines evaluation() (under "Evaluation Metric") to
# have been run in the current kernel; the recorded run below failed with a
# NameError because it had not been.
evaluation(vgg_model,train_xT,train_y,test_xT,test_y)

NameError: name 'evaluation' is not defined