# Build Model to Predict Discharge Based on SWE

##### Author: Kevin
##### Date: 10/17/2022
#### Objective: construct model to Predict Discharge with Current SWE features

In [1]:
# Import packages
import os
import h5py
import numpy as np
import pandas as pd
import copy
import sklearn

In [2]:
# Read Data
data = pd.read_csv('gage_with_swe.csv')
print('Total Number of rows:',len(data))

Total Number of rows: 83274


In [3]:
data.head()

Unnamed: 0,time,ft,m3,gage,ll_lon,ll_lat,tr_lon,tr_lat,swe_avg,swe_max
0,1984-10-01,54.0,1.52911,11402000,-121.157674,39.855478,-120.690823,40.049659,-1.0,-1.0
1,1984-10-02,52.0,1.472476,11402000,-121.157674,39.855478,-120.690823,40.049659,-1.0,-1.0
2,1984-10-03,49.0,1.387525,11402000,-121.157674,39.855478,-120.690823,40.049659,-1.0,-1.0
3,1984-10-04,49.0,1.387525,11402000,-121.157674,39.855478,-120.690823,40.049659,-1.0,-1.0
4,1984-10-05,48.0,1.359209,11402000,-121.157674,39.855478,-120.690823,40.049659,-1.0,-1.0


### Data Quality Check & Cleaning

In [4]:
# Replace -1 to missing values
data['swe_avg'] = data['swe_avg'].replace(-1,np.nan)

In [5]:
data = data.dropna(subset = ['swe_avg'])

In [6]:
# Check Missing Value
data.isnull().sum()
### Result: 57 missing values in ft & 1 in m3

time        0
ft         57
m3          0
gage        0
ll_lon      0
ll_lat      0
tr_lon      0
tr_lat      0
swe_avg     0
swe_max     0
dtype: int64

In [7]:
# Drop ft column
data = data.drop(['ft'],axis=1)

In [8]:
data.head()

Unnamed: 0,time,m3,gage,ll_lon,ll_lat,tr_lon,tr_lat,swe_avg,swe_max
92,1985-01-01,2.6901,11402000,-121.157674,39.855478,-120.690823,40.049659,0.000741,1.0
93,1985-01-02,2.60515,11402000,-121.157674,39.855478,-120.690823,40.049659,0.0,0.0
94,1985-01-03,2.60515,11402000,-121.157674,39.855478,-120.690823,40.049659,0.0,0.0
95,1985-01-04,2.60515,11402000,-121.157674,39.855478,-120.690823,40.049659,0.0,0.0
96,1985-01-05,2.548516,11402000,-121.157674,39.855478,-120.690823,40.049659,0.0,0.0


### Data Splitting

In [9]:
# Order Data by Date
data['time'] = pd.to_datetime(data['time'])
data = data.set_index(data['time'])
data = data.sort_index()

In [10]:
# Take only interested columns
data = data[['m3','swe_avg','swe_max']]

In [11]:
data.head()

Unnamed: 0_level_0,m3,swe_avg,swe_max
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1985-01-01,2.6901,0.000741,1.0
1985-01-01,8.268519,0.0,0.0
1985-01-01,0.792872,0.0,0.0
1985-01-01,3.341388,0.0,0.0
1985-01-01,0.20105,0.0,0.0


In [12]:
train = data[:'2010-01-01']
test  = data['2010-01-01':]
print(f'training size: {len(train)} ({round(len(train)/len(data),2)})')
print(f'test size: {len(test)} ({round(len(test)/len(data),2)})')

training size: 61094 (0.78)
test size: 17169 (0.22)


In [13]:
# Shuffle Data
train = train.sample(frac=1)
test = test.sample(frac=1)

In [14]:
# Get X & Y
train_x = train[['swe_avg','swe_max']]
train_y = train['m3']
test_x = test[['swe_avg','swe_max']]
test_y = test['m3']

### Model Building

In [20]:
####### 1. Logistic Regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

lr.fit(train_x,train_y)

lr_pred = lr.predict(test_x)

In [25]:
# Evaluation
from sklearn.metrics import mean_squared_error
import math
mse_train = mean_squared_error(train_y,lr.predict(train_x))
mse_test = mean_squared_error(test_y,lr_pred)

print('Root Mean Squared Error on Train',math.sqrt(mse_train))
print("Root Mean Squared Error on Test:",math.sqrt(mse_test))

Root Mean Squared Error on Train 13.365887114533262
Root Mean Squared Error on Test: 10.177950604486579
