In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

In [24]:
# Load the Basel predictor table. The first CSV column is read as a temporary
# row index, which is then replaced by the "datetime" column.
pred_basel = (
    pd.read_csv("data/basel.csv", index_col=0)
    .set_index("datetime")
)

In [25]:
# Load the Basel discharge table, using the first CSV column as the index,
# and show it as the cell output.
q_basel = pd.read_csv(
    "data/q_basel.csv",
    index_col=0,
)
q_basel

Unnamed: 0_level_0,obs,pcr,res
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1981-01-01,670.000,605.06915,64.93085
1981-01-02,647.000,599.35767,47.64233
1981-01-03,727.000,874.38354,-147.38354
1981-01-04,1363.000,998.86804,364.13196
1981-01-05,1202.000,968.06270,233.93730
...,...,...,...
2000-12-27,662.035,1055.62610,-393.59110
2000-12-28,656.253,1027.90730,-371.65430
2000-12-29,652.150,860.76306,-208.61306
2000-12-30,641.549,655.83777,-14.28877


### Feature Engineering

In [26]:
# Build lagged copies of the meteorological predictors.
# Each of "et", "p" and "t" gets TIME_STEPS - 1 extra columns named
# "<var>_lag_<k>" (k = 1 .. TIME_STEPS - 1) holding the value from k rows earlier.
TIME_STEPS = 50
is_lag = TIME_STEPS > 1
lag_vars = ["et", "p", "t"]

if is_lag:
    # Fail loudly (as DataFrame.insert did) when this cell is re-run on a frame
    # that already carries the lag columns; reload the data first in that case.
    new_names = [f"{var}_lag_{lag}" for var in lag_vars for lag in range(1, TIME_STEPS)]
    clashes = pred_basel.columns.intersection(new_names)
    if len(clashes) > 0:
        raise ValueError(
            f"lag columns already present (e.g. {clashes[0]!r}); "
            "reload pred_basel before re-running this cell"
        )

    # Collect every column first and concatenate once. Inserting ~150 columns one
    # at a time fragments the frame and makes pandas emit a PerformanceWarning.
    pieces = []
    for var in lag_vars:
        pieces.append(pred_basel[var])
        pieces.extend(
            pred_basel[var].shift(lag).rename(f"{var}_lag_{lag}")
            for lag in range(1, TIME_STEPS)
        )
    # All remaining columns (e.g. the target) follow the predictor blocks.
    # NOTE(review): this reproduces the previous column order provided "et", "p",
    # "t" are the first three columns of pred_basel, in that order - confirm.
    pieces.append(pred_basel.drop(columns=lag_vars))
    pred_basel = pd.concat(pieces, axis=1)

# The first TIME_STEPS - 1 rows have NaN lags. Drop them from both tables so the
# two frames stay positionally aligned, then switch to a plain 0..n-1 index.
pred_basel = pred_basel.iloc[TIME_STEPS - 1:, :].reset_index(drop=True)
q_basel = q_basel.iloc[TIME_STEPS - 1:, :].reset_index(drop=True)

  if (await self.run_code(code, result,  async_=asy)):


### Train-test split

In [27]:
# Ordered split without shuffling: the first 75 % of the rows are used for
# training, the remaining rows for testing.
len_ = int(pred_basel.shape[0] * 0.75)

df_train = pred_basel.iloc[:len_]
df_test = pred_basel.iloc[len_:]

# Target variable
y_train = df_train["obs"]
y_test = df_test["obs"]

# Predictors without lagged variables
X_train = df_train.loc[:, ["et", "t", "p"]]
X_test = df_test.loc[:, ["et", "t", "p"]]

# Predictors with lagged variables: every column except the target
X_train_lagged = df_train.drop(columns="obs")
X_test_lagged = df_test.drop(columns="obs")

In [28]:
# Display the lagged training predictors as the cell output.
X_train_lagged

Unnamed: 0,et,et_lag_1,et_lag_2,et_lag_3,et_lag_4,et_lag_5,et_lag_6,et_lag_7,et_lag_8,et_lag_9,...,t_lag_40,t_lag_41,t_lag_42,t_lag_43,t_lag_44,t_lag_45,t_lag_46,t_lag_47,t_lag_48,t_lag_49
0,0.000532,0.000553,0.000571,0.000580,0.000543,0.000507,0.000452,0.000483,0.000623,0.000770,...,-5.646553,-8.999503,-10.731832,-6.366458,-2.218504,-2.055391,1.019687,5.165919,2.172385,-0.250816
1,0.000547,0.000532,0.000553,0.000571,0.000580,0.000543,0.000507,0.000452,0.000483,0.000623,...,-5.925995,-5.646553,-8.999503,-10.731832,-6.366458,-2.218504,-2.055391,1.019687,5.165919,2.172385
2,0.000569,0.000547,0.000532,0.000553,0.000571,0.000580,0.000543,0.000507,0.000452,0.000483,...,-5.499934,-5.925995,-5.646553,-8.999503,-10.731832,-6.366458,-2.218504,-2.055391,1.019687,5.165919
3,0.000593,0.000569,0.000547,0.000532,0.000553,0.000571,0.000580,0.000543,0.000507,0.000452,...,-4.769065,-5.499934,-5.925995,-5.646553,-8.999503,-10.731832,-6.366458,-2.218504,-2.055391,1.019687
4,0.000644,0.000593,0.000569,0.000547,0.000532,0.000553,0.000571,0.000580,0.000543,0.000507,...,-2.298974,-4.769065,-5.499934,-5.925995,-5.646553,-8.999503,-10.731832,-6.366458,-2.218504,-2.055391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5437,0.000813,0.000705,0.000777,0.000723,0.000530,0.000416,0.000591,0.000626,0.000697,0.000568,...,-0.870762,0.681613,2.573417,4.436017,3.448374,1.560776,2.177809,3.985582,2.886545,-2.100167
5438,0.000827,0.000813,0.000705,0.000777,0.000723,0.000530,0.000416,0.000591,0.000626,0.000697,...,-0.964901,-0.870762,0.681613,2.573417,4.436017,3.448374,1.560776,2.177809,3.985582,2.886545
5439,0.000885,0.000827,0.000813,0.000705,0.000777,0.000723,0.000530,0.000416,0.000591,0.000626,...,2.404177,-0.964901,-0.870762,0.681613,2.573417,4.436017,3.448374,1.560776,2.177809,3.985582
5440,0.000861,0.000885,0.000827,0.000813,0.000705,0.000777,0.000723,0.000530,0.000416,0.000591,...,2.084055,2.404177,-0.964901,-0.870762,0.681613,2.573417,4.436017,3.448374,1.560776,2.177809


### Normalize the data

In [29]:
# Min-max scaling for the predictors without lags: the scaler is fitted on the
# training rows only and then reused unchanged for the test rows.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Separate scaler for the lagged predictors, again fitted on the training rows only.
scaler_lagged = MinMaxScaler().fit(X_train_lagged)
X_train_lagged = scaler_lagged.transform(X_train_lagged)
X_test_lagged = scaler_lagged.transform(X_test_lagged)

### Model

In [30]:
# Linear regression on the scaled predictors without lags
lm = LinearRegression().fit(X_train, y_train)

# Score on the held-out rows (shown as the cell output)
lm.score(X_test, y_test)

0.12108233009697922

In [31]:
# Linear regression on the scaled predictors including the lagged variables
lm_lag = LinearRegression().fit(X_train_lagged, y_train)

# Score on the held-out rows (shown as the cell output)
lm_lag.score(X_test_lagged, y_test)

0.6610153261270394

### Evaluation

In [35]:
# `he` is used below for `evaluator`, `nse` and `kge`, but no visible cell imports
# it, so a fresh "Restart & Run All" would stop here with a NameError.
# NOTE(review): `he` is presumed to be hydroeval (the API matches) - confirm, and
# ideally move this line into the import cell at the top of the notebook.
import hydroeval as he

# PCR values for the same held-out rows as y_test (positional slice from len_).
streamflow_pcr = q_basel.pcr[len_:]

# Evaluate the PCR model using NSE and KGE. The KGE call is unpacked into four
# values: the score itself plus r, alpha and beta.
nse_pcr = he.evaluator(he.nse, streamflow_pcr, y_test)
kge_pcr, r_pcr, alpha_pcr, beta_pcr = he.evaluator(he.kge, streamflow_pcr, y_test)
print("The nse and kge of the PCR model are {:.2} and {:.2}.".format(nse_pcr[0], kge_pcr[0]))

# Predictions of the model with only meteorological variables
y_pred = lm.predict(X_test)

# Predictions of the model with meteorological variables + lagged variables
y_pred_lagged = lm_lag.predict(X_test_lagged)

# NSE and KGE for the model with only meteorological variables
nse = he.evaluator(he.nse, y_pred, y_test)
kge, r, alpha, beta = he.evaluator(he.kge, y_pred, y_test)

# NSE and KGE for the model with meteorological variables + lagged variables
nse_lag = he.evaluator(he.nse, y_pred_lagged, y_test)
kge_lag, r_lag, alpha_lag, beta_lag = he.evaluator(he.kge, y_pred_lagged, y_test)

print("The nse and kge of the linear model are {:.2} and {:.2}.".format(nse[0], kge[0]))
print("The nse and kge of the linear model including lagged variables are {:.2} and {:.2}.".format(nse_lag[0], kge_lag[0]))

The nse and kge of the PCR model are 0.22 and 0.64.
The nse and kge of the linear model are 0.12 and 0.12.
The nse and kge of the linear model with lag variables are 0.66 and 0.71.
