<a href="https://colab.research.google.com/github/Maryam-Afshari/thesis/blob/master/xgboost_Lobith.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# checking out the GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Jun 19 07:20:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Report the total RAM of the runtime and whether it counts as "high-RAM".
from psutil import virtual_memory

# total bytes converted to (decimal) gigabytes
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold used here to call a runtime "high-RAM"
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [4]:
#!pip install hydroeval

# Running XGBoost model on Lobith station

In [5]:
# required libraries
import numpy as np
import pandas as pd
import hydroeval as he
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBRegressor

In [6]:
# Mount Google Drive so the data/output paths under /content/drive used by the
# cells below are reachable (Colab-only; prompts for authorisation when needed).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Read the two Lobith tables from Drive; in both files the first CSV column
# becomes the row index.
lobith_csv = '/content/drive/MyDrive/Thesis-Afshari/data/lobith.csv'
q_lobith_csv = "/content/drive/MyDrive/Thesis-Afshari/data/q_lobith.csv"

lobith = pd.read_csv(lobith_csv, index_col=0)
q_lobith = pd.read_csv(q_lobith_csv, index_col=0)


In [8]:
# making lagged variables
#
# For each predictor (et, p, t) add the columns <var>_lag_1 ... <var>_lag_{TIME_STEPS - 1},
# where <var>_lag_k is the predictor shifted down by k rows.
# The lag columns are built as one block per predictor and joined with a single
# pd.concat instead of ~160 individual DataFrame.insert calls, which fragment the
# frame and trigger pandas' "DataFrame is highly fragmented" PerformanceWarning.
# The resulting column order is the same as with the per-column insert:
#   col0, et lags (1..54), col1, p lags (1..54), col2, t lags (1..54), remaining columns.

# define the number of lags
TIME_STEPS = 55

# predictors that receive lagged copies
lag_vars = ["et", "p", "t"]

# Fail fast when the cell is re-run on an already expanded frame (the per-column
# insert raised in that case too, because duplicate column names are not allowed).
already_lagged = [f'{var}_lag_1' for var in lag_vars if f'{var}_lag_1' in lobith.columns]
if already_lagged:
  raise ValueError(f"cannot insert {already_lagged[0]}, already exists; "
                   "reload the data before re-running this cell")

#add the lagged variables for predictors to the dataframe
pieces = [lobith.iloc[:, :1]]
for i, var in enumerate(lag_vars):
  lag_block = pd.concat(
      [lobith[var].shift(lag).rename(f'{var}_lag_{lag}') for lag in range(1, TIME_STEPS)],
      axis=1)
  pieces.append(lag_block)
  # each lag block sits directly in front of original column i + 1 (positional,
  # as in the insert-based version); after the last block every remaining
  # original column follows
  if i == len(lag_vars) - 1:
    pieces.append(lobith.iloc[:, i + 1:])
  else:
    pieces.append(lobith.iloc[:, i + 1:i + 2])
lobith = pd.concat(pieces, axis=1)

#remove the first TIME_STEPS - 1 rows since they will contain NA values
lobith = lobith.iloc[TIME_STEPS - 1:,:].reset_index(drop=True)
# trim q_lobith by the same number of rows so both tables stay row-aligned
q_lobith = q_lobith.iloc[TIME_STEPS - 1:,:].reset_index(drop=True)

  if self.run_code(code, result):


# Train-test split

In [9]:
# train-test split: the first 75 % of the rows are used for training and the
# remaining rows for testing (no shuffling)
len_ = int(0.75 * lobith.shape[0])
df_train, df_test = lobith[:len_], lobith[len_:]

# predictors of the model without lagged variables
base_features = ["et","t","p"]
# columns that are never used as predictors (target and timestamp)
non_features = ["obs","datetime"]

# training inputs: without / with lagged variables, plus the target
X_train = df_train[base_features]
X_train_lagged = df_train.drop(columns=non_features)
y_train = df_train["obs"]

# test inputs: without / with lagged variables, plus the target
X_test = df_test[base_features]
X_test_lagged = df_test.drop(columns=non_features)
y_test = df_test["obs"]


### Hyperparameter grid search

In [10]:
%%time
# Various hyper-parameters to tune for XGBoost model without input lagged variables 

xgb = XGBRegressor()
parameters = {'nthread':[1], 
              'objective':['reg:squarederror'],
              'learning_rate': [.01, .05, .1], 
              'max_depth': [7, 9], 
              'min_child_weight': [4],
              'subsample': [0.8, 0.9], 
              'colsample_bytree': [0.7, 0.9], 
              'n_estimators': [800, 1000]} 

xgb_grid = GridSearchCV(xgb,
                        parameters,
                        cv = 5,
                        n_jobs = 5,
                        verbose=True)


xgb_grid.fit(X_train,
         y_train)

print(f"best score for XGBoost model without lagged variables is : {xgb_grid.best_score_}")
print(f"best parameters XGBoost model without lagged variables are: {xgb_grid.best_params_}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
best score for XGBoost model without lagged variables is : -0.11648620281568008
best parameters XGBoost model without lagged variables are: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 800, 'nthread': 1, 'objective': 'reg:squarederror', 'subsample': 0.8}
CPU times: user 4.63 s, sys: 373 ms, total: 5 s
Wall time: 4min 35s


In [11]:
%%time
# Various hyper-parameters to tune foe XGBoost model with 55 input lagged variables 

xgb = XGBRegressor()
parameters = {'nthread':[1], 
              'objective':['reg:squarederror'], 
              'learning_rate': [.01, .05, .1], 
              'max_depth': [7, 9], 
              'min_child_weight': [4],
              'subsample': [0.8, 0.9], 
              'colsample_bytree': [0.7, 0.9], 
              'n_estimators': [800, 1000]} 

xgb_grid_lagged = GridSearchCV(xgb,
                        parameters,
                        cv = 5,
                        n_jobs = 5,
                        verbose=True)


xgb_grid_lagged.fit(X_train_lagged,
         y_train)

print(f"best score for XGBoost model with 55 lagged variables is : {xgb_grid.best_score_}")
print(f"best parameters XGBoost model with 55 lagged variables are: {xgb_grid.best_params_}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
best score for XGBoost model with 55 lagged variables is : -0.11648620281568008
best parameters XGBoost model with 55 lagged variables are: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 800, 'nthread': 1, 'objective': 'reg:squarederror', 'subsample': 0.8}
CPU times: user 2min 6s, sys: 4.27 s, total: 2min 10s
Wall time: 2h 3min 2s


# Model

In [17]:
# define the model for without lagged variables model
# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been retrained with the best parameters on the full (X_train, y_train)
# set; fitting it a second time would only repeat that work.
# NOTE: if the search is ever changed to refit=False, an explicit
# model.fit(X_train, y_train) is required here again.
model =  xgb_grid.best_estimator_

# make a prediction
yhat = model.predict(X_test)

In [18]:
# write the fitted model without lagged variables to Drive
no_lag_model_path = '/content/drive/MyDrive/Thesis-Afshari/output/xgboost_without_lag.model'
model.save_model(no_lag_model_path)

In [19]:
# define the model for with 55 lagged variables model
# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been retrained with the best parameters on the full
# (X_train_lagged, y_train) set; fitting it a second time would only repeat that
# (expensive) work.
# NOTE: if the search is ever changed to refit=False, an explicit
# model_lagged.fit(X_train_lagged, y_train) is required here again.
model_lagged =  xgb_grid_lagged.best_estimator_

# make a prediction
yhat_lagged = model_lagged.predict(X_test_lagged)

In [20]:
# write the fitted model with lagged variables to Drive
lagged_model_path = '/content/drive/MyDrive/Thesis-Afshari/output/xgboost_with_55lag.model'
model_lagged.save_model(lagged_model_path)

# write the test-set predictions of the lagged model next to it (.npy)
lagged_pred_path = "/content/drive/MyDrive/Thesis-Afshari/output/ypred_lobith_xgboost_55lag.npy"
np.save(lagged_pred_path, yhat_lagged, allow_pickle=True)


# Evaluation

In [21]:
# Score three sets of simulated values against the test-period target (y_test)
# with NSE and KGE. he.evaluator is called as (metric, simulated, observed); for
# KGE the result is unpacked into the KGE value and its r, alpha, beta components.

# 1) PCR model: the `pcr` column of q_lobith for the rows from len_ onwards
streamflow_pcr = q_lobith.pcr[len_:]
nse_pcr = he.evaluator(he.nse, streamflow_pcr, y_test)
kge_pcr, r, alpha, beta = he.evaluator(he.kge, streamflow_pcr, y_test)
pcr_msg = "The nse and kge of the PCR model are {:.2} and {:.2}.".format(nse_pcr[0], kge_pcr[0])
print(pcr_msg)

# 2) xgboost with only the meteorological variables
nse = he.evaluator(he.nse, yhat, y_test)
kge, r, alpha, beta = he.evaluator(he.kge, yhat, y_test)
xgb_msg = "The nse and kge of the xgboost model are {:.2} and {:.2}.".format(nse[0], kge[0])
print(xgb_msg)

# 3) xgboost with meteorological variables + lagged variables
nse_lag = he.evaluator(he.nse, yhat_lagged, y_test)
kge_lag, r_lag, alpha_lag, beta_lag = he.evaluator(he.kge, yhat_lagged, y_test)
xgb_lag_msg = "The nse and kge of the xgboost model including lagged variables are {:.2} and {:.2}.".format(nse_lag[0], kge_lag[0])
print(xgb_lag_msg)



The nse and kge of the PCR model are 0.36 and 0.55.
The nse and kge of the xgboost model are 0.029 and 0.094.
The nse and kge of the xgboost model including lagged variables are 0.69 and 0.67.
