## Estimate surface soil moisture based on the Random Forest Regressor model
This script use the 

Use RandomizedSearchCV to find the optimized parameters for the Random forest regressor model.
Train the RF model with the optimized setting.
validate the result with testing, validation and independent station dataset.

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime


In [3]:
# -------------------------
# Define the work directory
# Open the file and read the data.
# -------------------------
# work_dir = r'/home/zhang/SSM/Input_data/ML_SSM_dataset_v1_20220317'
work_dir = r'/home/jovyan/shared/ML_DL_SoilMoisture/Input_data/ML_SSM_dataset_v1_20220317'
os.chdir(work_dir)

file = r'ML_training&testing_v01shuffled_20220317.csv'
df = pd.read_csv(file)

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Split the training&testing data into training dataset and testing dataset, respectively.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [5]:
# ---------------------------------------------
# Train the model with the optimized parameters
# ----------------------------------------------
from scipy.stats import pearsonr
# Train the model
t0 = datetime.now()
rf = RandomForestRegressor(n_estimators=10, n_jobs=6, min_samples_split=4, 
                           min_samples_leaf=2, max_features='log2', max_depth=None)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
t1 = datetime.now()
time_consuming = (t1 - t0).seconds
print('Training time: %d seconds' % time_consuming)

from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)
pearson_r = pearsonr(y_test, pred)[0]
print(f'MSE: {mse}, RMSE: {rmse}, r2: {r2}, Pearson_r: {pearson_r}')

Training time: 4 seconds
MSE: 0.0017002330892085284, RMSE: 0.04123388278113678, r2: 0.8632969534837176, Pearson_r: 0.9303913945429015


In [6]:
# ------------------------------------------------
# validating
# ------------------------------------------------
file_val = 'ML_validating_v01_20220303.csv'

# ----------------------------------------------------------
# Add 'DOY', 'Year', and modify the sequence of the columns.
# ----------------------------------------------------------
def format_val_file(file_val, index_col):
    """
    The columns is kid of different from the 'shuffled_20220317.csv'
    :param file_val: File for validate.
    :return: X_val and y_val
    """
    df_val = pd.read_csv(file_val, index_col=index_col)
    dataset2 = pd.to_datetime(df_val['Date'])
    DOY = dataset2.dt.dayofyear
    Year = dataset2.dt.year
    df_val.insert(0, 'DOY', DOY)
    df_val.insert(0, 'Year', Year)
    df_val.drop(labels=['Date', 'station', 'ESA-CCI', 'network'], axis=1, inplace=True)
    y_val = df_val.iloc[:, -2].values
    df_val.drop(labels='Soil Moisture', axis=1, inplace=True)
    X_val = df_val.iloc[:, :].values

    return X_val, y_val

df_val = pd.read_csv(file_val)
X_val, y_val = format_val_file(file_val, index_col=None)


pred_val = rf.predict(X_val)

df_val = pd.read_csv(file_val)
stations_val = df_val['station'].unique()  # Get the station names

# metrix for validation set.
df_metrics_val = pd.DataFrame(columns=['station', 'mse', 'rmse', 'r2', 'pearson_r', 'NumberOfData'], dtype='object')

for station in stations_val:

    # Get the index for data from 'station'
    idx = df_val.index[df_val['station'] == station]

    if len(idx) >= 2:
        S_metrics = pd.Series(index=['station', 'mse', 'rmse', 'r2', 'pearson_r', 'NumberOfData'], dtype='object')
        mse_idx = mean_squared_error(y_val[idx], pred_val[idx])
        rmse_idx = np.sqrt(mse_idx)
        r2_idx = r2_score(y_val[idx], pred_val[idx])
        pearson_r_idx = pearsonr(y_val[idx], pred_val[idx])[0]

        S_metrics['station'] = station
        S_metrics['mse'] = mse_idx
        S_metrics['rmse'] = rmse_idx
        S_metrics['r2'] = r2_idx
        S_metrics['pearson_r'] = pearson_r_idx
        S_metrics['NumberOfData'] = len(idx)
        del mse_idx, rmse_idx, r2_idx, pearson_r_idx

        df_metrics_val.loc[station] = S_metrics


mse = mean_squared_error(y_val, pred_val)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, pred_val)
pearson_r = pearsonr(y_val, pred_val)[0]
print('Validation: ')
print(f'MSE: {mse}, RMSE: {rmse}, r2: {r2}, Pearson_r:, {pearson_r}')
print('--------------- \n\n')
df_metrics_val.to_csv(r'/home/jovyan/private/ML_SSM/RF_validation_metrics.cvs')




pred_val = rf.predict(X_val)
mse = mean_squared_error(y_val, pred_val)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, pred_val)
pearson_r = pearsonr(y_val, pred_val)[0]
print(f'MSE: {mse}, RMSE: {rmse}, r2: {r2}, Pearson_r: {pearson_r}')



Validation: 
MSE: 0.0035666740988801464, RMSE: 0.059721638447719654, r2: 0.7138858473933332, Pearson_r:, 0.8452194714299485
--------------- 


MSE: 0.0035666740988801464, RMSE: 0.059721638447719654, r2: 0.7138858473933332, Pearson_r: 0.8452194714299486


In [None]:
# -----------------------
# Independent stations validation
# --------------------
folder_in = os.path.join(work_dir, 'output')
files = os.listdir(folder_in)
df_independent_metrics = pd.DataFrame(columns=['network', 'station', 'MSE', 'RMSE', 'R2', 'Pearson_r', 'n_size'], dtype='object')


for idx, file in enumerate(files):
    print(idx, file)
    # Read the data from independent stations.
    X_val_in, y_val_in = format_val_file(os.path.join(folder_in, file), index_col=0)
    pred_val_in = rf.predict(X_val_in)
    
    
    
    
    mse = mean_squared_error(y_val_in, pred_val_in)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_val_in, pred_val_in)
    pearson_r = pearsonr(y_val_in, pred_val_in)
    
    df_val_in = pd.read_csv(os.path.join(folder_in, file))

    s_val_in = pd.Series(index=['network', 'station', 'MSE', 'RMSE', 'R2', 'Pearson_r', 'n_size'], dtype='object')
    s_val_in['network'] = file.split('_')[1]
    s_val_in['station'] = file.split('_')[2]
    s_val_in['MSE'] = mse
    s_val_in['RMSE'] = rmse
    s_val_in['R2'] = r2
    s_val_in['Pearson_r'] = pearson_r[0]
    s_val_in['n_size'] = len(df_val_in)
    # print(idx, s_val_in)

    df_independent_metrics = df_independent_metrics.append(s_val_in, ignore_index=True)
    del s_val_in
print(df_independent_metrics.head(5))
df_independent_metrics.to_csv(r'/home/jovyan/private/ML_SSM/RF_independent_metrics.cvs')