In [1]:
import os,sys
import random
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib as plt
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import shuffle

In [2]:
### Enter the data folder name and WCR information

# Fold whose train / valid / test folders are used below
fold_idx = 1

# Data folders for the selected fold, rooted at the current working directory
path = os.getcwd()
train_path_upper, valid_path_upper, test_path_upper = (
    f'{path}/data_mortar/FOLD_{fold_idx}/{split_name}'
    for split_name in ('train', 'valid', 'test')
)

# WCR information (levels from 40 to 60 in steps of 2.5)
WC_name_tag = [40, 42.5, 45, 47.5, 50, 52.5, 55, 57.5, 60]

# Fix the random seed for the stdlib and NumPy global generators.
# NOTE(review): PYTHONHASHSEED is exported here at runtime; confirm whether it
# needs to be set before the kernel starts to influence hashing in this process.
seed_data = 4885
os.environ["PYTHONHASHSEED"] = str(seed_data)
np.random.seed(seed_data)
random.seed(seed_data)

In [3]:
### Data normalization

# Normalize to the minimum and maximum sensor measurement values.
def sense_min_max_Normalization(input_data, min_value, max_value):
    """Linearly rescale readings so that min_value maps to 0 and max_value maps to 1.

    Parameters
    ----------
    input_data : scalar, numpy array or pandas Series
        Raw sensor reading(s); anything supporting ``-`` and ``/`` works.
    min_value : number
        Lower end of the sensor's measurement range.
    max_value : number
        Upper end of the sensor's measurement range.

    Returns
    -------
    Same kind of object as ``input_data`` holding
    ``(input_data - min_value) / (max_value - min_value)``.
    Readings outside ``[min_value, max_value]`` are not clipped, so they map
    outside ``[0, 1]``.

    Raises
    ------
    ValueError
        If ``max_value == min_value``; the scale would be undefined and array
        inputs would otherwise silently turn into inf/NaN.
    """
    span = max_value - min_value
    if span == 0:
        raise ValueError(
            "max_value and min_value must differ to normalize "
            f"(both are {min_value!r})"
        )

    return (input_data - min_value) / span

def data_set_normalization(data):
    """Return a copy of ``data`` with the six sensor columns min-max normalized.

    Each of the columns TEMP, EC, VWC, TDS, SALINITY and EPSILON is rescaled
    with ``sense_min_max_Normalization`` using the fixed measurement range
    listed below. All other columns are passed through unchanged.

    The input is not modified: normalization is applied to a copy, so calling
    this function again on the same raw frame cannot normalize it twice.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the six sensor columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame with the sensor columns rescaled.

    Raises
    ------
    KeyError
        If one of the sensor columns is missing from ``data``.
    """
    # (min, max) measurement range of each sensor channel
    sensor_ranges = {
        'TEMP': (-40, 80),
        'EC': (0, 20000),
        'VWC': (0, 100),
        'TDS': (0, 20000),
        'SALINITY': (0, 20000),
        'EPSILON': (0, 82),
    }

    # Work on a copy so the caller's frame keeps its raw values
    normalized = data.copy()
    for column, (range_min, range_max) in sensor_ranges.items():
        normalized[column] = sense_min_max_Normalization(
            normalized[column], range_min, range_max
        )

    return normalized

In [4]:
### Load FDR Data for Train

# Collect the path of every file located one sub-folder below the train folder.
# os.listdir gives no ordering guarantee, so both levels are sorted: the seeded
# shuffle applied later only yields the same permutation of samples when the
# rows are concatenated in the same order on every run and machine.
path_under = sorted(os.listdir(train_path_upper))
path_list = []
for sub_dir_name in path_under:
    sub_dir_path = os.path.join(train_path_upper, sub_dir_name)
    if not os.path.isdir(sub_dir_path):
        continue  # stray file next to the sub-folders; nothing to list inside it

    for file_name in sorted(os.listdir(sub_dir_path)):
        file_path = os.path.join(sub_dir_path, file_name)
        if os.path.isfile(file_path):  # ignore nested directories
            path_list.append(file_path)

# Placeholder lists, one slot per file. They are not referenced by any other
# cell visible in this notebook and are kept only so the names stay defined.
rand_x_data_list = [0]*len(path_list)
rand_y_data_list = [0]*len(path_list)

# Read each file as CSV and normalize its sensor columns
data_list = [data_set_normalization(pd.read_csv(file_path)) for file_path in path_list]

# Stack all files row-wise (each file keeps its own row index)
train_data = pd.concat(data_list)

# Print the data format
train_data

Unnamed: 0,WC_ratio,TEMP,EC,VWC,TDS,SALINITY,EPSILON
0,0.4,0.514917,0.19240,0.6391,0.09620,0.10580,0.720488
1,0.4,0.514417,0.19580,0.6512,0.09790,0.10765,0.736098
2,0.4,0.514417,0.19600,0.6549,0.09800,0.10780,0.740610
3,0.4,0.514250,0.19630,0.6568,0.09815,0.10795,0.742927
4,0.4,0.513833,0.19660,0.6587,0.09830,0.10810,0.745244
...,...,...,...,...,...,...,...
468,0.6,0.495333,0.28745,0.9264,0.14370,0.15805,0.955366
469,0.6,0.495333,0.28750,0.9374,0.14375,0.15810,0.961341
470,0.6,0.495333,0.28795,0.9320,0.14395,0.15835,0.958415
471,0.6,0.495333,0.28795,0.9320,0.14395,0.15835,0.958415


In [5]:
### Load FDR Data for Valid

# Collect the path of every file located one sub-folder below the valid folder.
# os.listdir gives no ordering guarantee, so both levels are sorted: the seeded
# shuffle applied later only yields the same permutation of samples when the
# rows are concatenated in the same order on every run and machine.
valid_path_under = sorted(os.listdir(valid_path_upper))
valid_path_list = []
for sub_dir_name in valid_path_under:
    sub_dir_path = os.path.join(valid_path_upper, sub_dir_name)
    if not os.path.isdir(sub_dir_path):
        continue  # stray file next to the sub-folders; nothing to list inside it

    for file_name in sorted(os.listdir(sub_dir_path)):
        file_path = os.path.join(sub_dir_path, file_name)
        if os.path.isfile(file_path):  # ignore nested directories
            valid_path_list.append(file_path)

# Placeholder lists, one slot per file. They are not referenced by any other
# cell visible in this notebook and are kept only so the names stay defined.
valid_rand_x_data_list = [0]*len(valid_path_list)
valid_rand_y_data_list = [0]*len(valid_path_list)

# Read each file as CSV and normalize its sensor columns
valid_data_list = [data_set_normalization(pd.read_csv(file_path)) for file_path in valid_path_list]

# Stack all files row-wise (each file keeps its own row index)
valid_data = pd.concat(valid_data_list)

# Print the data format
valid_data

Unnamed: 0,WC_ratio,TEMP,EC,VWC,TDS,SALINITY,EPSILON
0,0.4,0.491083,0.18165,0.6014,0.09080,0.09990,0.665610
1,0.4,0.491083,0.18250,0.6014,0.09125,0.10035,0.665610
2,0.4,0.490833,0.18360,0.6052,0.09180,0.10095,0.671707
3,0.4,0.490833,0.18405,0.6052,0.09200,0.10120,0.671707
4,0.4,0.490583,0.18455,0.6092,0.09225,0.10150,0.677927
...,...,...,...,...,...,...,...
455,0.6,0.492417,0.28650,0.9488,0.14325,0.15755,0.967439
456,0.6,0.492667,0.28650,0.9488,0.14325,0.15755,0.967439
457,0.6,0.492667,0.28650,0.9488,0.14325,0.15755,0.967439
458,0.6,0.492417,0.28655,0.9488,0.14325,0.15760,0.967439


In [6]:
### Load FDR Data for Test

# Collect the path of every file located one sub-folder below the test folder.
# os.listdir gives no ordering guarantee, so both levels are sorted: the seeded
# shuffle applied later only yields the same permutation of samples when the
# rows are concatenated in the same order on every run and machine.
test_path_under = sorted(os.listdir(test_path_upper))
test_path_list = []
for sub_dir_name in test_path_under:
    sub_dir_path = os.path.join(test_path_upper, sub_dir_name)
    if not os.path.isdir(sub_dir_path):
        continue  # stray file next to the sub-folders; nothing to list inside it

    for file_name in sorted(os.listdir(sub_dir_path)):
        file_path = os.path.join(sub_dir_path, file_name)
        if os.path.isfile(file_path):  # ignore nested directories
            test_path_list.append(file_path)

# Placeholder lists, one slot per file. They are not referenced by any other
# cell visible in this notebook and are kept only so the names stay defined.
test_rand_x_data_list = [0]*len(test_path_list)
test_rand_y_data_list = [0]*len(test_path_list)

# Read each file as CSV and normalize its sensor columns
test_data_list = [data_set_normalization(pd.read_csv(file_path)) for file_path in test_path_list]

# Stack all files row-wise (each file keeps its own row index)
test_data = pd.concat(test_data_list)

# Print the data format
test_data

Unnamed: 0,WC_ratio,TEMP,EC,VWC,TDS,SALINITY,EPSILON
0,0.4,0.491500,0.15550,0.5570,0.07775,0.08550,0.585610
1,0.4,0.491500,0.15625,0.5588,0.07810,0.08590,0.589146
2,0.4,0.491083,0.15690,0.5597,0.07845,0.08625,0.590976
3,0.4,0.490833,0.15750,0.5597,0.07875,0.08660,0.590976
4,0.4,0.490833,0.15805,0.5597,0.07900,0.08690,0.590976
...,...,...,...,...,...,...,...
513,0.6,0.487667,0.29310,0.9488,0.14655,0.16120,0.967439
514,0.6,0.487667,0.29355,0.9488,0.14675,0.16145,0.967439
515,0.6,0.487667,0.29400,0.9488,0.14700,0.16170,0.967439
516,0.6,0.487667,0.29355,0.9488,0.14675,0.16145,0.967439


In [7]:
# Ground-truth WCR labels for Train: WC_ratio scaled by 100, rounded to one decimal
y_R = (train_data['WC_ratio'] * 100).round(1)
y_train = y_R.to_numpy()

# FDR sensor features for Train, in the column order fed to the model
x_R = train_data.loc[:, ['TEMP','VWC','EPSILON','SALINITY','TDS','EC']]
x_train = x_R.to_numpy()

# Shuffle features and labels together with the fixed seed
x_train_shuffled, y_train_shuffled = shuffle(
    x_train,
    y_train,
    random_state=seed_data,
)

In [8]:
# Ground-truth WCR labels for Valid: WC_ratio scaled by 100, rounded to one decimal
y_R_valid = (valid_data['WC_ratio'] * 100).round(1)
y_valid = y_R_valid.to_numpy()

# FDR sensor features for Valid, in the column order fed to the model
x_R_valid = valid_data.loc[:, ['TEMP','VWC','EPSILON','SALINITY','TDS','EC']]
x_valid = x_R_valid.to_numpy()

# Shuffle features and labels together with the fixed seed
x_valid_shuffled, y_valid_shuffled = shuffle(
    x_valid,
    y_valid,
    random_state=seed_data,
)

In [9]:
# Ground-truth WCR labels for Test: WC_ratio scaled by 100, rounded to one decimal
y_R_test = (test_data['WC_ratio'] * 100).round(1)
y_test = y_R_test.to_numpy()

# FDR sensor features for Test, in the column order fed to the model
x_R_test = test_data.loc[:, ['TEMP','VWC','EPSILON','SALINITY','TDS','EC']]
x_test = x_R_test.to_numpy()

# Shuffle features and labels together with the fixed seed
x_test_shuffled, y_test_shuffled = shuffle(
    x_test,
    y_test,
    random_state=seed_data,
)

In [10]:
# Random forest regressor with depth capped at 20 and a fixed seed,
# fitted on the shuffled training features and WCR labels
model = RandomForestRegressor(
    random_state=seed_data,
    max_depth=20,
)
model.fit(X=x_train_shuffled, y=y_train_shuffled)

RandomForestRegressor(max_depth=20, random_state=4885)

In [11]:
# Show the hyper-parameters the fitted model was configured with
params = model.get_params(deep=True)
params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': 20,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 4885,
 'verbose': 0,
 'warm_start': False}

In [16]:
### Print the prediction results

# Predicted WCR for every shuffled test sample
p_result_0 = model.predict(X=x_test_shuffled)
p_result_0

array([42.875, 57.5  , 42.175, ..., 43.8  , 59.075, 45.25 ])

In [17]:
### print RMSE and R2-score

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that flag is
# deprecated and later removed in newer scikit-learn releases, whereas the plain
# MSE call works on every version and gives the same value for a single target.
mse_0 = mean_squared_error(y_test_shuffled, p_result_0)
rmse_0 = round(float(np.sqrt(mse_0)), 4)
r2_0 = round(float(r2_score(y_test_shuffled, p_result_0)), 4)

print(f"RMSE : {rmse_0}")
print(f"r2_score : {r2_0}")

RMSE : 2.0835
r2_score : 0.8999
