### Extract independent stations (not used for training).
**predictors**:

| name | Meaning | unit | New Name |
| --- | --- | --- | --- |
| EVI_SG_linear | Enhanced Vegetation Index | - | EVI|
| Evapo | Evaporation | m |
| LST_DAILY | Daily land surface temperature | degree Celsius | LST_Daily |
| TI | Topographic index | - |
| Tair | Air temperature (2 m) | degree Celsius | T_air |
| api | Antecedent Precipitation Index | - | API |
| clay, sand, silt | soil texture | % | Clay, Sand, Silt | 
| soil moisture | Volumetric soil moisture | cm3/cm3 | Soil Moisture |
| Preci | Precipitation | mm |
| porosity | Soil Porosity | - | Porosity |
| date | Observation date (YYYY-MM-DD) | - | Date |
| elevation | Elevation | m | Elevation |





In [61]:
"""
--------------------------
File Name:  prepare_data.py
Author: zhang (FZJ/IBG3)
Contact: leojayak@gmail.com
Date: 02.03.22

Description: Prepare the data for training and testing.
--------------------------
"""
import pandas as pd
import os
import numpy as np
# Work from the project directory. The location can be overridden with the
# SSM_DATA_DIR environment variable; the default is the original cluster path.
os.chdir(os.environ.get('SSM_DATA_DIR', '/p/home/jusers/zhang23/juwels/SSM'))
file_train = 'train.csv'
file_validate = 'validate.csv'

# output
folder_output = 'output'
os.makedirs(folder_output, exist_ok=True)  # no error if the folder already exists
df_out_train = pd.DataFrame()
df_out_validate = pd.DataFrame()

scale_factor = 0.1  # Determine the proportion of data not used in the training.

# Raw CSV column name -> column name used in the rest of the notebook.
# Renaming by name (instead of assigning a list positionally) keeps every
# label attached to the right data even if the CSV column order changes.
column_rename_map = {
    'EVI_SG_linear': 'EVI',
    'LST_DAILY': 'LST_Daily',
    'NDVI_SG_linear': 'NDVI',
    'Tair': 'T_air',
    'api': 'API',
    'clay': 'Clay',
    'date': 'Date',
    'elevation': 'Elevation',
    'esa_cci': 'ESA-CCI',
    'omc': 'OMC',
    'porosity': 'Porosity',
    'sand': 'Sand',
    'silt': 'Silt',
    'soil moisture': 'Soil Moisture',
}

# Column names expected after renaming.
train_column_list = ['EVI', 'Evapo', 'LST_Daily', 'LST_Diff', 'NDVI', 'TI', 'T_air', 'API',
               'Clay', 'Date', 'Elevation', 'lat', 'lon', 'network', 'OMC',
               'Porosity', 'Sand', 'Silt', 'Soil Moisture', 'station', 'Preci']

validate_columns_list = ['EVI', 'Evapo', 'LST_Daily', 'LST_Diff', 'NDVI', 'TI', 'T_air', 'API',
               'Clay', 'Date', 'Elevation', 'ESA-CCI', 'lat', 'lon', 'network', 'OMC',
               'Porosity', 'Sand', 'Silt', 'Soil Moisture', 'station', 'Preci']


def _drop_negative_lst_and_id(df_raw):
    """Return `df_raw` without rows whose LST_DAILY is below 0 and without the 'ID' column.

    A boolean mask is used instead of dropping by index label, so only the
    offending rows are removed even when index labels repeat. Rows whose
    LST_DAILY is NaN are kept, as the comparison `< 0` is False for them.
    """
    keep_row = ~(df_raw['LST_DAILY'] < 0)
    return df_raw[keep_row].drop(columns='ID')


def _rename_columns(df_raw, expected_columns, label):
    """Rename the raw columns and verify the result has exactly `expected_columns`.

    Raises:
        ValueError: if a column is missing or unexpected after renaming.
    """
    df_renamed = df_raw.rename(columns=column_rename_map)
    if sorted(df_renamed.columns) != sorted(expected_columns):
        missing = sorted(set(expected_columns) - set(df_renamed.columns))
        unexpected = sorted(set(df_renamed.columns) - set(expected_columns))
        raise ValueError(
            f'Unexpected columns in the {label} data: missing {missing}, unexpected {unexpected}')
    return df_renamed


df_train = pd.read_csv(file_train, index_col=0)
df_validate = pd.read_csv(file_validate, index_col=0)

# Remove the rows with LST < 0 and the 'ID' column.
df_train = _drop_negative_lst_and_id(df_train)
df_validate = _drop_negative_lst_and_id(df_validate)

# Show the raw column names before they are renamed.
print(df_train.columns)
print(df_validate.columns)

df_train = _rename_columns(df_train, train_column_list, 'train')
df_validate = _rename_columns(df_validate, validate_columns_list, 'validate')

# Get the list of networks.
# NOTE(review): the networks are taken from df_validate although the message
# below says "training data" - confirm which frame is intended.
networks = df_validate['network'].drop_duplicates()
print('Number of networks for the training data: ', len(networks))

df_excluded_station = pd.DataFrame(columns=['network', 'station', 'train_size','validate_size', 'lon', 'lat'], dtype='object')



Index(['EVI_SG_linear', 'Evapo', 'LST_DAILY', 'LST_Diff', 'NDVI_SG_linear',
       'TI', 'Tair', 'api', 'clay', 'date', 'elevation', 'lat', 'lon',
       'network', 'omc', 'porosity', 'sand', 'silt', 'soil moisture',
       'station', 'Preci'],
      dtype='object')
Index(['EVI_SG_linear', 'Evapo', 'LST_DAILY', 'LST_Diff', 'NDVI_SG_linear',
       'TI', 'Tair', 'api', 'clay', 'date', 'elevation', 'esa_cci', 'lat',
       'lon', 'network', 'omc', 'porosity', 'sand', 'silt', 'soil moisture',
       'station', 'Preci'],
      dtype='object')
Number of networks for the training data:  48


### Rename predictors.

In [62]:
# Show the column labels of the training frame, then of the validation frame.
for frame in (df_train, df_validate):
    print(frame.columns)

Index(['EVI', 'Evapo', 'LST_Daily', 'LST_Diff', 'NDVI', 'TI', 'T_air', 'API',
       'Clay', 'Date', 'Elevation', 'lat', 'lon', 'network', 'OMC', 'Porosity',
       'Sand', 'Silt', 'Soil Moisture', 'station', 'Preci'],
      dtype='object')
Index(['EVI', 'Evapo', 'LST_Daily', 'LST_Diff', 'NDVI', 'TI', 'T_air', 'API',
       'Clay', 'Date', 'Elevation', 'ESA-CCI', 'lat', 'lon', 'network', 'OMC',
       'Porosity', 'Sand', 'Silt', 'Soil Moisture', 'station', 'Preci'],
      dtype='object')


In [63]:
# Hold out a random subset of stations per network as independent stations.
#
# For every network, `scale_factor` of its stations (rounded down) are drawn
# at random. Their rows are written to one CSV per station in `folder_output`
# and removed from the training and validation sets, which are then saved.

# Fixed seed so that re-running the notebook excludes the same stations.
station_rng = np.random.default_rng(42)

excluded_records = []  # one dict per excluded station
train_parts = []       # remaining training rows, one frame per network
validate_parts = []    # remaining validation rows, one frame per network

# Loops to get the station information in each network.
for idx_i, network in enumerate(networks):
    # dataframe for the network.
    df_network_train = df_train[df_train['network'] == network]
    df_network_validate = df_validate[df_validate['network'] == network]

    stations = df_network_validate['station'].drop_duplicates()  # Get the stations.
    n_stations = len(stations)  # Number of stations in the network.

    # excluded stations.
    n_excluded_stations = int(n_stations * scale_factor)
    # replace=False: draw without replacement so that no station is picked twice.
    excluded_stations = station_rng.choice(stations.to_numpy(), size=n_excluded_stations, replace=False)
    print(idx_i, network,
          f'number of stations: {n_stations}, number of exclude stations: {n_excluded_stations}, {len(excluded_stations)}')

    # Save the excluded stations into a separate file.
    for station in excluded_stations:
        # Filter within the current network only: station names are not
        # guaranteed to be unique across networks.
        df_station_train = df_network_train[df_network_train['station'] == station]
        df_station_validate = df_network_validate[df_network_validate['station'] == station]
        df_station = pd.concat([df_station_train, df_station_validate])
        df_station.to_csv(os.path.join(folder_output, f'Independent_{network}_{station}.csv'))

        # collect process information.
        # The coordinates are read from the combined frame because a station
        # drawn from the validation data may have no training rows.
        excluded_records.append({
            'network': network,
            'station': station,
            'train_size': len(df_station_train),
            'validate_size': len(df_station_validate),
            'lon': df_station['lon'].iloc[0],
            'lat': df_station['lat'].iloc[0],
        })

    # Remove the excluded data. A boolean mask does not fail when an excluded
    # station is absent from one of the two frames.
    keep_train = ~df_network_train['station'].isin(excluded_stations)
    keep_validate = ~df_network_validate['station'].isin(excluded_stations)
    train_parts.append(df_network_train[keep_train].set_index('station'))
    validate_parts.append(df_network_validate[keep_validate].set_index('station'))

# Concatenate once instead of appending inside the loop; the results are
# rebuilt from scratch, so re-running this cell does not duplicate rows.
df_out_train = pd.concat(train_parts) if train_parts else pd.DataFrame()
df_out_validate = pd.concat(validate_parts) if validate_parts else pd.DataFrame()
df_excluded_station = pd.DataFrame(
    excluded_records,
    columns=['network', 'station', 'train_size', 'validate_size', 'lon', 'lat'])

df_out_train.to_csv('ML_training&testing_v01_20220303.csv')
df_out_validate.to_csv('ML_validating_v01_20220303.csv')
df_excluded_station.to_csv('Excluded_station_info.csv')

0 FMI number of stations: 18, number of exclude stations: 1, 1
1 GTK number of stations: 5, number of exclude stations: 0, 0
2 VAS number of stations: 2, number of exclude stations: 0, 0
3 AWDN number of stations: 45, number of exclude stations: 4, 4
4 HOBE number of stations: 30, number of exclude stations: 3, 3
5 MAQU number of stations: 19, number of exclude stations: 1, 1
6 RSMN number of stations: 10, number of exclude stations: 1, 1
7 SCAN number of stations: 189, number of exclude stations: 18, 18
8 SKKU number of stations: 3, number of exclude stations: 0, 0
9 WSMN number of stations: 6, number of exclude stations: 0, 0
10 iRON number of stations: 7, number of exclude stations: 0, 0
11 DAHRA number of stations: 1, number of exclude stations: 0, 0
12 OZNET number of stations: 34, number of exclude stations: 3, 3
13 RISMA number of stations: 21, number of exclude stations: 2, 2
14 USCRN number of stations: 102, number of exclude stations: 10, 10
15 COSMOS number of stations: 89, 

In [64]:
# Read the training/testing table back from disk and show its columns.
training_csv = 'ML_training&testing_v01_20220303.csv'
df = pd.read_csv(training_csv)
df.columns

Index(['station', 'EVI', 'Evapo', 'LST_Daily', 'LST_Diff', 'NDVI', 'TI',
       'T_air', 'API', 'Clay', 'Date', 'Elevation', 'lat', 'lon', 'network',
       'OMC', 'Porosity', 'Sand', 'Silt', 'Soil Moisture', 'Preci'],
      dtype='object')

In [66]:
import numpy as np
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns.
# NOTE(review): the frame read from the CSV still carries the 'station', 'Date'
# and 'network' columns, so `features` includes them - confirm that they are
# dropped or encoded before a model is fitted.
target_column = 'Soil Moisture'
labels = df[target_column]
df = df.drop(columns=target_column)
feature_list = df.columns
features = np.array(df)

# Keep 25% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(features, labels, random_state=42, test_size=0.25)
train_features, test_features, train_labels, test_labels = split_parts


In [67]:
# Report the shape of each part of the train/test split, features first.
for split_part in (train_features, test_features, train_labels, test_labels):
    print(split_part.shape)

(352075, 20)
(117359, 20)
(352075,)
(117359,)


In [17]:
# Show the first row of `df`.
# NOTE(review): this cell carries an out-of-order execution count (In [17]) and
# its saved output lists the raw column names (e.g. 'EVI_SG_linear', 'ID'), so
# it does not describe the `df` built in the cells above - re-run after
# Restart & Run All.
df.iloc[0]

EVI_SG_linear       0.0818455
Evapo             -0.00134076
ID                      61018
LST_DAILY                4.95
LST_Diff                 5.36
NDVI_SG_linear       0.247624
TI                    9.72459
Tair                  6.91391
api                   17.2963
clay                      9.4
date               2013-05-20
elevation             468.604
lat                   68.3302
lon                   27.5506
network                   FMI
omc                   37.6694
porosity             0.762264
sand                     55.5
silt                     35.1
Preci             0.000852346
Name: SAA111, dtype: object