In [2]:
!pip install pandas==1.1.5 &> /dev/nul
!pip install pickle-mixin &> /dev/nul

In [20]:
import numpy as np
import pandas as pd
import pickle as pckl 
from sklearn.preprocessing import RobustScaler
from sklearn.manifold import TSNE
from google.colab import auth
import logging.config
import sys
import difflib as dl

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

In [4]:
auth.authenticate_user()

# Import Data
from google.colab import drive
drive.mount('/content/drive')
root = "/content/drive/My Drive/Colab Notebooks/Nino Causal Models/Data/" 
data = pd.read_csv(root+"FCDO_data.csv") # Import prepared data

Mounted at /content/drive


In [5]:
class Prepare_Data:
    def __init__(self, data):
        """
        Class function to preprocess data to data that can we used for feature engineering. Imports the raw data and exports data that 
        immediatly fits random forest model. Since we are using random forest model, it is not necessary to scale data. Random forest models
        ares also able to deal with colinearity within features. Also since there are so many columns we do not rename them yet.
        :param :
        :param :
        """
            # Create logger
        log_format = '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s - %(message)s'
        logging.basicConfig(format=log_format, level=logging.INFO, stream=sys.stdout)
        logger = logging.getLogger()
        self.data = data
        self.logger = logging.getLogger(__name__)

    def prepare_data(self):
        """
        This functions starts the pre processing activities for the random forest model
        :return:
        """
        self.logger.info("start preprocessing data")

        # Change the object types
        self.change_object_type()
        # Deal with awkward column string names
        self.deal_with_string_column_names()
        # Add new features based on range
        self.include_range_data()
        # Deal with missing data
        self.deal_with_missing_data()

        return self.data

    def change_object_type(self):
        """
        Since all the data are floats, we convert the data type.
        :return:
        """

        self.logger.info("start initiating data")
        data = self.data

        # Eliminate all rows where there is no sales price present
        data.replace(['--'], [np.nan], inplace=True)  # replace -- by np.nan
        data.loc[:, data.columns != 'ADM3_EN'] = data.loc[:, data.columns != 'ADM3_EN'].apply(pd.to_numeric, errors='raise')
        self.data=data

    def deal_with_string_column_names(self):
        """
        This functions adjusts column names.
        :return:
        """

        self.logger.info("deal with missing data")
        data = self.data
        
        # #Strip symbols from column names for efficient column selection
        data.columns = data.columns.str.replace("'","")
        data.columns = data.columns.str.replace("''","")
        data.columns = data.columns.str.replace("(","")
        data.columns = data.columns.str.replace(")","")

        self.data = data

    def include_range_data(self):
        """
        Create new features based on range between the minimum and the maximum value of that feature.
        :return:
        """

        self.logger.info("Start adding range features.")
        data = self.data
        
        # Find columns minimums and maximums. (Note: there are no min/max columns (without capital letter))
        min_cols = [col for col in data.columns if 'Min' in col]
        max_cols = [col for col in data.columns if 'Max' in col]
        
        # Find which column from min corresponds with max: #https://docs.python.org/3/library/difflib.html#difflib.get_close_matches
        for min_col in min_cols:
            max_col = dl.get_close_matches(min_col, max_cols, n=1)[0]
            rangename=min_col.replace('Min','Range')
            data[rangename] = data[max_col]-data[min_col]

        self.data=data
        
    def deal_with_missing_data(self):
        """
        This functions deals with missing data and replaces it with the nodes. 
        :return:
        """

        self.logger.info("deal with missing data")
        data = self.data
        
        #Replace inf by NaN values
        data.replace([np.inf, -np.inf], np.nan, inplace=True)
        
        #Fill NaN values
        data.fillna(data.mean(), inplace=True)
        
        self.data = data

In [6]:
class nonlinear_dimensionality_reduction:
  def __init__(self, data_to_reduce, variables):
    self.data = data_to_reduce[variables]

  def transform_data(self):
    #value = (value – median) / (p75 – p25)
    transformer = RobustScaler().fit(self.data)
    data_transformed = transformer.transform(self.data)
    return data_transformed

  def embed_data(self):
    # Just create one feature (n_component) out of list of features based on TSNE by minimizing the divergence between two distributions: https://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf
    embeddingTSNE = TSNE(n_components=1, init='pca', random_state=0, perplexity=50.0, early_exaggeration=12.0, learning_rate=200.0, 
                     n_iter=10000, n_iter_without_progress=300, min_grad_norm=1e-07, metric='euclidean', verbose=0, method='barnes_hut', 
                     angle=0.5, n_jobs=-1)
    embedded_data = embeddingTSNE.fit_transform(nonlinear_dimensionality_reduction.transform_data(self))
    kl_divergence = embeddingTSNE.kl_divergence_
    return embedded_data, kl_divergence

In [7]:
pre_processing = Prepare_Data(data)
prepared_data = pre_processing.prepare_data()

2021-12-21 16:17:46,947 - __main__ - INFO - prepare_data - start preprocessing data
2021-12-21 16:17:46,949 - __main__ - INFO - change_object_type - start initiating data
2021-12-21 16:17:47,225 - __main__ - INFO - deal_with_string_column_names - deal with missing data
2021-12-21 16:17:47,230 - __main__ - INFO - include_range_data - Start adding range features.
2021-12-21 16:17:47,274 - numexpr.utils - INFO - _init_num_threads - NumExpr defaulting to 2 threads.
2021-12-21 16:17:48,703 - __main__ - INFO - deal_with_missing_data - deal with missing data


In [8]:
list_of_variables = [['spi_3_m_2', 'spi_6_m_2', 'spi_12_m_2', 'spi_24_m_2'],
               ['et_actl_m_MIN_m_2', 'et_actl_m_MAX_m_2', 'et_actl_m_MEAN_m_2', 'et_actl_m_STD_m_2'], 
               ['et_anom_m_MIN_m_2', 'et_anom_m_MAX_m_2', 'et_anom_m_MEAN_m_2', 'et_anom_m_STD_m_2'],
               ['rurpop_s_2', 'urbpop_s_2', 'rurratio_s_2'],
               ['locdensity_y_2', 'loccount_y_2'],
               ['spam_P_i_sum_s_2', 'spam_P_i_max_s_2', 'spam_P_i_avg_s_2', 'spam_P_i_ws_s_2', 'spam_P_r_sum_s_2', 'spam_P_r_max_s_2', 'spam_P_r_avg_s_2', 'spam_P_r_ws_s_2', 'rainfed_s_2', 'spam_P_t_sum_s_2', 'spam_V_agg_i_sum_s_2', 'spam_V_agg_i_max_s_2', 'spam_V_agg_i_avg_s_2', 'spam_V_agg_i_ws_s_2', 'spam_V_agg_r_sum_s_2', 'spam_V_agg_r_max_s_2', 'spam_V_agg_r_avg_s_2', 'spam_V_agg_r_ws_s_2', 'spam_V_agg_t_sum_s_2'],
               ['Cropland2000_mean_percent_s_2'],
               ['yield_gap_barley_s_2', 'yield_gap_maize_s_2', 'yield_gap_rice_s_2', 'yield_gap_soybean_s_2'], 
               ['Pasture2000_mean_percent_s_2'],
               ['buffalo_number_s_2', 'cattle_number_s_2', 'chicken_number_s_2', 'duck_number_s_2', 'goat_number_s_2', 'horse_number_s_2', 'pig_number_s_2', 'sheep_number_s_2'],
               ['DeliveredkcalFraction_s_2']]

names = ['spi', 'et_actl', 'et_anom', 'rural_urban_pop', 'local_pop', 'agriculture', 'cropland', 'crops', 'pasture', 'livestock', 'delivered_calories']

In [9]:
non_predictive_columns = ['fatalities, Riots', 'fatalities, Battles', 'fatalities, Protests',
                                      'Battles','Explosions/Remote violence','Protests','Riots','Strategic developments',
                                      'Violence against civilians','total_event_types','fatalities, Explosions/Remote violence',
                                      'fatalities, Strategic developments','fatalities, Violence against civilians','total_fatalities, ']

predictive_variables = prepared_data.loc[:, ~prepared_data.columns.isin(non_predictive_columns)]
predictive_variables.set_index('ADM3_EN', inplace=True)
names = ['Water_Runoff','Coastal','Landslide','Storm_Surface','Riverine','Precipitation'
             ,'Evapotranspiration','Skin_Reservoir','Evaporation','Soil_Temperature'
             ,'Radiative_Temperature','Soil_Water','LeafArea','Soil_Level','Latent_Heat'
             ,'Soil_Level','Soil_Heat','Wind_Speed','Soil_Moisture','Surface_Pressure'
             ,'Vapor_Pressure', 'Groundwater_Runoff','Sensible_Heat','Humidity','Dew'
             ,'Density','Heatwave','Water_Deficit','Surface_Air','Temperature']

grouped_columns_category = []
leftover_features = predictive_variables.columns
for i in range(0,len(names)):
    grouped_features = [col for col in predictive_variables.columns if names[i] in col]
    grouped_columns_category.append(grouped_features)
    leftover_features = [feature for feature in leftover_features if feature not in grouped_features]
print(leftover_features)

# Only leftover features are food related hence:
grouped_columns_category.append(leftover_features)
names.append('Food')

grouped_columns_category
names

['Cropland_Sum', 'Cropland_SD', 'Pasture_Sum', 'Pasture_SD', 'Cattle_Sum', 'Cattle_SD', 'Chicken_Sum', 'Chicken_SD', 'Ducks_Sum', 'Ducks_SD', 'Goats_Sum', 'Goats_SD', 'Pigs_Sum', 'Pigs_SD', 'Sheep_Sum', 'Sheep_SD']


['Water_Runoff',
 'Coastal',
 'Landslide',
 'Storm_Surface',
 'Riverine',
 'Precipitation',
 'Evapotranspiration',
 'Skin_Reservoir',
 'Evaporation',
 'Soil_Temperature',
 'Radiative_Temperature',
 'Soil_Water',
 'LeafArea',
 'Soil_Level',
 'Latent_Heat',
 'Soil_Level',
 'Soil_Heat',
 'Wind_Speed',
 'Soil_Moisture',
 'Surface_Pressure',
 'Vapor_Pressure',
 'Groundwater_Runoff',
 'Sensible_Heat',
 'Humidity',
 'Dew',
 'Density',
 'Heatwave',
 'Water_Deficit',
 'Surface_Air',
 'Temperature',
 'Food']

In [10]:
#Alternative non-linear dimensionality reduction methods have also been tried. Another alternative to t-distributed stochastic neighbor embedding (t-SNE) that is also stochastic is variational autoencoders (VAE).
#VAEs have more potential, but are also more complex, and their parameterization can also be a source of error. Since the t-SNE method gave better results, we settled on the t-SNE method.
  
data_list = []
listed_errors = []
for i in range(len(grouped_columns_category)):
  print(grouped_columns_category[i])
  variables = grouped_columns_category[i]
  data_reduction = nonlinear_dimensionality_reduction(prepared_data, grouped_columns_category[i])
  reduced_data, reconstruction_error = data_reduction.embed_data()
  data_list.append(reduced_data)
  listed_errors.append(reconstruction_error)

['IDAHO_EPSCOR_TERRACLIMATE_Max_Water_Runoff', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Water_Runoff', 'IDAHO_EPSCOR_TERRACLIMATE_Mean_Water_Runoff', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Mean_Water_Runoff', 'IDAHO_EPSCOR_TERRACLIMATE_Min_Water_Runoff', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Water_Runoff', 'IDAHO_EPSCOR_TERRACLIMATE_Range_Water_Runoff', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Range_Water_Runoff']




['inuncoast_historical_nosub_hist_Coastal_Flood_Max', 'inuncoast_historical_nosub_hist_Coastal_Flood_SD_Max', 'inuncoast_historical_nosub_hist_Coastal_Flood_Min', 'inuncoast_historical_nosub_hist_Coastal_Flood_SD_Min', 'inunriverine_historical_nosub_hist_Coastal_Riverine_SD_Max', 'inunriverine_historical_nosub_hist_Coastal_Riverine_SD_Min', 'inuncoast_historical_nosub_hist_Coastal_Flood_Range', 'inuncoast_historical_nosub_hist_Coastal_Flood_SD_Range', 'inunriverine_historical_nosub_hist_Coastal_Riverine_SD_Range']




['hazard_ls_arup_Landslide_Max', 'hazard_ls_arup_Landslide_SD_Max', 'hazard_ls_arup_Landslide_Mean', 'hazard_ls_arup_Landslide_SD_Mean', 'hazard_ls_arup_Landslide_Min', 'hazard_ls_arup_Landslide_SD_Min', 'hazard_ls_arup_Landslide_Range', 'hazard_ls_arup_Landslide_SD_Range']




['NASA_NOAH01_Max_Storm_Surface_Runoff', 'NASA_NOAH01_SD_Max_Storm_Surface_Runoff', 'NASA_NOAH01_Mean_Storm_Surface_Runoff', 'NASA_NOAH01_SD_Mean_Storm_Surface_Runoff', 'NASA_NOAH01_Min_Storm_Surface_Runoff', 'NASA_NOAH01_SD_Min_Storm_Surface_Runoff', 'NASA_NOAH01_Range_Storm_Surface_Runoff', 'NASA_NOAH01_SD_Range_Storm_Surface_Runoff']




['inunriverine_historical_nosub_hist_Riverine_Flood_Max', 'inunriverine_historical_nosub_hist_Coastal_Riverine_SD_Max', 'inunriverine_historical_nosub_hist_Riverine_Flood_Min', 'inunriverine_historical_nosub_hist_Coastal_Riverine_SD_Min', 'inunriverine_historical_nosub_hist_Riverine_Flood_Range', 'inunriverine_historical_nosub_hist_Coastal_Riverine_SD_Range']




['GPM_v6_Max_Precipitation', 'GPM_v6_SD_Max_Precipitation', 'GPM_v6_Max_Quality_Precipitation', 'GPM_v6_SD_Max_Quality_Precipitation', 'GPM_v6_Mean_Precipitation', 'GPM_v6_SD_Mean_Precipitation', 'GPM_v6_Mean_Quality_Precipitation', 'GPM_v6_SD_Mean_Quality_Precipitation', 'GPM_v6_Min_Precipitation', 'GPM_v6_SD_Min_Precipitation', 'GPM_v6_Min_Quality_Precipitation', 'GPM_v6_SD_Min_Quality_Precipitation', 'IDAHO_EPSCOR_TERRACLIMATE_Max_Precipitation_Accumulation', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Precipitation_Accumulation', 'IDAHO_EPSCOR_TERRACLIMATE_Mean_Precipitation_Accumulation', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Mean_Precipitation_Accumulation', 'IDAHO_EPSCOR_TERRACLIMATE_Min_Precipitation_Accumulation', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Precipitation_Accumulation', 'NASA_NOAH01_Max_Total_Precipitation_Rate', 'NASA_NOAH01_SD_Max_Total_Precipitation_Rate', 'NASA_NOAH01_Mean_Total_Precipitation_Rate', 'NASA_NOAH01_SD_Mean_Total_Precipitation_Rate', 'NASA_NOAH01_Min_Total_Precipitation_R



['FAO_WAPOR_Max_Actual_Evapotranspiration', 'FAO_WAPOR_SD_Max_Actual_Evapotranspiration', 'FAO_WAPOR_Mean_Actual_Evapotranspiration', 'FAO_WAPOR_SD_Mean_Actual_Evapotranspiration', 'FAO_WAPOR_Min_Actual_Evapotranspiration', 'FAO_WAPOR_SD_Min_Actual_Evapotranspiration', 'FMODIS006_MOD16A2_Max_Total_Evapotranspiration', 'FMODIS006_MOD16A2_SD_Max_Total_Evapotranspiration', 'FMODIS006_MOD16A2_Max_Total_Potential_Evapotranspiration', 'FMODIS006_MOD16A2_SD_Max_Total_Potential_Evapotranspiration', 'FMODIS006_MOD16A2_Min_Total_Evapotranspiration', 'FMODIS006_MOD16A2_SD_Min_Total_Evapotranspiration', 'FMODIS006_MOD16A2_Min_Total_Potential_Evapotranspiration', 'FMODIS006_MOD16A2_SD_Min_Total_Potential_Evapotranspiration', 'IDAHO_EPSCOR_TERRACLIMATE_Max_Actual_Evapotranspiration', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Actual_Evapotranspiration', 'IDAHO_EPSCOR_TERRACLIMATE_Max_Reference_Evapotranspiration', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Reference_Evapotranspiration', 'IDAHO_EPSCOR_TERRACLIMATE_Mea



['ECMWF_ERA5_LAND_MONTHLY_Max_Skin_Reservoir', 'ECMWF_ERA5_SD_LAND_MONTHLY_Max_Skin_Reservoir', 'ECMWF_ERA5_LAND_MONTHLY_Mean_Skin_Reservoir', 'ECMWF_ERA5_SD_LAND_MONTHLY_Mean_Skin_Reservoir', 'ECMWF_ERA5_LAND_MONTHLY_Min_Skin_Reservoir', 'ECMWF_ERA5_SD_LAND_MONTHLY_Min_Skin_Reservoir', 'ECMWF_ERA5_LAND_MONTHLY_Range_Skin_Reservoir', 'ECMWF_ERA5_SD_LAND_MONTHLY_Range_Skin_Reservoir']




['ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Bare', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Bare', 'ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Potential', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Potential', 'ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Top_Canopy', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Top_Canopy', 'ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Total', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Total', 'ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Veget_Transpire', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Veget_Transpire', 'ECMWF_ERA5_LAND_MONTHLY_Max_Evaporation_Water_Excluding_Oceans', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Evaporation_Water_Excluding_Oceans', 'ECMWF_ERA5_LAND_MONTHLY_Mean_Evaporation_Bare', 'ECMWF_ERA5_LAND_MONTHLY_SD_Mean_Evaporation_Bare', 'ECMWF_ERA5_LAND_MONTHLY_Mean_Evaporation_Potential', 'ECMWF_ERA5_LAND_MONTHLY_SD_Mean_Evaporation_Potential', 'ECMWF_ERA5_LAND_MONTHLY_Mean_Evaporation_Top_Canopy', 'ECMWF_ERA5_LAND_MONTHLY_SD_Mean_Evaporation_Top_Canopy'



['NASA_NOAH01_Max_Soil_Temperature_0cm_10cm', 'NASA_NOAH01_SD_Max_Soil_Temperature_0cm_10cm', 'NASA_NOAH01_Max_Soil_Temperature_100cm_200cm', 'NASA_NOAH01_SD_Max_Soil_Temperature_100cm_200cm', 'NASA_NOAH01_Max_Soil_Temperature_10cm_40cm', 'NASA_NOAH01_SD_Max_Soil_Temperature_10cm_40cm', 'NASA_NOAH01_Max_Soil_Temperature_40cm_100cm', 'NASA_NOAH01_SD_Max_Soil_Temperature_40cm_100cm', 'NASA_NOAH01_Mean_Soil_Temperature_0cm_10cm', 'NASA_NOAH01_SD_Mean_Soil_Temperature_0cm_10cm', 'NASA_NOAH01_Mean_Soil_Temperature_100cm_200cm', 'NASA_NOAH01_SD_Mean_Soil_Temperature_100cm_200cm', 'NASA_NOAH01_Mean_Soil_Temperature_10cm_40cm', 'NASA_NOAH01_SD_Mean_Soil_Temperature_10cm_40cm', 'NASA_NOAH01_Mean_Soil_Temperature_40cm_100cm', 'NASA_NOAH01_SD_Mean_Soil_Temperature_40cm_100cm', 'NASA_NOAH01_Min_Soil_Temperature_0cm_10cm', 'NASA_NOAH01_SD_Min_Soil_Temperature_0cm_10cm', 'NASA_NOAH01_Min_Soil_Temperature_100cm_200cm', 'NASA_NOAH01_SD_Min_Soil_Temperature_100cm_200cm', 'NASA_NOAH01_Min_Soil_Temperatu



['NASA_NOAH01_Max_Surface_Radiative_Temperature', 'NASA_NOAH01_SD_Max_Surface_Radiative_Temperature', 'NASA_NOAH01_Mean_Surface_Radiative_Temperature', 'NASA_NOAH01_SD_Mean_Surface_Radiative_Temperature', 'NASA_NOAH01_Min_Surface_Radiative_Temperature', 'NASA_NOAH01_SD_Min_Surface_Radiative_Temperature', 'NASA_NOAH01_Range_Surface_Radiative_Temperature', 'NASA_NOAH01_SD_Range_Surface_Radiative_Temperature']




['ECMWF_ERA5_LAND_MONTHLY_Max_Layer1_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Layer1_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_Max_Layer2_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Layer2_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_Max_Layer3_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Layer3_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_Max_Layer4_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Layer4_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_Mean_Layer1_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_SD_Mean_Layer1_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_Mean_Layer2_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_SD_Mean_Layer2_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_Mean_Layer3_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_SD_Mean_Layer3_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_Mean_Layer4_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_SD_Mean_Layer4_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_Min_Layer1_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_SD_Min_Layer1_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_Min_Layer2_Soil_Water', 'ECMWF_ERA5_LAND_MONTHLY_SD_Min_L



['ECMWF_ERA5_LAND_MONTHLY_Max_High_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_High_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_Max_Low_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Low_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_Mean_High_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_SD_Mean_High_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_Mean_Low_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_SD_Mean_Low_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_Min_High_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_SD_Min_High_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_Min_Low_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_SD_Min_Low_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_Range_High_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_SD_Range_High_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_Range_Low_LeafArea', 'ECMWF_ERA5_LAND_MONTHLY_SD_Range_Low_LeafArea']




['ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_Max_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_SD_Max_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_Mean_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_SD_Mean_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_Min_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_SD_Min_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_Max_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_SD_Max_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_Mean_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_SD_Mean_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_Min_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_SD_Min_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level3_Max_Temperature_28cm_100cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level3_SD_Max_Temperature_28cm_100cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level3_Mean_Temperature_28cm_100cm', 'ECMWF_ERA5_L



['FMODIS006_MOD16A2_Max_Average_Latent_Heat_Flux', 'FMODIS006_MOD16A2_SD_Max_Average_Latent_Heat_Flux', 'FMODIS006_MOD16A2_Max_Average_Potential_Latent_Heat_Flux', 'FMODIS006_MOD16A2_SD_Max_Average_Potential_Latent_Heat_Flux', 'FMODIS006_MOD16A2_Min_Average_Latent_Heat_Flux', 'FMODIS006_MOD16A2_SD_Min_Average_Latent_Heat_Flux', 'FMODIS006_MOD16A2_Min_Average_Potential_Latent_Heat_Flux', 'FMODIS006_MOD16A2_SD_Min_Average_Potential_Latent_Heat_Flux', 'NASA_NOAH01_Max_Latent_Heat_Net_Flux', 'NASA_NOAH01_SD_Max_Latent_Heat_Net_Flux', 'NASA_NOAH01_Mean_Latent_Heat_Net_Flux', 'NASA_NOAH01_SD_Mean_Latent_Heat_Net_Flux', 'NASA_NOAH01_Min_Latent_Heat_Net_Flux', 'NASA_NOAH01_SD_Min_Latent_Heat_Net_Flux', 'FMODIS006_MOD16A2_Range_Average_Latent_Heat_Flux', 'FMODIS006_MOD16A2_SD_Range_Average_Latent_Heat_Flux', 'FMODIS006_MOD16A2_Range_Average_Potential_Latent_Heat_Flux', 'FMODIS006_MOD16A2_SD_Range_Average_Potential_Latent_Heat_Flux', 'NASA_NOAH01_Range_Latent_Heat_Net_Flux', 'NASA_NOAH01_SD_Rang



['ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_Max_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_SD_Max_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_Mean_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_SD_Mean_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_Min_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_SD_Min_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_Max_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_SD_Max_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_Mean_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_SD_Mean_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_Min_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level2_SD_Min_Temperature_7cm_28cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level3_Max_Temperature_28cm_100cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level3_SD_Max_Temperature_28cm_100cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level3_Mean_Temperature_28cm_100cm', 'ECMWF_ERA5_L



['NASA_NOAH01_Max_Soil_Heat_Flux', 'NASA_NOAH01_SD_Max_Soil_Heat_Flux', 'NASA_NOAH01_Mean_Soil_Heat_Flux', 'NASA_NOAH01_SD_Mean_Soil_Heat_Flux', 'NASA_NOAH01_Min_Soil_Heat_Flux', 'NASA_NOAH01_SD_Min_Soil_Heat_Flux', 'NASA_NOAH01_Range_Soil_Heat_Flux', 'NASA_NOAH01_SD_Range_Soil_Heat_Flux']




['IDAHO_EPSCOR_TERRACLIMATE_Max_Wind_Speed_At10m', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Wind_Speed_At10m', 'IDAHO_EPSCOR_TERRACLIMATE_Mean_Wind_Speed_At10m', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Mean_Wind_Speed_At10m', 'IDAHO_EPSCOR_TERRACLIMATE_Min_Wind_Speed_At10m', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Wind_Speed_At10m', 'NASA_NOAH01_Max_Surface_Wind_Speed', 'NASA_NOAH01_SD_Max_Surface_Wind_Speed', 'NASA_NOAH01_Mean_Surface_Wind_Speed', 'NASA_NOAH01_SD_Mean_Surface_Wind_Speed', 'NASA_NOAH01_Min_Surface_Wind_Speed', 'NASA_NOAH01_SD_Min_Surface_Wind_Speed', 'IDAHO_EPSCOR_TERRACLIMATE_Range_Wind_Speed_At10m', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Range_Wind_Speed_At10m', 'NASA_NOAH01_Range_Surface_Wind_Speed', 'NASA_NOAH01_SD_Range_Surface_Wind_Speed']




['IDAHO_EPSCOR_TERRACLIMATE_Max_Soil_Moisture', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Soil_Moisture', 'IDAHO_EPSCOR_TERRACLIMATE_Mean_Soil_Moisture', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Mean_Soil_Moisture', 'IDAHO_EPSCOR_TERRACLIMATE_Min_Soil_Moisture', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Soil_Moisture', 'NASA_NOAH01_Max_Soil_Moisture_0cm_10cm', 'NASA_NOAH01_SD_Max_Soil_Moisture_0cm_10cm', 'NASA_NOAH01_Max_Soil_Moisture_100cm_200cm', 'NASA_NOAH01_SD_Max_Soil_Moisture_100cm_200cm', 'NASA_NOAH01_Max_Soil_Moisture_10cm_40cm', 'NASA_NOAH01_SD_Max_Soil_Moisture_10cm_40cm', 'NASA_NOAH01_Max_Soil_Moisture_40cm_100cm', 'NASA_NOAH01_SD_Max_Soil_Moisture_40cm_100cm', 'NASA_NOAH01_Mean_Soil_Moisture_0cm_10cm', 'NASA_NOAH01_SD_Mean_Soil_Moisture_0cm_10cm', 'NASA_NOAH01_Mean_Soil_Moisture_100cm_200cm', 'NASA_NOAH01_SD_Mean_Soil_Moisture_100cm_200cm', 'NASA_NOAH01_Mean_Soil_Moisture_10cm_40cm', 'NASA_NOAH01_SD_Mean_Soil_Moisture_10cm_40cm', 'NASA_NOAH01_Mean_Soil_Moisture_40cm_100cm', 'NASA_NOAH01_SD_Mean_Soil_



['NASA_NOAH01_Max_Surface_Pressure', 'NASA_NOAH01_SD_Max_Surface_Pressure', 'NASA_NOAH01_Mean_Surface_Pressure', 'NASA_NOAH01_SD_Mean_Surface_Pressure', 'NASA_NOAH01_Min_Surface_Pressure', 'NASA_NOAH01_SD_Min_Surface_Pressure', 'NASA_NOAH01_Range_Surface_Pressure', 'NASA_NOAH01_SD_Range_Surface_Pressure']




['IDAHO_EPSCOR_TERRACLIMATE_Max_Vapor_Pressure', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Vapor_Pressure', 'IDAHO_EPSCOR_TERRACLIMATE_Max_Vapor_Pressure_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Vapor_Pressure_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_Mean_Vapor_Pressure', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Mean_Vapor_Pressure', 'IDAHO_EPSCOR_TERRACLIMATE_Mean_Vapor_Pressure_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Mean_Vapor_Pressure_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_Min_Vapor_Pressure', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Vapor_Pressure', 'IDAHO_EPSCOR_TERRACLIMATE_Min_Vapor_Pressure_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Vapor_Pressure_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_Range_Vapor_Pressure', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Range_Vapor_Pressure', 'IDAHO_EPSCOR_TERRACLIMATE_Range_Vapor_Pressure_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Range_Vapor_Pressure_Deficit']




['NASA_NOAH01_Max_Baseflow_Groundwater_Runoff', 'NASA_NOAH01_SD_Max_Baseflow_Groundwater_Runoff', 'NASA_NOAH01_Mean_Baseflow_Groundwater_Runoff', 'NASA_NOAH01_SD_Mean_Baseflow_Groundwater_Runoff', 'NASA_NOAH01_Min_Baseflow_Groundwater_Runoff', 'NASA_NOAH01_SD_Min_Baseflow_Groundwater_Runoff', 'NASA_NOAH01_Range_Baseflow_Groundwater_Runoff', 'NASA_NOAH01_SD_Range_Baseflow_Groundwater_Runoff']




['NASA_NOAH01_Max_Sensible_Heat_Net_Flux', 'NASA_NOAH01_SD_Max_Sensible_Heat_Net_Flux', 'NASA_NOAH01_Mean_Sensible_Heat_Net_Flux', 'NASA_NOAH01_SD_Mean_Sensible_Heat_Net_Flux', 'NASA_NOAH01_Min_Sensible_Heat_Net_Flux', 'NASA_NOAH01_SD_Min_Sensible_Heat_Net_Flux', 'NASA_NOAH01_Range_Sensible_Heat_Net_Flux', 'NASA_NOAH01_SD_Range_Sensible_Heat_Net_Flux']




['NASA_NOAH01_Max_Specific_Humidity', 'NASA_NOAH01_SD_Max_Specific_Humidity', 'NASA_NOAH01_Mean_Specific_Humidity', 'NASA_NOAH01_SD_Mean_Specific_Humidity', 'NASA_NOAH01_Min_Specific_Humidity', 'NASA_NOAH01_SD_Min_Specific_Humidity', 'NASA_NOAH01_Range_Specific_Humidity', 'NASA_NOAH01_SD_Range_Specific_Humidity']




['ECMWF_ERA5_LAND_MONTHLY_Dew_Max_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_SD_Dew_Max_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_Dew_Mean_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_SD_Dew_Mean_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_Dew_Min_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_SD_Dew_Min_Temperature_2m', 'ECMWF_ERA5_MONTHLY_100_Percentile_Max_Dew_Temperature', 'ECMWF_ERA5_MONTHLY_SD_Max_Dew_Temperature', 'ECMWF_ERA5_MONTHLY_Mean_Dew_Temperature', 'ECMWF_ERA5_MONTHLY_SD_Mean_Dew_Temperature', 'ECMWF_ERA5_MONTHLY_0_Percentile_Min_Dew_Temperature', 'ECMWF_ERA5_MONTHLY_SD_Min_Dew_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_Dew_Range_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_SD_Dew_Range_Temperature_2m', 'ECMWF_ERA5_MONTHLY_0_Percentile_Range_Dew_Temperature', 'ECMWF_ERA5_MONTHLY_SD_Range_Dew_Temperature']




['Chicken_Ext_Density', 'Chicken_Ext_Density_SD', 'Chicken_Int_Density', 'Chicken_Int_Density_SD']




['geonode_gfdrrlab_Heatwave_intensity_returnperiod5y_Max', 'geonode_gfdrrlab_Heatwave_intensity_returnperiod5y_SD_Max', 'geonode_gfdrrlab_Heatwave_intensity_returnperiod5y_Min', 'geonode_gfdrrlab_Heatwave_intensity_returnperiod5y_SD_Min', 'geonode_gfdrrlab_Heatwave_intensity_returnperiod5y_Range', 'geonode_gfdrrlab_Heatwave_intensity_returnperiod5y_SD_Range']




['IDAHO_EPSCOR_TERRACLIMATE_Max_Climate_Water_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Max_Climate_Water_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_Mean_Climate_Water_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Mean_Climate_Water_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_Min_Climate_Water_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Min_Climate_Water_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_Range_Climate_Water_Deficit', 'IDAHO_EPSCOR_TERRACLIMATE_SD_Range_Climate_Water_Deficit']




['NASA_NOAH01_Max_Surface_Air_Temperature', 'NASA_NOAH01_SD_Max_Surface_Air_Temperature', 'NASA_NOAH01_Mean_Surface_Air_Temperature', 'NASA_NOAH01_SD_Mean_Surface_Air_Temperature', 'NASA_NOAH01_Min_Surface_Air_Temperature', 'NASA_NOAH01_SD_Min_Surface_Air_Temperature', 'NASA_NOAH01_Range_Surface_Air_Temperature', 'NASA_NOAH01_SD_Range_Surface_Air_Temperature']




['ECMWF_ERA5_LAND_MONTHLY_Dew_Max_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_SD_Dew_Max_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_Dew_Mean_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_SD_Dew_Mean_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_Dew_Min_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_SD_Dew_Min_Temperature_2m', 'ECMWF_ERA5_LAND_MONTHLY_100_Percentile_Max_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_SD_Max_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_Mean_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_SD_Mean_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_0_Percentile_Min_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_SD_Min_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_Skin_Max_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_Skin_SD_Max_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_Skin_Mean_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_Skin_SD_Mean_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_Skin_Min_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_Skin_SD_Min_Temperature', 'ECMWF_ERA5_LAND_MONTHLY_Soil_Level1_Max_Temperature_0cm_7cm', 'ECMWF_ERA5_LAND_MONTHLY_Soil



['Cropland_Sum', 'Cropland_SD', 'Pasture_Sum', 'Pasture_SD', 'Cattle_Sum', 'Cattle_SD', 'Chicken_Sum', 'Chicken_SD', 'Ducks_Sum', 'Ducks_SD', 'Goats_Sum', 'Goats_SD', 'Pigs_Sum', 'Pigs_SD', 'Sheep_Sum', 'Sheep_SD']




In [11]:
#Moreover, for each dimension (i.e., 'spi', 'et_actl', 'et_anom', 'rural_urban_pop', 'local_pop', 'agriculture', 'cropland', 'crops', 'pasture', 'livestock', 'delivered_calories'),
#reconstruction errors (i.e., Kullback-Leibler divergences) can be inspected upon reduction, but are not saved for later inspection.

listed_errors

[0.35377514362335205,
 0.1377606838941574,
 0.3676789700984955,
 0.1898609697818756,
 0.44450438022613525,
 0.7073799967765808,
 0.8255006670951843,
 0.473099946975708,
 0.8945324420928955,
 0.4635704457759857,
 0.3988777995109558,
 0.9546933770179749,
 0.8684432506561279,
 0.8103822469711304,
 0.6931465864181519,
 0.8103822469711304,
 0.4457632899284363,
 0.8461177349090576,
 0.7624890804290771,
 0.4164785146713257,
 0.5187272429466248,
 0.3218629062175751,
 0.8436087369918823,
 0.533602237701416,
 0.6787208318710327,
 0.7451314926147461,
 0.6882805228233337,
 0.5763139724731445,
 0.33205607533454895,
 0.533986508846283,
 -44.46063232421875]

In [12]:
reduced_features = pd.DataFrame(np.concatenate(data_list, axis=1), columns = names)
reduced_features

Unnamed: 0,Water_Runoff,Coastal,Landslide,Storm_Surface,Riverine,Precipitation,Evapotranspiration,Skin_Reservoir,Evaporation,Soil_Temperature,Radiative_Temperature,Soil_Water,LeafArea,Soil_Level,Latent_Heat,Soil_Level.1,Soil_Heat,Wind_Speed,Soil_Moisture,Surface_Pressure,Vapor_Pressure,Groundwater_Runoff,Sensible_Heat,Humidity,Dew,Density,Heatwave,Water_Deficit,Surface_Air,Temperature,Food
0,-9.785743,13.071511,-29.065561,-4.913549,6.574868,3.072558,-5.588446,-2.096827,3.877077,-5.948453,0.997379,2.913165,-7.723192,-5.287311,-10.494808,-5.287311,-11.066824,-0.558169,8.208737,-10.066192,12.081740,3.269737,-4.578318,11.598210,4.583045,-152.199020,18.577457,-6.741825,-4.564607,-4.090545,-7.880719e+36
1,-4.451058,4.962664,-23.626535,9.772707,0.807747,8.971795,-1.891566,-8.279265,5.505229,-0.574631,3.279706,5.816660,0.336246,1.458213,-4.301021,1.458213,4.864165,7.786690,-1.550001,0.436954,-4.797209,-5.704494,-1.902246,2.015493,2.307190,-161.501495,-4.089179,-1.686300,1.700426,2.290412,-7.880719e+36
2,-11.909466,11.712390,-29.582335,0.864251,6.113179,4.052352,-4.988898,-0.585704,3.359438,-5.382941,2.164553,2.320751,-7.690580,-3.499210,-11.179235,-3.499210,-12.173788,-2.301916,-12.775447,-4.481901,-8.562257,4.870506,-3.752325,-1.404066,4.024570,-150.578903,19.041609,-3.968739,-1.682609,-3.251885,-7.880719e+36
3,7.758137,15.811904,-16.693804,7.044371,10.137346,-7.126973,3.069797,4.485955,-6.747284,6.822594,10.565229,-6.824016,11.223374,9.457995,-2.520536,9.457995,-0.211180,2.437684,4.177406,8.717208,7.169052,10.985909,6.127324,-6.695104,-3.842552,-147.193863,-7.609262,7.824786,9.714209,8.119581,-7.880719e+36
4,-6.743314,-8.275314,-28.289974,-6.074180,-9.680875,6.680084,-7.311238,-8.077782,4.487056,-10.362889,-6.248820,1.274628,4.741628,-4.936482,-9.594470,-4.936482,10.050441,-8.056935,-3.178369,-7.056735,-4.403980,-8.238238,-0.136558,4.283571,7.555860,-150.895477,1.023686,-7.457350,-9.680089,-6.696109,-7.880719e+36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,-3.214485,2.588127,105.697723,3.687900,-3.637111,0.374221,-1.623459,-8.998659,-9.054413,-7.769559,-4.215418,7.750851,-3.169237,0.225873,-11.177357,0.225873,-9.105123,8.656487,-10.935192,-5.695917,-6.857419,1.831641,-2.797554,4.991419,-0.201390,-150.613419,-0.547425,-11.837461,-8.512124,-1.034662,-7.880719e+36
264,-2.088866,-7.943555,-24.605865,3.153948,-10.300833,-2.507312,0.813194,-7.614503,-5.791650,0.804240,4.276148,-1.525895,-8.203878,2.128263,0.726843,2.128263,-7.577359,0.450656,0.294572,2.718399,3.836237,4.952452,5.546475,8.837021,0.647893,-161.825607,5.863933,3.777467,3.053351,3.148316,-7.880719e+36
265,-6.108713,5.823532,106.002380,-9.769992,1.107011,2.518385,-5.584548,0.394428,2.808425,-2.500868,-1.489980,3.658881,-6.656016,-3.369817,-12.277737,-3.369817,11.834444,-2.220685,-6.867601,-11.812325,-7.775321,-7.856731,-7.869524,1.087065,2.833279,-163.234970,9.920849,-3.587361,-2.612592,-2.351717,-7.880719e+36
266,8.186574,-4.825428,-19.113449,11.183458,1.367947,-9.118014,4.459762,8.672816,-6.498763,9.030070,11.925438,-6.072948,11.463962,8.737226,-2.361354,8.737226,-1.162143,4.136829,6.227865,9.420376,8.175014,10.051997,6.654664,-6.426256,-7.458625,-154.061096,-9.073239,9.911867,11.274250,10.432647,-7.880719e+36


In [15]:
non_predictive_data = prepared_data.loc[:, prepared_data.columns.isin(non_predictive_columns)]

In [16]:
frames = [reduced_features, non_predictive_data]
df_cross_section = pd.concat(frames, axis = 1)
df_cross_section 

Unnamed: 0,Water_Runoff,Coastal,Landslide,Storm_Surface,Riverine,Precipitation,Evapotranspiration,Skin_Reservoir,Evaporation,Soil_Temperature,Radiative_Temperature,Soil_Water,LeafArea,Soil_Level,Latent_Heat,Soil_Level.1,Soil_Heat,Wind_Speed,Soil_Moisture,Surface_Pressure,Vapor_Pressure,Groundwater_Runoff,Sensible_Heat,Humidity,Dew,Density,Heatwave,Water_Deficit,Surface_Air,Temperature,Food,Battles,Explosions/Remote violence,Protests,Riots,Strategic developments,Violence against civilians,total_event_types,"fatalities, Battles","fatalities, Explosions/Remote violence","fatalities, Protests","fatalities, Riots","fatalities, Strategic developments","fatalities, Violence against civilians","total_fatalities,"
0,-9.785743,13.071511,-29.065561,-4.913549,6.574868,3.072558,-5.588446,-2.096827,3.877077,-5.948453,0.997379,2.913165,-7.723192,-5.287311,-10.494808,-5.287311,-11.066824,-0.558169,8.208737,-10.066192,12.081740,3.269737,-4.578318,11.598210,4.583045,-152.199020,18.577457,-6.741825,-4.564607,-4.090545,-7.880719e+36,0,0,4,0,0,2,6,0,0,0,0,0,2,2
1,-4.451058,4.962664,-23.626535,9.772707,0.807747,8.971795,-1.891566,-8.279265,5.505229,-0.574631,3.279706,5.816660,0.336246,1.458213,-4.301021,1.458213,4.864165,7.786690,-1.550001,0.436954,-4.797209,-5.704494,-1.902246,2.015493,2.307190,-161.501495,-4.089179,-1.686300,1.700426,2.290412,-7.880719e+36,6,4,0,0,1,2,13,3,5,0,0,0,1,9
2,-11.909466,11.712390,-29.582335,0.864251,6.113179,4.052352,-4.988898,-0.585704,3.359438,-5.382941,2.164553,2.320751,-7.690580,-3.499210,-11.179235,-3.499210,-12.173788,-2.301916,-12.775447,-4.481901,-8.562257,4.870506,-3.752325,-1.404066,4.024570,-150.578903,19.041609,-3.968739,-1.682609,-3.251885,-7.880719e+36,10,36,1,0,7,7,61,31,30,0,0,0,9,70
3,7.758137,15.811904,-16.693804,7.044371,10.137346,-7.126973,3.069797,4.485955,-6.747284,6.822594,10.565229,-6.824016,11.223374,9.457995,-2.520536,9.457995,-0.211180,2.437684,4.177406,8.717208,7.169052,10.985909,6.127324,-6.695104,-3.842552,-147.193863,-7.609262,7.824786,9.714209,8.119581,-7.880719e+36,0,1,2,0,1,0,4,0,2,0,0,0,0,2
4,-6.743314,-8.275314,-28.289974,-6.074180,-9.680875,6.680084,-7.311238,-8.077782,4.487056,-10.362889,-6.248820,1.274628,4.741628,-4.936482,-9.594470,-4.936482,10.050441,-8.056935,-3.178369,-7.056735,-4.403980,-8.238238,-0.136558,4.283571,7.555860,-150.895477,1.023686,-7.457350,-9.680089,-6.696109,-7.880719e+36,4,5,3,13,2,5,32,4,0,0,1,0,3,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,-3.214485,2.588127,105.697723,3.687900,-3.637111,0.374221,-1.623459,-8.998659,-9.054413,-7.769559,-4.215418,7.750851,-3.169237,0.225873,-11.177357,0.225873,-9.105123,8.656487,-10.935192,-5.695917,-6.857419,1.831641,-2.797554,4.991419,-0.201390,-150.613419,-0.547425,-11.837461,-8.512124,-1.034662,-7.880719e+36,0,0,1,0,0,0,1,0,0,0,0,0,0,0
264,-2.088866,-7.943555,-24.605865,3.153948,-10.300833,-2.507312,0.813194,-7.614503,-5.791650,0.804240,4.276148,-1.525895,-8.203878,2.128263,0.726843,2.128263,-7.577359,0.450656,0.294572,2.718399,3.836237,4.952452,5.546475,8.837021,0.647893,-161.825607,5.863933,3.777467,3.053351,3.148316,-7.880719e+36,18,18,0,0,14,2,52,97,61,0,0,3,1,162
265,-6.108713,5.823532,106.002380,-9.769992,1.107011,2.518385,-5.584548,0.394428,2.808425,-2.500868,-1.489980,3.658881,-6.656016,-3.369817,-12.277737,-3.369817,11.834444,-2.220685,-6.867601,-11.812325,-7.775321,-7.856731,-7.869524,1.087065,2.833279,-163.234970,9.920849,-3.587361,-2.612592,-2.351717,-7.880719e+36,45,15,4,0,4,5,73,35,4,0,0,0,5,44
266,8.186574,-4.825428,-19.113449,11.183458,1.367947,-9.118014,4.459762,8.672816,-6.498763,9.030070,11.925438,-6.072948,11.463962,8.737226,-2.361354,8.737226,-1.162143,4.136829,6.227865,9.420376,8.175014,10.051997,6.654664,-6.426256,-7.458625,-154.061096,-9.073239,9.911867,11.274250,10.432647,-7.880719e+36,1,3,1,0,0,0,5,0,9,0,0,0,0,9


In [28]:
df_cross_section 
df_cross_section.to_csv(root+"FCDO_data_dim_reduced.csv") # Import prepared data

### **Run Random Forest on Dimensionality reduced data**

In [21]:
class ForecastModel:
    def __init__(self, data):
        """
        :param data:
        """
        self.logger = logging.getLogger(__name__)
        self.fatalities_columns = ['Year Made','Machine Size','Model Description','Auctioneer ID', 'Model ID','Coupler']
        self.data = data
        self.dependent_variable = 'total_fatalities, '
        self.non_prediction_variables = ['ADM3_EN','total_fatalities, ']
        self.feature_performance = pd.DataFrame(columns={'Feature_name', 'Feature_importance'})

    def calculate(self):
        """
        This functions starts the individual functions for running and evaluating the machine learning model
        :return:
        """
        self.logger.info("start training and predicting module")

        # Split the data set into train and test
        self.split_data()
        # train the model
        self.fit_model()
        # Make predictions after training
        self.predict_model()
        # Return the R2 ans RMSE performance of the model
        self.return_performance()
        # Return the performance of the individual features
        self.feature_scoring()

        return self.R2, self.RMSE, self.feature_performance

    def split_data(self):
        """
        Splits data into training and test (use 'rule of thumb' 4/5 for training)
        :return:
        """
        self.logger.info("start splitting data")
        X = self.data.loc[:,~(self.data.columns.isin(self.non_prediction_variables))]
        y = self.data.loc[:,self.dependent_variable]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=3)

    def fit_model(self):
        """
        Fit the model
        :return:
        """
        self.logger.info("start fitting model")
        # No additional parameters given
        model = RandomForestRegressor()
        model.fit(self.X_train , self.y_train)
        self.model=model

    def predict_model(self):
        """
        Make predictions with the trained model.
        """
        self.logger.info("start predicting model")
        self.y_test_predict = self.model.predict(self.X_test)

    def return_performance(self):
        """
        Return overal performance in the form of RMSE and R2 score.
        """
        self.logger.info("start returning performance")
        self.RMSE = (np.sqrt(mean_squared_error(self.y_test, self.y_test_predict)))
        self.R2 = (r2_score(self.y_test, self.y_test_predict))

    def feature_scoring(self):
        """
        Return feature performance by inbuilt function.
        """
        self.logger.info("start returning feature scoring")
        feature_performance = self.feature_performance
        feature_performance['Feature_importance'] = self.model.feature_importances_
        for i in range(0,len(feature_performance)):
            feature_performance.loc[i,'Feature_name']=self.X_train.columns[i]

        self.feature_performance=feature_performance



In [26]:
acled_stripped_dim_reduced =df_cross_section.loc[:, ~df_cross_section.columns.isin(['fatalities, Riots', 'fatalities, Battles', 'fatalities, Protests',
                                      'Battles','Explosions/Remote violence','Protests','Riots','Strategic developments',
                                      'Violence against civilians','total_event_types','fatalities, Explosions/Remote violence',
                                      'fatalities, Strategic developments','fatalities, Violence against civilians'])]

RF_model = ForecastModel(acled_stripped_dim_reduced)
R2, RMSE, feature_performance=RF_model.calculate()

2021-12-21 16:27:58,348 - __main__ - INFO - calculate - start training and predicting module
2021-12-21 16:27:58,356 - __main__ - INFO - split_data - start splitting data
2021-12-21 16:27:58,367 - __main__ - INFO - fit_model - start fitting model
2021-12-21 16:27:59,594 - __main__ - INFO - predict_model - start predicting model
2021-12-21 16:27:59,627 - __main__ - INFO - return_performance - start returning performance
2021-12-21 16:27:59,636 - __main__ - INFO - feature_scoring - start returning feature scoring


In [27]:
feature_performance

Unnamed: 0,Feature_importance,Feature_name
0,0.140146,Water_Runoff
1,0.021254,Coastal
2,0.02604,Landslide
3,0.021641,Storm_Surface
4,0.016791,Riverine
5,0.064362,Precipitation
6,0.009271,Evapotranspiration
7,0.031424,Skin_Reservoir
8,0.031573,Evaporation
9,0.013696,Soil_Temperature


In [36]:
feature_performance.sort_values('Feature_importance', ascending=False, inplace=True)
feature_performance.loc[:7,'Feature_name'].to_list()

['Water_Runoff',
 'Soil_Level',
 'Soil_Water',
 'Precipitation',
 'Soil_Level',
 'Groundwater_Runoff',
 'Temperature',
 'Latent_Heat',
 'Soil_Moisture',
 'Evaporation',
 'Skin_Reservoir']