In [1]:
import numpy as np 
import math
import pandas as pd 
from pathlib import Path
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, LeaveOneOut, cross_val_score

In [2]:
# load all datasets
def load_created_datasets():
    data_list = []
    DATA_PATH = Path('./data/created/')
    
    file_paths = DATA_PATH.glob('*.csv')
    for file_path in file_paths:
        df = pd.read_csv(file_path, index_col='Country')
        data_list.append(df)
    return pd.concat(data_list, join='inner', axis=1)
complete_data = load_created_datasets()

In [3]:
columns_remove  = ['Decreasing', 'Increasing', 'Stable', 'Unnamed: 0']
rf_data = complete_data.copy()
rf_data = rf_data.drop(columns=columns_remove)
rf_data.shape

(42, 40)

## Random Forest Regression
We perform random forest regression here to predict the outcome of our relative threatened species for all groups combined and for each group separately. As a random forest is a tree partitioning algorithm it does by nature not need any scaling of the data beforehand. After the results of our correlation analysis we start with the full model here.
As the number of samples in our remaining data frame is quite limited, we use leave one out cross validation to measure the performance of our regressor rather than a train/test split.Because of the high dimension of the data, we take the square root of the total number of features to pick for every split.

In [None]:
def rf_predict_threatened_relative(data):
    results = {}

    # iterate all target variables
    columns_threatened = [col for col in data.columns if col.endswith('threatened')]
    for target in columns_threatened:
        y = data[target]
        X = data.drop(columns=columns_threatened)


        # n_estimator(number of trees) is the hyperparameter that we try to optimize here
        n_estimators = [int(x) for x in np.linspace(start = 50, stop = 500, num = 10)]
        grid = random_grid = {'n_estimators': n_estimators}
        rf=RandomForestRegressor(max_features = 'sqrt', random_state=0)
        rf_grid = GridSearchCV(estimator = rf, param_grid = random_grid,  cv = LeaveOneOut(),  scoring='neg_mean_squared_error', refit='neg_mean_squared_error')
        rf_grid.fit(X,y)    
        results[target] = round(math.sqrt(-rf_grid.best_score_),5)

    return results
RMSE_rf = rf_predict_threatened_relative(complete_data)
RMSE_rf

## Conclusion
As performance metric we used the RMSE here. As the range of our threatened species percentage of all groups is from 1.7%-16.7%, a RMSE of 5% is a pretty big offset. For amphibians the RMSE even suggests a mean 16% offset (maximum for all targets). These values indicate rather bad predictions that suggest that our model is not very well suited to predict the percentage of threatened animals in a country.