# Analysis of Random Forest — Notebook Version

This analysis is run as a notebook because Spyder kept crashing.

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 12 09:32:37 2024

@author: krish
"""

#%% Libraries

import numpy as np
import pandas as pd
import os 
import sys
sys.path.append('/Users/krish/Desktop/DYNAMIC MODEL VEGETATION PROJECT/au_dyanamic_vegetation_project/STEP9_DATA_MODELLING_AND_EXPLORATION')
import matplotlib.pyplot as plt

from joblib import load
from sklearn.inspection import permutation_importance
from sklearn.metrics import root_mean_squared_error, mean_squared_error

In [2]:
#%% Functions

def plotPredictions(df, TARGET, n_folds = 10, directory_plot_output = '', msg = '', split = '', fire_split = ''):
    """Plot observed versus modelled series for each target, per fold and aggregated.

    Two figures are created, each with one panel per entry of ``TARGET``:

    1. Per-fold figure: the observed column plus every fold prediction, read
       from the columns ``f'{target}_prediction_{k}'`` for ``k = 1..n_folds``.
    2. Summary figure: the observed column plus the columns
       ``f'{prefix}_mean'`` and ``f'{prefix}_median'``, where ``prefix`` is the
       text of the target name before its first underscore.

    Parameters
    ----------
    df : object indexable by column name whose columns expose ``.plot(ax=...)``
        Must contain the observed, per-fold, mean and median columns above.
    TARGET : iterable of str
        Names of the observed columns; one panel is drawn per name.
    n_folds : int, default 10
        Number of per-fold prediction columns to draw for each target.
    directory_plot_output : str, default ''
        If non-empty, the summary figure is saved to this path and then closed.
    msg : str, default ''
        Figure super-title used on both figures.
    split : iterable, default ''
        x positions marked with black dashed vertical lines on every panel.
    fire_split : iterable, default ''
        x positions marked with red dashed vertical lines on every panel.

    Returns
    -------
    None

    Notes
    -----
    Only the summary figure is written to ``directory_plot_output``; the
    per-fold figure is left open so it can still be displayed.
    NOTE(review): confirm whether the per-fold figure should be saved as well.
    """
    targets = list(TARGET)
    # One panel per target rather than a fixed 3; keep at least one row so
    # plt.subplots does not reject an empty target list.
    n_rows = max(len(targets), 1)

    def _new_figure():
        # squeeze=False always yields a 2-D axes array, so a single target
        # works the same way as several.
        fig, axes = plt.subplots(nrows = n_rows, figsize = (15,10), squeeze = False)
        fig.suptitle(msg, fontsize=30)
        return fig, axes[:, 0]

    def _finish_panel(axis):
        # Legend, grid and optional event markers shared by both figures.
        axis.legend()
        axis.grid(True)
        if split:
            for s in split:
                axis.axvline(s, color='black', ls='--')
        if fire_split:
            for f in fire_split:
                axis.axvline(f, color='red', ls='--')

    # Figure 1: observed series against each fold's prediction.
    fold_fig, fold_axes = _new_figure()
    for axis, target in zip(fold_axes, targets):
        prefix = target.split("_")[0]
        df[target].plot(ax=axis, color = 'blue', alpha = 0.4, linestyle='dashed', label = f'Observed {prefix}')
        for fold in range(1, n_folds + 1):
            # Fold number in the label keeps the legend entries distinguishable.
            df[f'{target}_prediction_{fold}'].plot(ax=axis, ylim = (0,100), label = f'Modelled {prefix} (fold {fold})')
        _finish_panel(axis)

    # Figure 2: observed series against the across-fold mean and median.
    summary_fig, summary_axes = _new_figure()
    for axis, target in zip(summary_axes, targets):
        prefix = target.split("_")[0]
        df[target].plot(ax=axis, color = 'blue', alpha = 0.4, linestyle='dashed', label = f'Observed {prefix}')
        df[f'{prefix}_mean'].plot(ax=axis, ylim = (0,100), label = f'Modelled {prefix} (mean)')
        df[f'{prefix}_median'].plot(ax=axis, ylim = (0,100), label = f'Modelled {prefix} (median)')
        _finish_panel(axis)

    if directory_plot_output:
        # Address the summary figure explicitly instead of relying on
        # matplotlib's implicit "current figure".
        summary_fig.savefig(fname = directory_plot_output)
        plt.close(summary_fig)
        
def make_directory(path):
    """Create ``path`` (including missing parents) if it does not exist yet.

    Parameters
    ----------
    path : str or path-like
        Directory to create.

    Returns
    -------
    bool
        True if the directory was created by this call, False if something
        already existed at ``path``.

    Raises
    ------
    OSError
        Any error from ``os.makedirs`` other than the path already existing
        (for example an empty path or missing permissions).
    """
    # Attempt the creation directly rather than checking first: a separate
    # exists() check can race with another process creating the same folder.
    try:
        os.makedirs(path)
    except FileExistsError:
        return False
    return True

In [3]:
 
#%% Features
# Only the target variables needed for this analysis.
TARGET = ['pv_filter', 'npv_filter', 'bs_filter']

# Root folder of the stored random-forest results (one sub-folder per super group).
# Superseded location, kept for reference only (it used to be assigned and then
# immediately overwritten):
#   'D:/Krish_New/Dynamic_Vegetation_Project_Storage/Random_Forest_Results_On_Super_Group_Results'
results_dir = 'C:/Users/krish/Desktop/DYNAMIC MODEL VEGETATION PROJECT/au_dyanamic_vegetation_project/RESULTS/Random_Forest_Results_On_Super_Group_Results_new'

# Root folder of the modelled training/validation/test CSVs (one sub-folder per super group).
directory = 'C:/Users/krish/Desktop/DYNAMIC MODEL VEGETATION PROJECT/au_dyanamic_vegetation_project/DATASETS/MODELLED_TRAINING_DATA'


# Super group names; any '/' is replaced by '_' when building folder names.
super_group_list = ['Desert Chenopod', 'Desert Forb', 'Desert Hummock.grass',
       'Desert Shrub', 'Desert Tree.Palm', 'Desert Tussock.grass',
       'Temp/Med Shrub', 'Temp/Med Tree.Palm', 'Temp/Med Tussock.grass',
       'Tropical/Savanna Tree.Palm', 'Tropical/Savanna Tussock.grass']

# Total number of super groups, used as the denominator in progress messages.
max_counter = len(super_group_list)

In [4]:
# For every super group and every stored K-Fold random forest: predict on the
# group's test set, run a permutation-importance test on that same test set,
# and write the per-feature mean/std importances to a CSV.

# Seed passed to permutation_importance so the shuffles are repeatable.
random_state = 20240808
targeted_group = super_group_list
# Number of shuffles per feature; also used in the output file name below.
n_repeats = 100

# Legacy flat record, keyed by fold index (0-based) only: every super group
# overwrites the previous one, so after the loop it describes the LAST group.
# Kept unchanged in case later cells read it.
features_included = dict()
# Complete record: super group name -> fold number (1-based) -> feature names.
features_included_by_group = dict()

for counter, s in enumerate(targeted_group):

    # Progress is reported against the list actually being iterated.
    print(f'Obtaining results from {s} sites ({counter + 1}/{len(targeted_group)})')
    # '/' cannot appear in a folder name, so it is replaced by '_'.
    super_group_folder_name = s.replace('/', '_')
    directory_val = f'{directory}/{super_group_folder_name}/Validation'
    number_of_folds = len(os.listdir(directory_val)) # note: this assumes that each entry in this directory is a K-Fold

    # NOTE(review): training_set is not used in this cell — confirm whether later cells need it.
    training_set = pd.read_csv(f'{directory}/{super_group_folder_name}/Training/{super_group_folder_name}_Train_Set.csv',
                               index_col = ['time'], parse_dates = ['time'])
    test_set = pd.read_csv(f'{directory}/{super_group_folder_name}/Test/{super_group_folder_name}_Test_Set.csv',
                               index_col = ['time'], parse_dates = ['time'])

    # Output folder depends only on the super group, so create it once per group.
    var_path = f'{results_dir}/{super_group_folder_name}/Results/Variable_Importances'
    make_directory(var_path)

    features_included_by_group[s] = dict()

    for i in range(number_of_folds):
        folder_num = i+1

        print(f'Load the random forest ({folder_num}/{number_of_folds})')

        # joblib.load unpickles the file; only load models from a trusted location.
        rf_model = load(f'{results_dir}/{super_group_folder_name}/Results/KFold_{folder_num}/Random_Forest.joblib')

        # One prediction column per target, tagged with the fold number.
        pred_names = [f'{t}_prediction_{folder_num}' for t in TARGET]

        test_set[pred_names] = rf_model.predict(X = test_set[rf_model.feature_names_in_])
        print(rf_model.feature_names_in_)

        features_included[i] = list(rf_model.feature_names_in_)
        features_included_by_group[s][folder_num] = list(rf_model.feature_names_in_)

        #Get permutation importances
        print(f'Run permutation test with RF ({folder_num}/{number_of_folds})')
        perm_importance = permutation_importance(rf_model, test_set[rf_model.feature_names_in_], test_set[TARGET],
                                                 n_repeats=n_repeats, random_state = random_state,
                                                 scoring = 'neg_mean_squared_error')

        # Rows = features, columns = (mean, std) of the importance over the repeats.
        arr_importances = np.array([list(perm_importance['importances_mean']), list(perm_importance['importances_std'])]).T
        perm_importance_df_2 = pd.DataFrame(arr_importances, columns = ['importances_mean', 'importances_std'], index = rf_model.feature_names_in_)
        perm_importance_df_2 = perm_importance_df_2.sort_values('importances_mean', ascending = True)

        # File name follows n_repeats so it cannot drift from the value actually used.
        perm_importance_df_2.to_csv(f'{var_path}/KFold_{folder_num}_RF_VariableImportance_{n_repeats}repeats.csv')

        #test_pred_path = f'{results_dir}/{super_group_folder_name}/Results/Test_Predictions'
        #make_directory(test_pred_path)
        #test_set.to_csv(f'{test_pred_path}/test_predictions.csv')
    
        

Obtaining results from Desert Chenopod sites (1/11)
Load the random forest (1/10)
['MAP' 'CO2' 'SLGA_3' 'aspect_1s' 'VPD_30' 'DER_000_999' 'precip_365'
 'precip_90' 'VPD_7' 'tmax_30']
Run permutation test with RF (1/10)
Load the random forest (2/10)
['MAP' 'VPD_30' 'SLGA_3' 'aspect_1s' 'tmax_30' 'VPD_14' 'VPD_lag'
 'fire_severity' 'tmax_14' 'tmin_14']
Run permutation test with RF (2/10)
Load the random forest (3/10)
['MAP' 'CO2' 'SLGA_2' 'MAT' 'aspect_1s' 'VPD_30' 'photoperiod' 'precip_90'
 'precip_365' 'precip_180' 'tmin_7' 'VPD_7']
Run permutation test with RF (3/10)
Load the random forest (4/10)
['VPD_30' 'CO2' 'SLGA_3' 'twi_1s' 'DER_000_999' 'VPD_14' 'precip_90'
 'tmin_lag' 'tmin_7' 'precip_180' 'VPD_lag']
Run permutation test with RF (4/10)
Load the random forest (5/10)
['MAP' 'CO2' 'SLGA_3' 'days_since_fire']
Run permutation test with RF (5/10)
Load the random forest (6/10)
['MAP' 'CO2' 'SLGA_3' 'SLGA_2' 'VPD_7' 'precip_90' 'VPD_14']
Run permutation test with RF (6/10)
Load the r