
## DESCRIPTION

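This notebook fits a Random Survival Forest (RSF) to the cleaned dataset, tunes its hyperparameters with a randomized search, and saves the best model.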

***
### SETUP

Load the setup script

In [1]:
from utils.common_setup import *

Import the cleaned dataset

In [2]:
# Locate the data directory one level above the current working directory
data_folder = os.path.join(os.getcwd(), os.pardir, 'data')

# Read the cleaned dataset into a DataFrame
clean_data_path = os.path.join(data_folder, 'clean_data.csv')
data = pd.read_csv(clean_data_path)

### DEFINE HYPERPARAMETERS

In [3]:

# Search space sampled by the randomized hyperparameter search.

# Number of trees in random forest: 10 evenly spaced values from 100 to 1250
n_estimators = np.linspace(start = 100, stop = 1250, num = 10).astype(int).tolist()

# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' and is rejected by recent scikit-learn /
# scikit-survival releases, so it is replaced by a distinct, valid option.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None lets the trees grow without a depth cap)
max_depth = [*np.linspace(10, 60, num = 10).astype(int).tolist(), None]

# Minimum number of samples required to split a node
min_samples_split = [10, 20, 40, 60]

# Minimum number of samples required at each leaf node
min_samples_leaf = [10, 20, 40, 60]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

### SELECT FEATURES

In [5]:
# Predictor columns, grouped by the variable they summarise.
# The order is significant: it fixes the column order of the feature matrix.
features = [
    # air quality
    'aqhi_average_og',
    # temperature
    'tavg_average_og', 'tavg_max', 'tavg_skew',
    # wind speed
    'wspd_average_og', 'wspd_skew', 'wspd_max',
    # vegetation index
    'ndvi_skew', 'ndvi_max', 'ndvi_average',
    # precipitation
    'prcp_average_og', 'prcp_skew', 'prcp_max',
]

# Number of rows per season, displayed as the cell output
data['season'].value_counts()

season
2023    84056
2022    22640
2021     5930
Name: count, dtype: int64

Prepare features object

In [None]:
# Keep only the rows whose next-season outcome and hive age are both known.
# `data` itself is filtered (not just a derived frame) because the survival
# target is later built from `data` and must stay row-aligned with X.
# Previously the region dummies came from the unfiltered frame, so the
# index-aligned concat below brought every dropped row back with NaN features.
# Re-running this cell is a no-op: the rows are already dropped and already bool.
data = data.dropna(subset = ['death_next_season', 'hive_age_next_season'], axis = 0).copy()

# Convert y to boolean
data['death_next_season'] = data['death_next_season'].astype(bool)

# One-hot encoding of region, computed on the retained rows only
gr = pd.get_dummies(data['region'])

# Features object adding region; missing feature values are replaced by 0
X = pd.concat([data[features].fillna(0), gr], axis = 1)

#### SPLIT DATA

- Train and test
- Transform

In [6]:
# Build the survival target as a structured array with an event flag ('cens')
# and a time field ('time'). The rows are taken from `data` by X's index so
# features and outcomes are matched row-for-row, and the columns are copied
# in bulk instead of through a slow row-wise DataFrame.apply.
target_rows = data.loc[X.index, ['death_next_season', 'hive_age_next_season']]
y = np.empty(len(target_rows), dtype = [('cens', '?'), ('time', '<f8')])
y['cens'] = target_rows['death_next_season'].to_numpy(dtype = bool)
y['time'] = target_rows['hive_age_next_season'].to_numpy(dtype = float)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.15,
    random_state = 8
)

# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split
mx = StandardScaler()
X_train = mx.fit_transform(X_train)
X_test = mx.transform(X_test)

### FIT THE RANDOM SURVIVAL FOREST

In [7]:
# Base estimator. The seed makes the bootstrap samples and feature draws
# repeatable, so the selected model can be reproduced on a re-run
# (the search below was already seeded, the forest was not).
rf = RandomSurvivalForest(random_state = 8)

# Randomized search: 5 parameter settings sampled from random_grid,
# each evaluated with 3-fold cross-validation on all available cores
rf_random = RandomizedSearchCV(
    estimator = rf,
    param_distributions = random_grid,
    n_iter = 5,
    cv = 3,
    verbose = 1,
    random_state = 8,
    n_jobs = -1,
)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


Save the best model

In [None]:
# Write the best estimator to <parent of working directory>/outputs
project_root = os.path.dirname(os.getcwd())
output_dir = os.path.join(project_root, 'outputs')

# Create the output directory if needed; joblib.dump does not create it
os.makedirs(output_dir, exist_ok = True)

# NOTE(review): the doubled ".pkl.pkl" extension looks like a typo. It is kept
# as-is because other code may load the model by this exact name - confirm.
filename = os.path.join(output_dir, "best_RSF.pkl.pkl")

joblib.dump(rf_random.best_estimator_, filename)

Retrieve best model

In [None]:
# Take the best estimator selected by the randomized search
rsf = rf_random.best_estimator_