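This notebook is the canonical demonstration of the Random Forest regression approach used in this project. It emulates the following run of `run_experiments.py` (note that the configuration cell below currently loads the dragonfly data set; the plant settings matching this command are kept there, commented out):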

    python3 run_experiments.py ../data/plants5.csv migration_m --algo RF --cats oceanity dispersal_mode BreedSysCode Grime --drop Taxon migr_sterr_m "shift + 2SE" signif_shift signif_shift2 dispmode01 DispModeEng "shift + 2SE" --benchmark --save --na feature

In [1]:
import TraitData

In [19]:
# Plant-data configuration, kept for reference but disabled. It corresponds
# to the `run_experiments.py` command line quoted in the introduction
# (plants5.csv / migration_m). Uncomment this block and comment out the
# dragonfly block below to switch data sets.
# DATAFILE = "../data/plants5.csv"
# RESPONSE = "migration_m"
# CATEGORIES = [
#      'oceanity',
#      'dispersal_mode',
#      'BreedSysCode',
#      'Grime'
# ]
# DROPVARS = [
#     "Taxon",
#     "migr_sterr_m",
#     "shift + 2SE",
#     "signif_shift",
#     "signif_shift2",
#     "DispModeEng"
# ]

# Active configuration: dragonfly data set.
DATAFILE = "../data/Dragonflies01.csv"
# Name of the response column passed to TraitData.
RESPONSE = "Change_range_size"
# Categorical columns. In the column listing printed by this cell they show
# up as indicator columns such as 'habitat_old_1' or 'R_type_S'.
CATEGORIES = [
    "habitat_old",
    "R_size_qual",
    "R_type"
]
# Columns to discard; none of these appear in the column listing printed by
# this cell.
DROPVARS = [
    "Fliers",
    "habitat_eggs",
    "mass_migrants",
    "Taxon",
    "PhyloCode",
    "Margin_shift_km"
]
DROPNA = 1 ## drop features (0 for drop samples)

# Arguments are positional; their meaning is defined by TraitData.TraitData,
# which is not shown in this notebook.
data = TraitData.TraitData(DATAFILE,
                           RESPONSE,
                           DROPVARS, 
                           CATEGORIES,
                           DROPNA)
# NOTE(review): presumably 0.30 is the held-out (test) fraction - confirm in
# TraitData.train_test_split. The four returned pieces are not used by the
# cells visible below, which work on data.X / data.Y directly.
X, x_test, Y, y_test = data.train_test_split(0.30)


# Cell output: names of the feature columns after TraitData's processing.
data.X.columns.values

array(['body_size', 'Fliers01', 'eggs01', 'flight_start', 'flight_end',
       'flight_length', 'gen_time', 'migrants01', 'Bio1_mean', 'Bio1_std',
       'Bio1_var', 'LentLot01', 'LentLot', 'LentLot2', 'LentLot3',
       'LentLot4', 'Sbound_lat', 'Nbound_lat', 'Lat_center',
       'R_size_lat_span', 'R_size_count', 'habitat_old_1', 'habitat_old_2',
       'habitat_old_3', 'habitat_old_4', 'habitat_old_5', 'habitat_old_6',
       'habitat_old_7', 'R_size_qual_L', 'R_size_qual_M', 'R_type_S',
       'R_type_U'], dtype=object)

In [20]:
from sklearn.model_selection import KFold, cross_val_score, LeaveOneOut

def kFoldCV(model, features, target, K=5):
    """
        Estimate the mean squared error of `model` by K-fold
        cross-validation (K defaults to 5).

        Folds are produced by KFold(K). The scorer yields the negated MSE
        of each fold, so the mean of the fold scores is sign-flipped
        before it is returned.
    """
    splitter = KFold(K)
    fold_scores = cross_val_score(model, features, target,
                                  cv=splitter,
                                  scoring='neg_mean_squared_error',
                                  n_jobs=1)
    return -fold_scores.mean()
def LeaveOneOutCV(model, features, target):
    """
        Estimate the mean squared error of `model` by leave-one-out
        cross-validation.

        The scorer yields the negated MSE of each split, so the mean of
        the scores is sign-flipped before it is returned.
    """
    split_scores = cross_val_score(model, features, target,
                                   cv=LeaveOneOut(),
                                   scoring="neg_mean_squared_error")
    return -split_scores.mean()

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import scale
# Fix the seed: a random forest is stochastic (bootstrap samples and random
# feature subsets), so without it the cross-validation errors and feature
# importances computed from this model change on every kernel restart.
baseModel = RandomForestRegressor(random_state=0)

In [22]:

# 5-fold cross-validated MSE of the base model on the full, standardised data.
print("MSE on all data (5-fold): ",
      kFoldCV(model=baseModel, features=scale(data.X), target=data.Y))

MSE on all data (5-fold):  1670.54166667


In [23]:
# Leave-one-out cross-validated MSE of the base model on the same data.
print("MSE on all data (LOO): ",
      LeaveOneOutCV(model=baseModel, features=scale(data.X), target=data.Y))

MSE on all data (LOO):  1521.60606061


For random forests we can also look at feature importances:

In [24]:
import numpy as np
def importances(weights, features):
    """
    Pair every feature name with its weight, largest weight first.

    Parameters
    ----------
    weights : array-like
        One numeric weight per column of `features`, in column order
        (e.g. a fitted model's `feature_importances_`). Lists and tuples
        are accepted as well as numpy arrays.
    features : object with a `.columns` attribute exposing `.values`
        Typically a pandas DataFrame; only its column names are used.

    Returns
    -------
    list of (column_name, weight) tuples
        Sorted by descending weight.

    Raises
    ------
    AssertionError
        If the number of weights differs from the number of columns.
    """
    # The index-array lookup below only works on an ndarray; a plain list
    # or tuple would raise TypeError, so coerce first (an ndarray is
    # passed through unchanged).
    weights = np.asarray(weights)
    assert len(weights) == len(features.columns), \
        "weights and features.columns must have the same length"
    # argsort is ascending; reverse once to get most-important-first order.
    descending = np.argsort(weights)[::-1]
    return list(zip(features.columns.values[descending],
                    weights[descending]))

In [25]:
# Fit on the full standardised data set, then list the features from most
# to least important (the list is this cell's displayed value).
baseModel.fit(scale(data.X), data.Y)
importances(weights=baseModel.feature_importances_, features=data.X)

[('flight_length', 0.2898488568849687),
 ('R_size_count', 0.24482376258651958),
 ('flight_end', 0.1760610783259744),
 ('Bio1_var', 0.060456648832816498),
 ('Bio1_std', 0.031237713814253061),
 ('R_size_lat_span', 0.030618950133644497),
 ('body_size', 0.0304760725827103),
 ('LentLot', 0.027472154363892755),
 ('R_size_qual_M', 0.027218436015662435),
 ('flight_start', 0.015074684622513018),
 ('Lat_center', 0.012562950779918286),
 ('eggs01', 0.012199375104313969),
 ('migrants01', 0.010934398003344786),
 ('Nbound_lat', 0.0085579090052472113),
 ('Bio1_mean', 0.0081502726246154272),
 ('Sbound_lat', 0.0072886144302551723),
 ('LentLot3', 0.00167625076392753),
 ('habitat_old_1', 0.0010866866274594992),
 ('habitat_old_3', 0.0010612966852310065),
 ('R_type_U', 0.00087334157677592501),
 ('habitat_old_7', 0.00077452898583509435),
 ('habitat_old_6', 0.00059328765326791325),
 ('LentLot2', 0.00037254410091915166),
 ('R_type_S', 0.00032364862508210159),
 ('habitat_old_2', 0.00015007269957764926),
 ('gen_