
## DESCRIPTION

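This notebook fits a series of generalized linear mixed models (GLMMs), selects the one with the lowest AIC among those that fitted without warnings, and saves it.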

***
### SETUP

Load the setup script

In [20]:
from utils.common_setup import *

Load the functions to:

- Fit the models
- Save the models
- Load the models

In [3]:
from utils.functions import fit_models, save_fitted_models, load_fitted_models

Import the cleaned dataset

In [4]:
# The cleaned dataset is read from the `data` folder one level above the current working directory
data_folder = os.path.join(os.getcwd(), '..', 'data')
clean_data_path = os.path.join(data_folder, 'clean_data.csv')
data = pd.read_csv(clean_data_path)

### IF PackageNotInstalledError: The R package "lme4" is not installed. 


See the README for instructions, or install the package from Python with rpy2:


```
from rpy2.robjects.packages import importr


utils = importr('utils')
utils.chooseCRANmirror(ind=12)
utils.install_packages('lme4')
```

### FIT GLMM

### Define model formulas

In [5]:
# Candidate GLMM formulas (lme4-style syntax): fixed effects followed by a
# random-effects term in parentheses, grouped by `region` or `operation_id`.
#
# NOTE(review): the "modelN" labels in the comments are historical and do not
# match list positions (see "model", "model7a", and the gaps at 3, 8, 13, 32).
# The outputs recorded in this notebook suggest the fitted models are keyed by
# list position instead: the best model is reported as "model27", and its
# formula is the entry labelled "model29" here (index 27). Confirm against
# fit_models. The `[i]` prefix on each label is the 0-based list index.
model_formulas = [
    # [0] model0
    "death_next_season ~ 1 + (1|region)",

    # [1] model
    "death_next_season ~ aqhi_average * ndvi_average + (aqhi_average|region)",

    # [2] model2
    "death_next_season ~ aqhi_average + ndvi_average + (aqhi_average|region)",

    # [3] model4
    "death_next_season ~ aqhi_average + ndvi_average + tavg_average + prcp_average + wspd_average + (aqhi_average|region)",

    # [4] model5
    "death_next_season ~ aqhi_average * wspd_average + ndvi_average + tavg_average + prcp_average + (aqhi_average|region)",

    # [5] model6
    "death_next_season ~ aqhi_average * wspd_average * tavg_average + (aqhi_average|region)",

    # [6] model7a
    "death_next_season ~ aqhi_average * wspd_average * ndvi_average + (ndvi_average|region)",

    # [7] model7
    "death_next_season ~ aqhi_average * ndvi_average + wspd_average + (ndvi_average|region)",

    # [8] model9
    "death_next_season ~ aqhi_skew * ndvi_average * wspd_average + (aqhi_skew|region)",

    # [9] model10
    "death_next_season ~ aqhi_skew * ndvi_skew + (aqhi_skew|region)",

    # [10] model11
    "death_next_season ~ aqhi_skew * ndvi_average + wspd_average + (aqhi_skew|region)",

    # [11] model12
    "death_next_season ~ aqhi_max * ndvi_average + wspd_average + (ndvi_average|region)",

    # [12] model14
    "death_next_season ~ aqhi_max * ndvi_average * wspd_average + (ndvi_average|region)",

    # [13] model15
    "death_next_season ~ aqhi_average * ndvi_skew * wspd_average + (aqhi_average|region)",

    # [14] model16
    "death_next_season ~ aqhi_average * ndvi_skew + wspd_average + (aqhi_average|region)",

    # [15] model17
    "death_next_season ~ aqhi_average * ndvi_skew * wspd_average + (aqhi_average|operation_id)",

    # [16] model18
    "death_next_season ~ aqhi_max * ndvi_average + aqhi_average * wspd_average + (aqhi_average|operation_id)",

    # [17] model19
    "death_next_season ~ ndvi_average * aqhi_average + (aqhi_average|operation_id)",

    # [18] model20
    "death_next_season ~ ndvi_average * aqhi_average * wspd_average + (aqhi_average|operation_id)",

    # [19] model21
    "death_next_season ~ ndvi_average * aqhi_average + (1|operation_id)",

    # [20] model22
    "death_next_season ~ ndvi_average * aqhi_average * wspd_average + (1|operation_id)",

    # [21] model23
    "death_next_season ~ ndvi_average * aqhi_average + wspd_average + (1|operation_id)",

    # [22] model24
    "death_next_season ~ ndvi_average + aqhi_average + wspd_average + (1|operation_id)",

    # [23] model25
    "death_next_season ~ ndvi_average + aqhi_average + wspd_average + (aqhi_average|operation_id)",

    # [24] model26
    "death_next_season ~ ndvi_average + aqhi_average * wspd_average + (aqhi_average|operation_id)",

    # [25] model27
    "death_next_season ~ ndvi_average + o3_average + (o3_average|operation_id)",

    # [26] model28
    "death_next_season ~ ndvi_average * o3_average + (o3_average|operation_id)",

    # [27] model29 (best model)
    "death_next_season ~ ndvi_average * o3_average + wspd_average + (o3_average|operation_id)",

    # [28] model30
    "death_next_season ~ ndvi_average * o3_average * wspd_average + (o3_average|operation_id)",

    # [29] model31
    "death_next_season ~ ndvi_average + aqhi_average + wspd_average + tavg_average + prcp_average + (aqhi_average|operation_id)",

    # [30] model33
    "death_next_season ~ ndvi_average * aqhi_average + aqhi_max * wspd_average + tavg_average * prcp_average + (aqhi_average|operation_id)"
]

Check how many model formulas are defined

In [13]:
# Number of candidate formulas in the list
len(model_formulas)

31

In [15]:
# List the columns of the cleaned dataset (the variables the formulas can reference)
data.columns

Index(['hid', 'creation_date', 'operation_id', 'death_date', 'season',
       'season_start_month', 'season_start_day', 'death_next_season',
       'hive_age_next_season', 'aqhi_average', 'prcp_average', 'wspd_average',
       'ndvi_average', 'tavg_average', 'o3_average', 'aqhi_skew', 'prcp_skew',
       'wspd_skew', 'ndvi_skew', 'tavg_skew', 'o3_skew', 'aqhi_max',
       'prcp_max', 'wspd_max', 'ndvi_max', 'tavg_max', 'o3_max', 'region',
       'wspd_average_og', 'tavg_average_og', 'prcp_average_og',
       'aqhi_average_og'],
      dtype='object')

### Fit the models and save (**run once**)

Fitting all models takes **approximately 3 hours.**

Run this once and save the results in the ```aqi/outputs``` folder so they can be reloaded in later sessions.

File is called ```fitted_GLMMs.pkl```

In [11]:
# Fitting every formula is slow (approximately 3 hours), so it is opt-in.
# Set REFIT_MODELS to True to refit all models and save the results;
# leave it False to skip this cell and reuse the files saved by a previous run.
REFIT_MODELS = False

if REFIT_MODELS:
    # Fit one model per formula; returns the fitted models and their summaries
    fitted_models, model_summaries = fit_models(
        data=data,
        formulas=model_formulas
    )

    # Save the fitted models so later sessions can load them instead of refitting
    save_fitted_models(
        fitted_models=fitted_models,
        filename="fitted_GLMMs.pkl"
    )

    # Save the summaries alongside the fitted models
    save_fitted_models(
        fitted_models=model_summaries,
        filename="summaries_GLMMs.pkl"
    )

### Load models in session

In [10]:
# Reload the models saved by a previous run instead of refitting them.
# NOTE(review): the file is a .pkl, presumably a pickle -- only load files you produced yourself.
fitted_models = load_fitted_models("fitted_GLMMs.pkl")

Models loaded from /Users/maximeff-nectar/Documents/ledge-nectar/aqi/outputs/fitted_GLMMs.pkl.


In [12]:
# Track the lowest-AIC model among those that were fitted without warnings
best_model_name = None
best_model = None
best_aic = float('inf')

for model_name, model in fitted_models.items():

    # Models that carry warnings are reported and left out of the comparison
    if len(model.warnings) != 0:
        print(f"Model {model_name} did not converge due to warnings: {model.warnings}")
        continue

    current_aic = model.AIC

    # Only a strict improvement replaces the running best. Written as a negated
    # `<` so that a NaN AIC is skipped exactly as a plain `<` test would skip it.
    if not current_aic < best_aic:
        continue

    print(f"Current best model is {model_name} with an AIC of {current_aic}")
    best_aic, best_model, best_model_name = current_aic, model, model_name

Current best model is model0 with an AIC of 142319.23490165427
Current best model is model1 with an AIC of 138707.492368243
Current best model is model3 with an AIC of 136911.57493729456
Current best model is model4 with an AIC of 136899.15826005692
      dtype='<U78'), array(['Model is nearly unidentifiable: very large eigenvalue\n - Rescale variables?'],
      dtype='<U75'), array(['Model is nearly unidentifiable: large eigenvalue ratio\n - Rescale variables?'],
      dtype='<U76')]
Current best model is model17 with an AIC of 131480.5015211164
Current best model is model18 with an AIC of 131428.72709803734
Current best model is model25 with an AIC of 129405.29656590715
Current best model is model26 with an AIC of 129319.57525936147
Current best model is model27 with an AIC of 129242.13863848915
      dtype='<U78'), array(['Model is nearly unidentifiable: very large eigenvalue\n - Rescale variables?'],
      dtype='<U75'), array(['Model is nearly unidentifiable: large eigenvalue rati

In [16]:
# Display the summary of the selected model.
# NOTE(review): this calls .fit() on a model that was already fitted and loaded
# from disk, which presumably re-estimates it before printing -- confirm whether
# a summary-only call would be enough here.
print(best_model.fit())

Linear mixed model fit by maximum likelihood  ['lmerMod']
Formula: death_next_season~ndvi_average*o3_average+wspd_average+(o3_average|operation_id)

Family: binomial	 Inference: parametric

Number of observations: 112626	 Groups: {'operation_id': 17.0}

Log-likelihood: -64613.069 	 AIC: 129242.139

Random effects:

                     Name     Var   Std
operation_id  (Intercept)  53.876  7.34
operation_id   o3_average   0.022  0.15

                      IV1         IV2   Corr
operation_id  (Intercept)  o3_average -0.983

Fixed effects:

                         Estimate  2.5_ci  97.5_ci     SE     OR  OR_2.5_ci  \
(Intercept)                -4.549  -7.755   -1.342  1.636  0.011      0.000   
ndvi_average               -2.675  -5.552    0.202  1.468  0.069      0.004   
o3_average                  0.150   0.084    0.215  0.034  1.162      1.088   
wspd_average               -0.192  -0.233   -0.151  0.021  0.826      0.792   
ndvi_average:o3_average    -0.294  -0.348   -0.240  0.027  0

Save the best model output

In [23]:
# Save the selected model to the `outputs` folder in the parent of the current working directory.

# best_model stays None when no model was fitted without warnings; fail loudly
# instead of silently saving None.
if best_model is None:
    raise ValueError(
        "No best model selected: run the model selection cell first and check "
        "that at least one model was fitted without warnings."
    )

project_root = os.path.dirname(os.getcwd())
output_dir = os.path.join(project_root, 'outputs')
# Create the output folder if needed so the save does not fail on a fresh checkout
os.makedirs(output_dir, exist_ok=True)
filepath = os.path.join(output_dir, "GLMM_best_model.joblib")

save_model(best_model, filepath = filepath)