
## DESCRIPTION

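This notebook cleans the data produced by `generate_data.ipynb`, fits a set of candidate binomial mixed-effects models (GLMMs) to it, and saves the fitted results.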

***
### SETUP

The cell below imports everything from the shared setup script `utils/common_setup.py`.

You can modify the script at will if needed (e.g. add new packages, etc.)

In [1]:
from utils.common_setup import *
%matplotlib inline

### Load the preprocessed data

In [2]:
# Initial load of the preprocessed dataset, using a path relative to the
# notebook's working directory. Note that the "CLEAN THE DATA" cell reloads
# `data` from ROOT_PATH, so that later load is the one the analysis uses.
preprocessed_csv = '../data/preprocess.csv'
data = pd.read_csv(preprocessed_csv)

***
### CLEAN THE DATA

In [3]:
# Scenario switches selecting which filter the next cells apply.
ONLY_WINTER = False
ONLY_INSEASON = False
ACTIVE_ALL_AND_WINTER = True  # Blog post hypothesis

# ONLY_INSEASON cannot be combined with either of the other two switches.
# (ONLY_WINTER together with ACTIVE_ALL_AND_WINTER is not rejected.)
assert not (ONLY_WINTER and ONLY_INSEASON), "choose one! or both False"
assert not (ACTIVE_ALL_AND_WINTER and ONLY_INSEASON), "choose one! or both False"

# Reload the preprocessed data so the filters below start from a fresh frame.
preprocess_path = os.path.join(ROOT_PATH, "preprocess.csv")
data = pd.read_csv(preprocess_path)

### Remove bad data

Remove the rows of the operations (`operation_id`) flagged as bad data for the 2023 season.

In [4]:
# Drop the 2023-season rows of the operations listed below (flagged as bad data).
BAD_OPERATION_IDS_2023 = [153, 204, 219, 220, 224]
is_bad_2023 = (data['season'] == 2023) & data['operation_id'].isin(BAD_OPERATION_IDS_2023)

# reset_index returns a fresh frame (as the other filters in this notebook do),
# so the column assignments in the following cells do not operate on a slice of
# the loaded frame and do not raise pandas' SettingWithCopyWarning.
data = data.loc[~is_bad_2023].reset_index(drop=True)
print(len(data))

121107


In [4]:
if ONLY_WINTER:
    # Winter mortality only: keep rows whose death date is strictly after the
    # season start date, i.e. exclude in-season deadouts.
    data['death_date'] = pd.to_datetime(data['death_date']).dt.date

    def _died_after_season_start(row):
        season_start = date(int(row['season']), row['season_start_month'], row['season_start_day'])
        return row['death_date'] > season_start

    data['winter_deadout'] = data.apply(_died_after_season_start, axis=1)
    data = data.loc[data['winter_deadout']].reset_index(drop=True)

if ONLY_INSEASON:
    data['death_date'] = pd.to_datetime(data['death_date']).dt.date
    # NOTE(review): despite the column name, in this branch `winter_deadout`
    # flags deaths dated in June, July or August - confirm naming and intent.
    summer_months = [6, 7, 8]
    data['winter_deadout'] = data.apply(lambda row: row['death_date'].month in summer_months, axis=1)
    # Keep rows that are either not flagged above or not dead next season.
    # NOTE(review): `~` assumes `death_next_season` is boolean - confirm dtype.
    keep_rows = ~data['winter_deadout'] | ~data['death_next_season']
    data = data.loc[keep_rows].reset_index(drop=True)

if ACTIVE_ALL_AND_WINTER:
    # Only the date columns are normalised here; the row filter for this
    # scenario is currently disabled (kept below for reference).
    for date_col in ('death_date', 'creation_date'):
        data[date_col] = pd.to_datetime(data[date_col]).dt.date
    # Disabled filter: it would ensure the exposure covers the full season
    # (making rows comparable with the current approach).
    # data = data.loc[(data['creation_date'] <= data.apply(
    #     lambda x: date(int(x['season']),x['season_start_month']+1,x['season_start_day'])
    #                  ,axis=1)) \
    #         & (data['death_date'] > data['season'].apply(lambda x : date(int(x),END_SEASON_MONTH,END_SEASON_DAY)))]
    # data = data.reset_index(drop=True)

### Keep next year aside and exclude some abnormal data

In [5]:
# Set the 2024 season aside in `next_year`; unless the in-season scenario is
# active, also remove it from the frame used for fitting.
season_values = data['season']
next_year = data.loc[season_values.eq(2024)].reset_index(drop=True)
if not ONLY_INSEASON:
    data = data.loc[season_values.ne(2024)].reset_index(drop=True)

In [7]:
# Sanity check: number of rows currently in `data`.
data.shape[0]

121107

In [6]:
# Number of rows per season.
data['season'].value_counts()

2023    102475
2022     24403
2021      6706
Name: season, dtype: int64

Dropping rows with missing `aqhi_average` or `ndvi_average` values, then casting `operation_id` and `season` to strings

In [7]:
# Rows lacking either of these covariates are removed (in place).
required_covariates = ['aqhi_average', 'ndvi_average']
data.dropna(subset=required_covariates, axis=0, inplace=True)

# Store the identifiers as strings.
# NOTE(review): presumably so they are treated as categorical labels
# (operation_id is used as a grouping factor in the formulas) - confirm.
for label_col in ('operation_id', 'season'):
    data[label_col] = data[label_col].astype(str)

In [8]:
# Log-transform the weather / air-quality averages, keeping the untransformed
# values in `<name>_og` columns.
#
# The backup is created only once and the log is always computed from it, so
# re-running this cell no longer overwrites the originals with already-logged
# values or applies the log a second time.
LOG_OFFSET = 10e-5  # small constant added before the log (10e-5 == 1e-4)
LOG_COVARIATES = ('wspd_average', 'tavg_average', 'prcp_average', 'aqhi_average')

for covariate in LOG_COVARIATES:
    backup_col = f'{covariate}_og'
    if backup_col not in data.columns:
        data[backup_col] = data[covariate]
    # NOTE(review): values <= -LOG_OFFSET (e.g. negative temperatures, if
    # tavg_average can be negative) yield NaN here - confirm the value ranges.
    data[covariate] = np.log(data[backup_col] + LOG_OFFSET)

In [11]:
# Candidate binomial mixed-model formulas, all with `death_next_season` as the
# response. The order of this list matters: fit_models() names each fit
# "model{i}" from its position in the list, so the dictionary key is NOT always
# the historical label shown in the comments (the labels skip 3, 8, 13 and 32).
# Each comment gives: historical label -> key in the fitted-models dictionary.
# NOTE(review): some formulas use columns (aqhi_skew, ndvi_skew, aqhi_max,
# o3_average) that this notebook neither log-transforms nor checks for missing
# values - confirm they are present and clean in preprocess.csv.
model_formulas = [
    # model0 -> key 'model0' (intercept-only baseline, random intercept per region)
    "death_next_season ~ 1 + (1|region)",

    # model (unnumbered) -> key 'model1'
    "death_next_season ~ aqhi_average * ndvi_average + (aqhi_average|region)",

    # model2 -> key 'model2'
    "death_next_season ~ aqhi_average + ndvi_average + (aqhi_average|region)",

    # model4 -> key 'model3'
    "death_next_season ~ aqhi_average + ndvi_average + tavg_average + prcp_average + wspd_average + (aqhi_average|region)",

    # model5 -> key 'model4'
    "death_next_season ~ aqhi_average * wspd_average + ndvi_average + tavg_average + prcp_average + (aqhi_average|region)",

    # model6 -> key 'model5'
    "death_next_season ~ aqhi_average * wspd_average * tavg_average + (aqhi_average|region)",

    # model7a -> key 'model6'
    "death_next_season ~ aqhi_average * wspd_average * ndvi_average + (ndvi_average|region)",

    # model7 -> key 'model7'
    "death_next_season ~ aqhi_average * ndvi_average + wspd_average + (ndvi_average|region)",

    # model9 -> key 'model8'
    "death_next_season ~ aqhi_skew * ndvi_average * wspd_average + (aqhi_skew|region)",

    # model10 -> key 'model9'
    "death_next_season ~ aqhi_skew * ndvi_skew + (aqhi_skew|region)",

    # model11 -> key 'model10'
    "death_next_season ~ aqhi_skew * ndvi_average + wspd_average + (aqhi_skew|region)",

    # model12 -> key 'model11'
    "death_next_season ~ aqhi_max * ndvi_average + wspd_average + (ndvi_average|region)",

    # model14 -> key 'model12'
    "death_next_season ~ aqhi_max * ndvi_average * wspd_average + (ndvi_average|region)",

    # model15 -> key 'model13'
    "death_next_season ~ aqhi_average * ndvi_skew * wspd_average + (aqhi_average|region)",

    # model16 -> key 'model14'
    "death_next_season ~ aqhi_average * ndvi_skew + wspd_average + (aqhi_average|region)",

    # model17 -> key 'model15' (from here on the grouping factor is operation_id)
    "death_next_season ~ aqhi_average * ndvi_skew * wspd_average + (aqhi_average|operation_id)",

    # model18 -> key 'model16'
    "death_next_season ~ aqhi_max * ndvi_average + aqhi_average * wspd_average + (aqhi_average|operation_id)",

    # model19 -> key 'model17'
    "death_next_season ~ ndvi_average * aqhi_average + (aqhi_average|operation_id)",

    # model20 -> key 'model18'
    "death_next_season ~ ndvi_average * aqhi_average * wspd_average + (aqhi_average|operation_id)",

    # model21 -> key 'model19'
    "death_next_season ~ ndvi_average * aqhi_average + (1|operation_id)",

    # model22 -> key 'model20'
    "death_next_season ~ ndvi_average * aqhi_average * wspd_average + (1|operation_id)",

    # model23 -> key 'model21'
    "death_next_season ~ ndvi_average * aqhi_average + wspd_average + (1|operation_id)",

    # model24 -> key 'model22'
    "death_next_season ~ ndvi_average + aqhi_average + wspd_average + (1|operation_id)",

    # model25 -> key 'model23'
    "death_next_season ~ ndvi_average + aqhi_average + wspd_average + (aqhi_average|operation_id)",

    # model26 -> key 'model24'
    "death_next_season ~ ndvi_average + aqhi_average * wspd_average + (aqhi_average|operation_id)",

    # model27 -> key 'model25'
    "death_next_season ~ ndvi_average + o3_average + (o3_average|operation_id)",

    # model28 -> key 'model26'
    "death_next_season ~ ndvi_average * o3_average + (o3_average|operation_id)",

    # model29 -> key 'model27'
    "death_next_season ~ ndvi_average * o3_average + wspd_average + (o3_average|operation_id)",

    # model30 -> key 'model28'
    "death_next_season ~ ndvi_average * o3_average * wspd_average + (o3_average|operation_id)",

    # model31 -> key 'model29'
    "death_next_season ~ ndvi_average + aqhi_average + wspd_average + tavg_average + prcp_average + (aqhi_average|operation_id)",

    # model33 -> key 'model30'
    "death_next_season ~ ndvi_average * aqhi_average + aqhi_max * wspd_average + tavg_average * prcp_average + (aqhi_average|operation_id)"
]


### Define a function to fit the models sequentially

In [13]:
def fit_models(data, formulas):
    """Fit one binomial ``Lmer`` model per formula, in order.

    Args:
        data: Data passed unchanged to every ``Lmer`` as ``data=``.
        formulas: Iterable of model formula strings.

    Returns:
        dict: Maps ``"model{i}"`` (``i`` being the formula's position in
        ``formulas``) to the value returned by ``Lmer(...).fit()``.
        NOTE(review): with pymer4 this return value appears to be the
        fixed-effects summary table rather than the model object - confirm.
    """
    results = {}
    for position, spec in enumerate(formulas):
        key = f"model{position}"
        print(f"Fitting {key} with formula: {spec}")
        glmm = Lmer(spec, data=data, family='binomial')
        results[key] = glmm.fit()
    return results

### Fit the models

This takes some time as we fit 31 models. Total elapsed time is more than 40 minutes.

Run this once and save the outputs so they can be reloaded in later sessions without refitting.

In [14]:
# Long-running step: fits every formula in `model_formulas` sequentially.
fitted_models = fit_models(data=data, formulas=model_formulas)

Fitting model0 with formula: death_next_season ~ 1 + (1|region)
Linear mixed model fit by maximum likelihood  ['lmerMod']
Formula: death_next_season~1+(1|region)

Family: binomial	 Inference: parametric

Number of observations: 123896	 Groups: {'region': 10.0}

Log-likelihood: -78191.027 	 AIC: 156386.055

Random effects:

               Name    Var    Std
region  (Intercept)  1.025  1.013

No random effect correlations specified

Fixed effects:

Fitting model1 with formula: death_next_season ~ aqhi_average * ndvi_average + (aqhi_average|region)
Linear mixed model fit by maximum likelihood  ['lmerMod']
Formula: death_next_season~aqhi_average*ndvi_average+(aqhi_average|region)

Family: binomial	 Inference: parametric

Number of observations: 123896	 Groups: {'region': 10.0}

Log-likelihood: -76179.021 	 AIC: 152372.042

Random effects:

                Name    Var    Std
region   (Intercept)  5.241  2.289
region  aqhi_average  3.271  1.809

                IV1           IV2   Corr
regio

### Save the dictionary of fitted models to a file

In [18]:
def save_fitted_models(fitted_models, filename="fitted_models.pkl"):
    """Pickle ``fitted_models`` into the project's ``outputs`` folder.

    The ``outputs`` folder is resolved as a sibling of the current working
    directory (i.e. ``<parent of cwd>/outputs``) and is created if it does not
    exist yet, so a fresh checkout does not fail with ``FileNotFoundError``.

    Args:
        fitted_models: Object to serialise (the dictionary of fit results).
        filename: Name of the pickle file written inside ``outputs``.
    """
    # Path to the 'outputs' folder, next to the directory the notebook runs in
    project_root = os.path.dirname(os.getcwd())
    output_dir = os.path.join(project_root, 'outputs')
    os.makedirs(output_dir, exist_ok=True)

    filepath = os.path.join(output_dir, filename)

    # Save the models to the specified file
    with open(filepath, 'wb') as f:
        pickle.dump(fitted_models, f)
    print(f"Models saved to {filepath}.")

In [21]:
# Persist the fit results so later sessions can reload them instead of refitting.
save_fitted_models(fitted_models, filename="fitted_GLMMs.pkl")

Models saved to /Users/maximeff-nectar/Documents/ledge-nectar/aqi/outputs/fitted_GLMMs.pkl.


In [23]:
import sys
# sys.getsizeof is shallow: for a dict it reports only the container itself,
# not the objects it references, so it says nothing about the stored results.
print(f"Size of fitted_models: {sys.getsizeof(fitted_models)} bytes")
# The serialised size reflects the actual content (and the size of the pickle).
print(f"Pickled size of fitted_models: {len(pickle.dumps(fitted_models))} bytes")

Size of fitted_models: 1176 bytes


In [None]:
# Access a specific model result.
# fit_models() stores the return value of Lmer(...).fit(); in pymer4 that is
# the fixed-effects coefficient table (a DataFrame), not the model object, so
# calling .summary() on it would raise AttributeError. Display the table itself.
fitted_models['model0']