
## DESCRIPTION

This notebook cleans the preprocessed data produced by `generate_data.ipynb` and saves the result as `clean_data.csv`.

***
### SETUP

The module `utils/common_setup.py` is imported below; everything it exports is pulled into the notebook namespace with a star import.

You can modify the script at will if needed (e.g. add new packages, etc.)

In [1]:
from utils.common_setup import *

### Load the preprocessed data

In [2]:
# Read the preprocessed dataset from the configured data location instead of a
# hard-coded relative path. The configuration cell further down reloads
# "preprocess.csv" from ROOT_PATH and overwrites `data` before it is used, so
# reading from the same place here keeps both loads in agreement and avoids a
# failure that depends only on the current working directory.
# `os` and `ROOT_PATH` are presumably provided by the star import of
# utils.common_setup (they are used the same way in the configuration cell).
data = pd.read_csv(os.path.join(ROOT_PATH, "preprocess.csv"))

***
### CLEAN THE DATA

In [3]:
# Analysis-mode switches read by the filtering cells further down.
ONLY_WINTER = False
ONLY_INSEASON = False
ACTIVE_ALL_AND_WINTER = True  # Blog post hypothesis

# ONLY_INSEASON may not be combined with either of the other two modes.
# (ONLY_WINTER together with ACTIVE_ALL_AND_WINTER is not checked here.)
assert not (ONLY_WINTER and ONLY_INSEASON), "choose one! or both False"
assert not (ONLY_INSEASON and ACTIVE_ALL_AND_WINTER), "choose one! or both False"

# Start again from the untouched preprocessed file before the flags are applied
# (presumably so this cell can be re-run after changing a flag -- confirm).
preprocess_csv = os.path.join(ROOT_PATH, "preprocess.csv")
data = pd.read_csv(preprocess_csv)

### Remove bad data

Removing the operations (OPS) with bad data for the 2023 season: operation IDs 153, 204, 219, 220 and 224.

In [4]:
# Operation IDs whose 2023-season rows are discarded.
excluded_ops_2023 = [153, 204, 219, 220, 224]

# True for every row that belongs to one of those operations in season 2023.
is_bad_2023 = (data['season'] == 2023) & data['operation_id'].isin(excluded_ops_2023)

# Keep everything else; the index is deliberately left as-is (no reset here).
data = data.loc[~is_bad_2023]
print(len(data))

121107


In [5]:
# Apply the analysis mode selected by the flags set in the configuration cell.
# Every branch first converts death_date to calendar dates via `.dt.date`.
if ONLY_WINTER:
    # Exclude in-season deadouts: only winter mortality is analysed here.
    data['death_date'] = pd.to_datetime(data['death_date']).dt.date
    # Row-wise flag: True when death_date is strictly later than the date built
    # from the row's season, season_start_month and season_start_day.
    data['winter_deadout'] = data.apply(lambda x: x['death_date'] > date(int(x['season']),x['season_start_month'],x['season_start_day']),axis=1)
    # Keep only the flagged rows and renumber the index from 0.
    data = data.loc[data['winter_deadout']].reset_index(drop=True)

if ONLY_INSEASON:
    data['death_date'] = pd.to_datetime(data['death_date']).dt.date
    # NOTE(review): despite its name, this column is True when the death month
    # is 6, 7 or 8 (June-August) -- confirm the naming and the intent.
    data['winter_deadout'] = data.apply(lambda x: x['death_date'].month in [6,7,8],axis=1)
    # Keep rows where the flag is False OR death_next_season is False, then
    # renumber the index (assumes death_next_season is boolean -- TODO confirm).
    data = data.loc[~data['winter_deadout'] | ~data['death_next_season']].reset_index(drop=True)

if ACTIVE_ALL_AND_WINTER:
    # Only the two date columns are converted in this mode; the row filter
    # below is commented out, so no rows are removed by this branch.
    data['death_date'] = pd.to_datetime(data['death_date']).dt.date
    data['creation_date'] = pd.to_datetime(data['creation_date']).dt.date
    # Disabled filter, kept for reference. It would ensure the exposition
    # (exposure) period is the full season, making rows comparable with the
    # current approach.
    # data = data.loc[(data['creation_date'] <= data.apply(
    #     lambda x: date(int(x['season']),x['season_start_month']+1,x['season_start_day'])
    #                  ,axis=1)) \
    #         & (data['death_date'] > data['season'].apply(lambda x : date(int(x),END_SEASON_MONTH,END_SEASON_DAY)))]
    # data = data.reset_index(drop=True)

### Keep next year aside and exclude some abnormal data

In [6]:
# Rows belonging to the 2024 season are set aside in their own frame.
is_season_2024 = data['season'] == 2024
next_year = data.loc[is_season_2024].reset_index(drop=True)

# Outside the in-season mode, the working frame drops those 2024 rows;
# in the in-season mode `data` is left untouched and still contains them.
if not ONLY_INSEASON:
    data = data.loc[~is_season_2024].reset_index(drop=True)

In [7]:
# Number of rows left in the working frame after the previous steps.
data.shape[0]

121107

In [8]:
# Row count per season, most frequent first.
data['season'].value_counts()

season
2023    89998
2022    24403
2021     6706
Name: count, dtype: int64

Dropping rows with null values in `aqhi_average` or `ndvi_average`, then casting `operation_id` and `season` to strings

In [9]:
# Discard every row that is missing the air-quality or NDVI average.
required_columns = ['aqhi_average', 'ndvi_average']
data = data.dropna(subset=required_columns, axis=0)

# Store the identifier-like columns as strings rather than numbers.
data = data.astype({'operation_id': str, 'season': str})

In [10]:
# For each of the four averages: keep the raw values in a "<name>_og" column,
# then overwrite the original column with its natural logarithm.
# NOTE(review): the offset 10e-5 equals 1e-4 -- confirm that 1e-5 was not meant.
# NOTE(review): np.log returns NaN for inputs that stay negative after the
# offset; verify that none of these columns (tavg_average in particular) can
# hold such values.
for feature in ('wspd_average', 'tavg_average', 'prcp_average', 'aqhi_average'):
    data[f'{feature}_og'] = data[feature]
    data[feature] = np.log(data[feature] + 10e-5)

### Save the cleaned data

The cleaned data is saved as `clean_data.csv` in the `data` folder.

In [13]:
# The output directory is resolved relative to the current working directory.
data_folder = os.path.join(os.getcwd(), '..', 'data')

# Write the cleaned frame without the index column.
clean_csv_path = os.path.join(data_folder, 'clean_data.csv')
data.to_csv(clean_csv_path, index=False)