# Get metrics: heatwave, polluted day and wildfire

In [57]:
import numpy as np
import pandas as pd

In [58]:
df = pd.read_parquet("outputs/esri_tmin_tmax_rhum_pm25_wf_smokePM_merged_clean.parquet")

In [59]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,tmax,tmin,pm25,wfday,GEOID,smoke_pm,rhum
time,FIPS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-01-01,1765.0,11.149988,8.649988,5.052,0.0,6001400000.0,0.0,100.0
2006-01-01,1766.0,13.35,8.749994,7.1,0.0,6001400000.0,0.0,100.0
2006-01-01,1767.0,13.35,8.749994,7.12,0.0,6001400000.0,0.0,100.0
2006-01-01,1768.0,13.35,8.749994,6.845,0.0,6001400000.0,0.0,100.0
2006-01-01,1769.0,13.35,8.749994,7.1,0.0,6001400000.0,0.0,100.0


## Dates

In [60]:
df = df.reset_index()

In [61]:
df.time.min()

Timestamp('2006-01-01 00:00:00')

In [62]:
df.time.max()

Timestamp('2020-12-31 00:00:00')

# Preprocessing

## Heat day rolling window of last 5 years

In [63]:
# df = df[df.FIPS.isin([1765.0,1766.0,1767.0])]

# df = df.head(30)

# df = df[['time','FIPS','tmax']]

In [64]:
def get_tmax95(x):
    # calculate rolling 95th percentile with a window size of 5 years (approx. 1825 days)
    rolling_95th = x.rolling(window=1825).quantile(0.95)
    
    # calculate 95th percentile for the first 5 years
    threshold = x[rolling_95th.isna()].quantile(0.95)
    # alternative: threshold = x[:window].quantile(0.95)
    rolling_95th = rolling_95th.fillna(threshold)
    
    return rolling_95th

df['tmax95'] = df.groupby('FIPS')['tmax'].transform(get_tmax95)

In [65]:
df['heatday'] = np.where(df['tmax'] > df['tmax95'], True, False)

In [66]:
df['heatday'].value_counts() # new without fillna

False    36812557
True      2105198
Name: heatday, dtype: int64

Note: there are less entries because we cropped years when we don't have wildfire and smoke PM2.5

In [67]:
df['heatday'].value_counts() # new with fillna

False    36812557
True      2105198
Name: heatday, dtype: int64

In [68]:
df['heatday'].value_counts() # old

False    36812557
True      2105198
Name: heatday, dtype: int64

## Cold day

In [69]:
def get_tmin05(x):
    rolling_5th = x.rolling(window=1825).quantile(0.05)
    threshold = x[rolling_5th.isna()].quantile(0.05)
    rolling_5th = rolling_5th.fillna(threshold)
    
    return rolling_5th

df['tmin05'] = df.groupby('FIPS')['tmin'].transform(get_tmin05)

In [70]:
df['coldday'] = np.where(df['tmin'] < df['tmin05'], True, False)

In [71]:
df.head()

Unnamed: 0,time,FIPS,tmax,tmin,pm25,wfday,GEOID,smoke_pm,rhum,tmax95,heatday,tmin05,coldday
0,2006-01-01,1765.0,11.149988,8.649988,5.052,0.0,6001400000.0,0.0,100.0,28.85,False,3.550012,False
1,2006-01-01,1766.0,13.35,8.749994,7.1,0.0,6001400000.0,0.0,100.0,26.535012,False,4.365001,False
2,2006-01-01,1767.0,13.35,8.749994,7.12,0.0,6001400000.0,0.0,100.0,26.535012,False,4.365001,False
3,2006-01-01,1768.0,13.35,8.749994,6.845,0.0,6001400000.0,0.0,100.0,26.535012,False,4.365001,False
4,2006-01-01,1769.0,13.35,8.749994,7.1,0.0,6001400000.0,0.0,100.0,26.535012,False,4.365001,False


In [72]:
df['coldday'].value_counts() 

False    37123222
True      1794533
Name: coldday, dtype: int64

## Polluted day

Polluted day is defined as day with PM2.5 equal or higher than 35 μg/m3

Ref: https://www.epa.gov/criteria-air-pollutants/naaqs-table

In [73]:
df['polluted'] = np.where(df['pm25'] >= 35, True, False)

In [74]:
df['polluted'].value_counts()

False    38493763
True       423992
Name: polluted, dtype: int64

## Wildfire day

In [75]:
df.head()

Unnamed: 0,time,FIPS,tmax,tmin,pm25,wfday,GEOID,smoke_pm,rhum,tmax95,heatday,tmin05,coldday,polluted
0,2006-01-01,1765.0,11.149988,8.649988,5.052,0.0,6001400000.0,0.0,100.0,28.85,False,3.550012,False,False
1,2006-01-01,1766.0,13.35,8.749994,7.1,0.0,6001400000.0,0.0,100.0,26.535012,False,4.365001,False,False
2,2006-01-01,1767.0,13.35,8.749994,7.12,0.0,6001400000.0,0.0,100.0,26.535012,False,4.365001,False,False
3,2006-01-01,1768.0,13.35,8.749994,6.845,0.0,6001400000.0,0.0,100.0,26.535012,False,4.365001,False,False
4,2006-01-01,1769.0,13.35,8.749994,7.1,0.0,6001400000.0,0.0,100.0,26.535012,False,4.365001,False,False


In [76]:
df['wfday'].value_counts()

0.0    38896336
1.0       21419
Name: wfday, dtype: int64

In [77]:
df['wfday'] = np.where(df['wfday'] == 1.0, True, False)

In [78]:
df['wfday'].value_counts()

False    38896336
True        21419
Name: wfday, dtype: int64

In [79]:
df = df.drop(columns=["tmax95", "tmin05"])

In [80]:
df.head()

Unnamed: 0,time,FIPS,tmax,tmin,pm25,wfday,GEOID,smoke_pm,rhum,heatday,coldday,polluted
0,2006-01-01,1765.0,11.149988,8.649988,5.052,False,6001400000.0,0.0,100.0,False,False,False
1,2006-01-01,1766.0,13.35,8.749994,7.1,False,6001400000.0,0.0,100.0,False,False,False
2,2006-01-01,1767.0,13.35,8.749994,7.12,False,6001400000.0,0.0,100.0,False,False,False
3,2006-01-01,1768.0,13.35,8.749994,6.845,False,6001400000.0,0.0,100.0,False,False,False
4,2006-01-01,1769.0,13.35,8.749994,7.1,False,6001400000.0,0.0,100.0,False,False,False


## Smoke Polluted Day

In [81]:
df['smoke_pm_non_zero'] = np.where(df['smoke_pm'] > 0.0, True, False)

In [82]:
# As seen in https://www.researchsquare.com/article/rs-2866201/v1

df['smoke_pm_gt_five'] = np.where(df['smoke_pm'] > 5.0, True, False)

In [83]:
df.head()

Unnamed: 0,time,FIPS,tmax,tmin,pm25,wfday,GEOID,smoke_pm,rhum,heatday,coldday,polluted,smoke_pm_non_zero,smoke_pm_gt_five
0,2006-01-01,1765.0,11.149988,8.649988,5.052,False,6001400000.0,0.0,100.0,False,False,False,False,False
1,2006-01-01,1766.0,13.35,8.749994,7.1,False,6001400000.0,0.0,100.0,False,False,False,False,False
2,2006-01-01,1767.0,13.35,8.749994,7.12,False,6001400000.0,0.0,100.0,False,False,False,False,False
3,2006-01-01,1768.0,13.35,8.749994,6.845,False,6001400000.0,0.0,100.0,False,False,False,False,False
4,2006-01-01,1769.0,13.35,8.749994,7.1,False,6001400000.0,0.0,100.0,False,False,False,False,False


In [84]:
df.to_parquet("outputs/merged_heatday_coldday_wfday_polluted_smoke_polluted_rolling.parquet")

# Scale temperature to Quantiles

In [4]:
df = pd.read_parquet("outputs/merged_heatday_coldday_wfday_polluted_smoke_polluted.parquet")

In [5]:
df = df.reset_index()

In [6]:
df = df[df['time'].dt.year == 2020]

In [7]:
from sklearn.preprocessing import StandardScaler #, QuantileTransformer

In [8]:
transformer = StandardScaler() #output_distribution='uniform')

In [9]:
df = df[['time', 'FIPS', 'tmax', 'tmin', 'wfday', 'smokePM_pred']]

In [10]:
# def scaler(x):
#     array_x = x.values.reshape(-1, 1)
#     res = transformer.fit_transform(array_x) 
#     res = list(res.flatten())
#     return pd.Series(res)

# df["scaled_tmax"] = df.groupby("FIPS")['tmax'].transform(scaler)
# df["scaled_tmin"] = df.groupby("FIPS")['tmax'].transform(scaler)

In [11]:
df[['scaled_tmax', 'scaled_tmin']] = df.groupby(
    'FIPS')[['tmin','tmax']].transform(
    lambda x: StandardScaler(
    ).fit_transform(x.values[:,np.newaxis]).ravel()
)

In [12]:
df.head()

Unnamed: 0,time,FIPS,tmax,tmin,wfday,smokePM_pred,scaled_tmax,scaled_tmin
45979974,2020-01-01,1765.0,16.550013,7.649988,False,0.0,-0.708617,-0.5691
45979975,2020-01-01,1766.0,17.749994,7.950006,False,0.0,-0.864803,-0.490781
45979976,2020-01-01,1767.0,17.749994,7.950006,False,0.0,-0.864803,-0.490781
45979977,2020-01-01,1768.0,17.749994,7.950006,False,0.0,-0.864803,-0.490781
45979978,2020-01-01,1769.0,17.749994,7.950006,False,0.0,-0.864803,-0.490781


In [13]:
crosswalk = pd.read_csv(
    "data/fips_crosswalk_merged_county.csv", 
    usecols=["FIPS", "long_FIPS", "TRACT_FIPS", "COUNTY_CODE"],
    index_col=["FIPS"],
    dtype={"COUNTY_CODE":str})

In [14]:
merged_df = df.merge(crosswalk, on="FIPS")

In [17]:
merged_df.to_csv("tmax_wfday_smoke-pm_scaled-2020.csv")