In [16]:
import numpy as np
import pandas as pd
import xarray as xr
from tqdm import tqdm

In [17]:
tmax = xr.open_dataset('data/esri/MaxTemp_2006_2021_Cali.nc')

In [18]:
df_tmax = tmax[['FIPS','MAX_TEMPERATURE_NONE_SPATIAL_NEIGHBORS']].to_dataframe()

In [19]:
df_tmax = df_tmax.drop(columns=['lat', 'lon'])

In [20]:
df_tmax.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FIPS,MAX_TEMPERATURE_NONE_SPATIAL_NEIGHBORS
time,locations,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-12-31,0,1765.0,13.35
2005-12-31,1,1766.0,14.950006
2005-12-31,2,1767.0,14.950006
2005-12-31,3,1768.0,14.950006
2005-12-31,4,1769.0,14.950006


In [21]:
tmin = xr.open_dataset('data/esri/MinTemp_2006_2021_Cali.nc')

In [22]:
df_tmin = tmin[['FIPS','MIN_TEMPERATURE_NONE_SPATIAL_NEIGHBORS']].to_dataframe()

In [23]:
df_tmin = df_tmin.drop(columns=['lat', 'lon'])

In [24]:
df_tmin.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FIPS,MIN_TEMPERATURE_NONE_SPATIAL_NEIGHBORS
time,locations,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-12-31,0,1765.0,7.85
2005-12-31,1,1766.0,8.249994
2005-12-31,2,1767.0,8.249994
2005-12-31,3,1768.0,8.249994
2005-12-31,4,1769.0,8.249994


In [25]:
len(df_tmax)

52543404

In [26]:
len(df_tmin)

52543404

## Join tmin and tmax

In [27]:
tqdm.pandas()
df_t = df_tmax.join(df_tmin, rsuffix= "_tmin").progress_apply(lambda x: x)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.25it/s]


In [28]:
len(df_t)

52543404

In [29]:
df_t.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FIPS,MAX_TEMPERATURE_NONE_SPATIAL_NEIGHBORS,FIPS_tmin,MIN_TEMPERATURE_NONE_SPATIAL_NEIGHBORS
time,locations,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2005-12-31,0,1765.0,13.35,1765.0,7.85
2005-12-31,1,1766.0,14.950006,1766.0,8.249994
2005-12-31,2,1767.0,14.950006,1767.0,8.249994
2005-12-31,3,1768.0,14.950006,1768.0,8.249994
2005-12-31,4,1769.0,14.950006,1769.0,8.249994


In [30]:
# check if FIPS are the same

df_t['FIPS'].equals(df_t['FIPS_tmin'])  # Returns True

True

In [31]:
df_t = df_t.drop(columns=['FIPS'])

In [32]:
import gc

del tmin
del tmax
del df_tmin
del df_tmax
gc.collect()

2314

## Load PM2.5

In [33]:
pm25 = xr.open_dataset('data/esri/PM25_Nature_2006_2020_Cali.nc')

In [34]:
df_pm25 = pm25[['FIPS','MEAN_NONE_SPATIAL_NEIGHBORS']].to_dataframe()

In [35]:
df_pm25 = df_pm25.drop(columns=['lat', 'lon'])

In [36]:
df_pm25.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FIPS,MEAN_NONE_SPATIAL_NEIGHBORS
time,locations,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-12-31,0,1765.0,4.986
2005-12-31,1,1766.0,5.5
2005-12-31,2,1767.0,5.486667
2005-12-31,3,1768.0,5.437778
2005-12-31,4,1769.0,5.48


In [37]:
len(df_pm25)

47929050

In [38]:
len(df_t)

52543404

## Test for joins

Look for fastest join

In [55]:
temp1 = df_t.head(30000)
temp2 = df_pm25.head(30000)

In [56]:
%%timeit 
temp1.join(temp2)

6.39 ms ± 79.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [57]:
%%timeit 
pd.merge(temp1, temp2, left_index=True, right_index=True, how='inner')

16.7 ms ± 19 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [58]:
%%timeit
pd.concat([temp1, temp2], axis=1)

34.6 ms ± 83.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Join between temperatures and PM25

In [48]:
df_t = df_t.rename(columns={"FIPS_tmin":"FIPS"})

In [53]:
len(df_t)

52543404

In [54]:
df_t = df_t.reset_index().set_index(["time","FIPS"])

In [55]:
df_t = df_t.drop(columns=['locations'])

In [56]:
df_t.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MAX_TEMPERATURE_NONE_SPATIAL_NEIGHBORS,MIN_TEMPERATURE_NONE_SPATIAL_NEIGHBORS
time,FIPS,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-12-31,1765.0,13.35,7.85
2005-12-31,1766.0,14.950006,8.249994
2005-12-31,1767.0,14.950006,8.249994
2005-12-31,1768.0,14.950006,8.249994
2005-12-31,1769.0,14.950006,8.249994


In [57]:
df_pm25 = df_pm25.reset_index().set_index(["time","FIPS"])

In [58]:
df_pm25 = df_pm25.drop(columns=['locations'])

In [59]:
df_pm25.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MEAN_NONE_SPATIAL_NEIGHBORS
time,FIPS,Unnamed: 2_level_1
2005-12-31,1765.0,4.986
2005-12-31,1766.0,5.5
2005-12-31,1767.0,5.486667
2005-12-31,1768.0,5.437778
2005-12-31,1769.0,5.48


In [60]:
df = df_t.join(df_pm25).progress_apply(lambda x: x)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  3.80it/s]


In [61]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MAX_TEMPERATURE_NONE_SPATIAL_NEIGHBORS,MIN_TEMPERATURE_NONE_SPATIAL_NEIGHBORS,MEAN_NONE_SPATIAL_NEIGHBORS
time,FIPS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-12-31,1765.0,13.35,7.85,4.986
2005-12-31,1766.0,14.950006,8.249994,5.5
2005-12-31,1767.0,14.950006,8.249994,5.486667
2005-12-31,1768.0,14.950006,8.249994,5.437778
2005-12-31,1769.0,14.950006,8.249994,5.48


In [62]:
df = df.rename(columns={"MAX_TEMPERATURE_NONE_SPATIAL_NEIGHBORS":"tmax",
"MIN_TEMPERATURE_NONE_SPATIAL_NEIGHBORS":"tmin",
"MEAN_NONE_SPATIAL_NEIGHBORS":"pm25"})

In [63]:
df.to_parquet("outputs/esri_tmin_tmax_pm25_merged.parquet")