#### Author: MengChen Chung

## Spin up cluster

In [None]:
from dask_yarn import YarnCluster
from dask.distributed import Client

In [None]:
# Create a cluster where each worker has 1 cores and 4 GiB of memory:
cluster = YarnCluster(environment="/home/hadoop/environment.tar.gz",
                      worker_vcores = 1,
                      worker_memory = "4GiB"
                      )

# Scale cluster out to 8 such workers:
cluster.scale(8)

# Connect to the cluster (before proceeding, you should wait for workers to be registered by the dask scheduler, as below):
client = Client(cluster)

In [None]:
client

## Gather physical and mental data

In [3]:
# read in physical and mental score
import pandas as pd
physical_score = pd.read_csv('physical_score_18.csv', index_col=0)
mental_score = pd.read_csv('mental_score_18.csv', index_col=0)

In [200]:
physical_score

Unnamed: 0,State,Sum
0,MS,590.1
1,NC,455.4
2,OK,577.1
3,VA,420.8
4,WV,546.0
5,LA,540.1
6,MI,496.4
7,MA,367.6
8,ID,443.2
9,FL,400.4


In [183]:
mental_score

Unnamed: 0,State,Sum,NDVI,EVI
0,MS,4018.8,6712.272122,3789.306847
1,NC,28536.7,6608.098543,3764.396593
2,OK,8683.0,4870.77922,2875.587299
3,VA,17214.0,6633.109403,3865.553941
4,WV,11564.2,6308.458952,3762.958429
5,LA,14226.1,6340.5291,3542.86309
6,MI,32346.0,4875.348258,2983.861183
7,MA,28045.9,6225.011381,3542.72014
8,ID,2833.9,3420.613839,1854.747341
9,FL,62372.2,6295.864122,3518.862689


## Process geodata

In [1]:
# import modules
import dask
from dask import delayed
import rioxarray
import geopandas as gpd


def get_average_vegetation(tif_file, shape_file, state_idx):
    # read in tif file
    with rioxarray.open_rasterio(tif_file,
                                 chunks={'band': 1, 'x': 1024, 'y': 1024}) as xds:
        # clip the raster data based on the boundary
        # handle 'MultiPolygon' and 'Polygon' in shape file
        if shape_file.geometry[state_idx] != 'MultiPolygon':
            clipped = xds.rio.clip([shape_file.geometry[state_idx]])
        else:
            clipped = xds.rio.clip(shape_file.geometry[state_idx])
        # stack DataArray to be 1D
        clipped = clipped.squeeze().stack(z=("x", "y"))
        # filter out invalid index (-3000) then calculate the mean
        clipped = clipped[clipped!=-3000].mean()
    return clipped

In [2]:
# read in shape file
shape_file = gpd.read_file("cb_2018_us_state_500k/cb_2018_us_state_500k.shp")
# filter out non-state in the shape file
state_idx_list = [i for i in range(0, 56) if i not in (13,27,36,37,38,42,44,45)]
# creat NDVI and EVI lists
NDVI_file_list = ['NDVI/MOD13A1.006__500m_16_days_NDVI_doy2018{0:03d}_aid0001.tif'.format(i) for i in range(1,365,16)]
EVI_file_list = ['EVI/MOD13A1.006__500m_16_days_EVI_doy2018{0:03d}_aid0001.tif'.format(i) for i in range(1,365,16)]

In [5]:
# create monthly NDVI for physical and mental dataframes
# since there are 23 tif files instead of 24 for 1 year, I arbitrarily assign February to only have 1 tif file info
for month in range(1, 13):
    NDVI_list=[] # list to store NDVI for each state
    if month == 1:
        for i, tif_file in enumerate(NDVI_file_list[:month+1]):
            for j, state_idx in enumerate(state_idx_list):
                clipped = get_average_vegetation(tif_file, shape_file, state_idx)
                # first time we create the element
                # other times we update the list
                if i == 0:
                    NDVI_list.append(clipped)
                else:
                    NDVI_list[j] = NDVI_list[j] + clipped
        # compute, average, and assign the values
        NDVI_list = dask.compute(*NDVI_list)
        NDVI_list = [i.values.item(0)/2 for i in NDVI_list]
        physical_score['NDVI_1'] = NDVI_list
        mental_score['NDVI_1'] = NDVI_list
    elif month == 2:
        for tif_file in (NDVI_file_list[month:month+1]):
            for state_idx in state_idx_list:
                clipped = get_average_vegetation(tif_file, shape_file, state_idx)
                # first time we create the element
                NDVI_list.append(clipped)
        NDVI_list = dask.compute(*NDVI_list)
        NDVI_list = [i.values.item(0) for i in NDVI_list]
        physical_score['NDVI_2'] = NDVI_list
        mental_score['NDVI_2'] = NDVI_list
    else:
        for i, tif_file in enumerate(NDVI_file_list[2*month-3:2*month-1]):
            for j, state_idx in enumerate(state_idx_list):
                clipped = get_average_vegetation(tif_file, shape_file, state_idx)
                if i == 0:
                    NDVI_list.append(clipped)
                else:
                    NDVI_list[j] = NDVI_list[j] + clipped
        NDVI_list = dask.compute(*NDVI_list)
        NDVI_list = [i.values.item(0)/2 for i in NDVI_list]
        physical_score[f'NDVI_{month}'] = NDVI_list
        mental_score[f'NDVI_{month}'] = NDVI_list

In [8]:
# create monthly EVI for physical and mental dataframes
# since there are 23 tif files instead of 24 for 1 year, I arbitrarily assign February to only have 1 tif file info
for month in range(1, 13):
    EVI_list=[] # list to store EVI for each state
    if month == 1:
        for i, tif_file in enumerate(EVI_file_list[:month+1]):
            for j, state_idx in enumerate(state_idx_list):
                clipped = get_average_vegetation(tif_file, shape_file, state_idx)
                # first time we create the element
                # other times we update the list
                if i == 0:
                    EVI_list.append(clipped)
                else:
                    EVI_list[j] = EVI_list[j] + clipped
        # compute, average, and assign the values
        EVI_list = dask.compute(*EVI_list)
        EVI_list = [i.values.item(0)/2 for i in EVI_list]
        physical_score['EVI_1'] = EVI_list
        mental_score['EVI_1'] = EVI_list
    elif month == 2:
        for tif_file in (EVI_file_list[month:month+1]):
            for state_idx in state_idx_list:
                clipped = get_average_vegetation(tif_file, shape_file, state_idx)
                # first time we create the element
                EVI_list.append(clipped)
        EVI_list = dask.compute(*EVI_list)
        EVI_list = [i.values.item(0) for i in EVI_list]
        physical_score['EVI_2'] = EVI_list
        mental_score['EVI_2'] = EVI_list
    else:
        for i, tif_file in enumerate(EVI_file_list[2*month-3:2*month-1]):
            for j, state_idx in enumerate(state_idx_list):
                clipped = get_average_vegetation(tif_file, shape_file, state_idx)
                if i == 0:
                    EVI_list.append(clipped)
                else:
                    EVI_list[j] = EVI_list[j] + clipped
        EVI_list = dask.compute(*EVI_list)
        EVI_list = [i.values.item(0)/2 for i in EVI_list]
        physical_score[f'EVI_{month}'] = EVI_list
        mental_score[f'EVI_{month}'] = EVI_list

In [9]:
physical_score

Unnamed: 0,State,Sum,NDVI_1,NDVI_2,NDVI_3,NDVI_4,NDVI_5,NDVI_6,NDVI_7,NDVI_8,...,EVI_3,EVI_4,EVI_5,EVI_6,EVI_7,EVI_8,EVI_9,EVI_10,EVI_11,EVI_12
0,MS,590.1,5329.423124,5038.228774,5196.291953,6225.645306,7275.770719,7641.841195,8002.387424,7968.560167,...,2551.023667,3640.799663,4820.194897,5075.989117,5397.255092,5015.431545,4263.949955,3660.408483,3083.744183,2568.680673
1,NC,455.4,5189.439607,4962.665908,5370.986153,5642.429077,6994.295161,7754.311518,7765.78241,7988.100813,...,2523.807642,3068.694903,4539.343981,5125.662698,5196.340744,5220.437884,4703.816814,3897.017525,2927.589832,2538.400349
2,OK,577.1,3464.414815,3349.282693,3372.739407,3969.720516,5626.756544,6027.617658,5904.594269,6064.967971,...,1786.456842,2236.03757,3575.928981,3975.869826,3970.048943,3786.502928,3638.208829,3085.462418,2351.771847,1996.173156
3,VA,420.8,4906.00214,4471.055915,5040.752281,5239.114804,7108.171105,8164.145008,8159.80648,8329.274363,...,2331.774952,2739.928128,4693.202929,5726.685685,5854.824982,5526.267377,4966.309427,4058.544234,2814.768014,2415.628006
4,WV,546.0,3542.97603,2526.838177,4381.290126,4643.282872,6429.91902,8636.47943,8647.790446,8394.352141,...,1846.635359,2246.714398,3949.833778,6828.859912,6560.123654,5858.610124,5160.141487,4206.812733,2432.18649,1876.536236
5,LA,540.1,4961.101263,4552.627843,5106.118981,6276.646049,6845.427185,7137.266024,7234.882573,7253.951637,...,2479.945059,3589.550348,4286.469173,4727.999276,4773.216608,4597.018875,3980.514161,3597.60356,3054.15673,2421.133332
6,MI,496.4,2084.767475,941.478275,2628.991939,2683.459274,5282.576312,7306.585447,7846.653503,8154.078042,...,1403.243372,1457.859115,2952.934515,4917.622169,5339.097895,5458.228636,4683.373229,3201.610587,1611.567191,1713.357992
7,MA,367.6,4070.133122,4422.891957,3747.512077,4684.810555,6467.627877,7945.500953,8158.832349,8164.921374,...,1851.288803,2174.363827,3968.452566,5303.677557,5628.661794,5244.137011,4832.75034,4125.052044,2371.635269,2261.100286
8,ID,443.2,1756.92157,2408.625796,1470.517666,2713.313391,4218.588562,5361.61682,5223.230723,4627.982637,...,867.416601,1537.839271,2314.396262,3064.20578,3005.863776,2522.985055,2237.259906,1844.564592,1550.682022,758.750179
9,FL,400.4,5743.867876,5600.96964,5829.836013,6017.866621,6225.014306,6419.822597,6602.735937,6664.070034,...,3019.858028,3444.505199,3788.846992,3897.828849,4010.190307,3938.307,3798.913203,3665.323546,3409.893343,3129.358155


In [10]:
mental_score

Unnamed: 0,State,Sum,NDVI_1,NDVI_2,NDVI_3,NDVI_4,NDVI_5,NDVI_6,NDVI_7,NDVI_8,...,EVI_3,EVI_4,EVI_5,EVI_6,EVI_7,EVI_8,EVI_9,EVI_10,EVI_11,EVI_12
0,MS,4018.8,5329.423124,5038.228774,5196.291953,6225.645306,7275.770719,7641.841195,8002.387424,7968.560167,...,2551.023667,3640.799663,4820.194897,5075.989117,5397.255092,5015.431545,4263.949955,3660.408483,3083.744183,2568.680673
1,NC,28536.7,5189.439607,4962.665908,5370.986153,5642.429077,6994.295161,7754.311518,7765.78241,7988.100813,...,2523.807642,3068.694903,4539.343981,5125.662698,5196.340744,5220.437884,4703.816814,3897.017525,2927.589832,2538.400349
2,OK,8683.0,3464.414815,3349.282693,3372.739407,3969.720516,5626.756544,6027.617658,5904.594269,6064.967971,...,1786.456842,2236.03757,3575.928981,3975.869826,3970.048943,3786.502928,3638.208829,3085.462418,2351.771847,1996.173156
3,VA,17214.0,4906.00214,4471.055915,5040.752281,5239.114804,7108.171105,8164.145008,8159.80648,8329.274363,...,2331.774952,2739.928128,4693.202929,5726.685685,5854.824982,5526.267377,4966.309427,4058.544234,2814.768014,2415.628006
4,WV,11564.2,3542.97603,2526.838177,4381.290126,4643.282872,6429.91902,8636.47943,8647.790446,8394.352141,...,1846.635359,2246.714398,3949.833778,6828.859912,6560.123654,5858.610124,5160.141487,4206.812733,2432.18649,1876.536236
5,LA,14226.1,4961.101263,4552.627843,5106.118981,6276.646049,6845.427185,7137.266024,7234.882573,7253.951637,...,2479.945059,3589.550348,4286.469173,4727.999276,4773.216608,4597.018875,3980.514161,3597.60356,3054.15673,2421.133332
6,MI,32346.0,2084.767475,941.478275,2628.991939,2683.459274,5282.576312,7306.585447,7846.653503,8154.078042,...,1403.243372,1457.859115,2952.934515,4917.622169,5339.097895,5458.228636,4683.373229,3201.610587,1611.567191,1713.357992
7,MA,28045.9,4070.133122,4422.891957,3747.512077,4684.810555,6467.627877,7945.500953,8158.832349,8164.921374,...,1851.288803,2174.363827,3968.452566,5303.677557,5628.661794,5244.137011,4832.75034,4125.052044,2371.635269,2261.100286
8,ID,2833.9,1756.92157,2408.625796,1470.517666,2713.313391,4218.588562,5361.61682,5223.230723,4627.982637,...,867.416601,1537.839271,2314.396262,3064.20578,3005.863776,2522.985055,2237.259906,1844.564592,1550.682022,758.750179
9,FL,62372.2,5743.867876,5600.96964,5829.836013,6017.866621,6225.014306,6419.822597,6602.735937,6664.070034,...,3019.858028,3444.505199,3788.846992,3897.828849,4010.190307,3938.307,3798.913203,3665.323546,3409.893343,3129.358155


In [15]:
physical_score.to_csv('physical_df.csv')
mental_score.to_csv('mental_df.csv')

## Modeling

In [11]:
import statsmodels.api as sm

In [12]:
# formula = 'physical health compound score ~ vegetation access'

X_physical = physical_score[physical_score.columns.difference(['State','Sum'])]
y_physical = physical_score['Sum']
model_physical = sm.OLS(y_physical, X_physical)
reg_physical = model_physical.fit()
print(reg_physical.summary())

                                 OLS Regression Results                                
Dep. Variable:                    Sum   R-squared (uncentered):                   0.983
Model:                            OLS   Adj. R-squared (uncentered):              0.965
Method:                 Least Squares   F-statistic:                              56.20
Date:                Thu, 03 Jun 2021   Prob (F-statistic):                    9.22e-16
Time:                        23:10:30   Log-Likelihood:                         -264.90
No. Observations:                  48   AIC:                                      577.8
Df Residuals:                      24   BIC:                                      622.7
Df Model:                          24                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [14]:
# formula = 'mental health compound score ~ vegetation access'

X_mental = mental_score[mental_score.columns.difference(['State','Sum'])]
y_mental = mental_score['Sum']
model_mental = sm.OLS(y_mental, X_mental)
reg_mental = model_mental.fit()
print(reg_mental.summary())

                                 OLS Regression Results                                
Dep. Variable:                    Sum   R-squared (uncentered):                   0.805
Model:                            OLS   Adj. R-squared (uncentered):              0.610
Method:                 Least Squares   F-statistic:                              4.126
Date:                Thu, 03 Jun 2021   Prob (F-statistic):                    0.000471
Time:                        23:11:03   Log-Likelihood:                         -512.08
No. Observations:                  48   AIC:                                      1072.
Df Residuals:                      24   BIC:                                      1117.
Df Model:                          24                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------