# Attempting to map clustering for each individual burden

```python
# Reference listing: indicator column labels grouped by burden category.
# NOTE(review): three labels below differ from the ones selected from `comm_states`
# in later cells (the "no kitchen or indoor plumbing" label in `housing`, the
# "NPL sites" label in `lp`, and the "Poverty" label in `wd`) — confirm which
# spelling matches the file actually being read.

# Climate change indicators
CC = ['Expected agricultural loss rate (Natural Hazards Risk Index) (percentile)', 'Expected building loss rate (Natural Hazards Risk Index) (percentile)', 'Expected population loss rate (Natural Hazards Risk Index) (percentile)', 'Share of properties at risk of flood in 30 years (percentile)', 'Share of properties at risk of fire in 30 years (percentile)']

# Energy indicators
energy = ['Energy burden (percentile)', 'PM2.5 in the air (percentile)']

# Housing indicators
housing = ['Housing burden (percent) (percentile)', 'Share of homes with no kitchen or indoor plumbing (percent) (percentile)', 'Percent pre-1960s housing (lead paint indicator) (percentile)']

# Health indicators
health = ['Current asthma among adults aged greater than or equal to 18 years (percentile)', 'Diagnosed diabetes among adults aged greater than or equal to 18 years (percentile)', 'Coronary heart disease among adults aged greater than or equal to 18 years (percentile)', 'Low life expectancy (percentile)']

# Legacy pollution indicators
lp = ['Is there at least one abandoned mine in this census tract?', 'Is there at least one Formerly Used Defense Site (FUDS) in the tract?', 'Proximity to hazardous waste sites (percentile)', 'Proximity to NPL sites (percentile)', 'Proximity to Risk Management Plan (RMP) facilities (percentile)']

# Transportation indicators
transport = ['Diesel particulate matter exposure (percentile)', 'DOT Travel Barriers Score (percentile)', 'Traffic proximity and volume (percentile)'] 

# Waste and wastewater indicators
ww = ['Leaky underground storage tanks (percentile)', 'Wastewater discharge (percentile)']

# Workforce development indicators
wd = ['Linguistic isolation (percent) (percentile)', 'Low median household income as a percent of area median income (percentile)', 'Poverty (Less than 200% of federal poverty line) (percentile)', 'Unemployment (percent) (percentile)']
```

In [2]:
# Load packages 
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns         
import numpy as np
import os
import libpysal as lps 
from libpysal.weights import W 
from esda.getisord import G_Local
from map_burden import map_burden
import pprint
from functools import reduce
from scipy.stats import zscore
from scipy import stats

# Show every column when a DataFrame is displayed: setting
# display.max_columns to None lifts pandas' column display limit.
pd.set_option('display.max_columns', None)

In [None]:
# Load the input datasets used by the rest of the notebook.
base_dir = "/capstone/justice40"
# Alternative project root, currently disabled:
# base_dir = "~/MEDS/justice40/data-exploration"

# Every input sits under <base_dir>/data
data_dir = os.path.join(base_dir, "data")

# Complete 2.0 USA file (currently not loaded)
# usa_v2 = pd.read_csv(os.path.join(data_dir, "usa_v2.csv"))

# 2.0 communities file (from current CEJST website)
communities_csv = os.path.join(data_dir, "2.0-communities.csv")
comm_v2 = pd.read_csv(communities_csv)

# Version 2.0 shapefile; the codebook CSV beside it is currently not loaded
# v2 = pd.read_csv(os.path.join(data_dir, "2.0-shapefile-codebook", "2.0-codebook.csv"))
usa_shapefile = os.path.join(data_dir, "2.0-shapefile-codebook", "usa", "usa.shp")
v2_geo = gpd.read_file(usa_shapefile)

In [None]:
# Re-import the project-local helper so this cell can run without the import cell.
from map_burden import map_burden

# Call the helper for the 'Climate Change' burden, passing the communities
# table (comm_v2) and the tract shapefile data (v2_geo).
map_burden(
    df=comm_v2,
    geo_df=v2_geo,
    burden='Climate Change',
)

In [None]:
# Call the helper for the 'Energy' burden with the communities table and tract shapefile data.
map_burden(
    df=comm_v2,
    geo_df=v2_geo,
    burden='Energy',
)

In [None]:
# Call the helper for the 'Health' burden with the communities table and tract shapefile data.
map_burden(
    df=comm_v2,
    geo_df=v2_geo,
    burden='Health',
)

In [None]:
# Call the helper for the 'Housing' burden with the communities table and tract shapefile data.
map_burden(
    df=comm_v2,
    geo_df=v2_geo,
    burden='Housing',
)

In [None]:
# Call the helper for the 'Legacy Pollution' burden with the communities table and tract shapefile data.
map_burden(
    df=comm_v2,
    geo_df=v2_geo,
    burden='Legacy Pollution',
)

In [None]:
# Call the helper for the 'Transportation' burden with the communities table and tract shapefile data.
map_burden(
    df=comm_v2,
    geo_df=v2_geo,
    burden='Transportation',
)

In [None]:
# Call the helper for the 'Waste and Wastewater' burden with the communities table and tract shapefile data.
map_burden(
    df=comm_v2,
    geo_df=v2_geo,
    burden='Waste and Wastewater',
)

In [None]:
# Call the helper for the 'Workforce Development' burden with the communities table and tract shapefile data.
map_burden(
    df=comm_v2,
    geo_df=v2_geo,
    burden='Workforce Development',
)

In [None]:
# Climate Change: select the five climate indicators plus state and tract ID,
# shorten the column names, and add the row-wise average of the indicators.
# NOTE(review): `comm_states` is not assigned in any cell shown here (only
# `comm_v2` is loaded) — confirm where it is created before running.
cc_columns = {
    'Expected agricultural loss rate (Natural Hazards Risk Index) (percentile)': 'ag_loss',
    'Expected building loss rate (Natural Hazards Risk Index) (percentile)': 'building_loss',
    'Expected population loss rate (Natural Hazards Risk Index) (percentile)': 'population_loss',
    'Share of properties at risk of flood in 30 years (percentile)': 'flood_risk',
    'Share of properties at risk of fire in 30 years (percentile)': 'fire_risk',
    'State/Territory': 'state',
    'Census tract 2010 ID': 'tract_id',
}
# Only the indicator columns feed the average; state and tract_id are identifiers.
cc_indicators = ['ag_loss', 'building_loss', 'population_loss', 'flood_risk', 'fire_risk']

# The mapping's key order doubles as the column selection order.
cc = comm_states[list(cc_columns)].rename(columns=cc_columns)
cc['cc_mean'] = cc[cc_indicators].mean(axis=1)

cc.head()

In [None]:
# Energy: select the two energy indicators plus tract ID, shorten the column
# names, and add the row-wise average of the indicators.
energy_columns = {
    'Energy burden (percentile)': 'energy_burden',
    'PM2.5 in the air (percentile)': 'pm_25',
    'Census tract 2010 ID': 'tract_id',
}
# tract_id is an identifier and is left out of the average.
energy_indicators = ['energy_burden', 'pm_25']

# The mapping's key order doubles as the column selection order.
energy = comm_states[list(energy_columns)].rename(columns=energy_columns)
energy['energy_mean'] = energy[energy_indicators].mean(axis=1)

energy.head()

In [None]:
# Housing: select the three housing indicators plus tract ID, shorten the
# column names, and add the row-wise average of the indicators.
housing_columns = {
    'Housing burden (percent) (percentile)': 'housing_burden',
    'Share of homes with no kitchen or indoor plumbing (percentile)': 'no_plumbing',
    'Percent pre-1960s housing (lead paint indicator) (percentile)': 'lead_paint',
    'Census tract 2010 ID': 'tract_id',
}
# tract_id is an identifier and is left out of the average.
housing_indicators = ['housing_burden', 'no_plumbing', 'lead_paint']

# The mapping's key order doubles as the column selection order.
housing = comm_states[list(housing_columns)].rename(columns=housing_columns)
housing['housing_mean'] = housing[housing_indicators].mean(axis=1)

housing.head()

In [None]:
# Health: select the four health indicators plus tract ID, shorten the column
# names, and add the row-wise average of the indicators.
health_columns = {
    'Current asthma among adults aged greater than or equal to 18 years (percentile)': 'asthma',
    'Diagnosed diabetes among adults aged greater than or equal to 18 years (percentile)': 'diabetes',
    'Coronary heart disease among adults aged greater than or equal to 18 years (percentile)': 'heart_disease',
    'Low life expectancy (percentile)': 'low_life_expectancy',
    'Census tract 2010 ID': 'tract_id',
}
# tract_id is an identifier and is left out of the average.
health_indicators = ['asthma', 'diabetes', 'heart_disease', 'low_life_expectancy']

# The mapping's key order doubles as the column selection order.
health = comm_states[list(health_columns)].rename(columns=health_columns)
health['health_mean'] = health[health_indicators].mean(axis=1)

health.head()

In [None]:
# Legacy Pollution: select the five legacy-pollution indicators plus tract ID,
# shorten the column names, and add the row-wise average of the indicators.
# NOTE(review): judging by their labels, the first two indicators look like
# yes/no flags rather than percentiles, yet they are averaged together with the
# percentile columns — confirm their dtype and that this mix is intended.
lp_columns = {
    'Is there at least one abandoned mine in this census tract?': 'abandoned_mines',
    'Is there at least one Formerly Used Defense Site (FUDS) in the tract?': 'defense_site',
    'Proximity to hazardous waste sites (percentile)': 'hazardous_waste',
    'Proximity to NPL (Superfund) sites (percentile)': 'superfund_sites',
    # Spelling 'rmp_facilites' is kept as-is so the resulting column name does not change.
    'Proximity to Risk Management Plan (RMP) facilities (percentile)': 'rmp_facilites',
    'Census tract 2010 ID': 'tract_id',
}
# tract_id is an identifier and is left out of the average.
lp_indicators = ['abandoned_mines', 'defense_site', 'hazardous_waste', 'superfund_sites', 'rmp_facilites']

# The mapping's key order doubles as the column selection order.
lp = comm_states[list(lp_columns)].rename(columns=lp_columns)
lp['lp_mean'] = lp[lp_indicators].mean(axis=1)

lp.head()

In [None]:
# Transportation: select the three transportation indicators plus tract ID,
# shorten the column names, and add the row-wise average of the indicators.
transport_columns = {
    'Diesel particulate matter exposure (percentile)': 'diesel_pm',
    'DOT Travel Barriers Score (percentile)': 'travel_barriers',
    'Traffic proximity and volume (percentile)': 'traffic_proximity',
    'Census tract 2010 ID': 'tract_id',
}
# tract_id is an identifier and is left out of the average.
transport_indicators = ['diesel_pm', 'travel_barriers', 'traffic_proximity']

# The mapping's key order doubles as the column selection order.
transport = comm_states[list(transport_columns)].rename(columns=transport_columns)
transport['transport_mean'] = transport[transport_indicators].mean(axis=1)

transport.head()

In [None]:
# Waste and Wastewater: select the two indicators plus tract ID, shorten the
# column names, and add the row-wise average of the indicators.
ww_columns = {
    'Leaky underground storage tanks (percentile)': 'leaky_storage_tanks',
    'Wastewater discharge (percentile)': 'wastewater_discharge',
    'Census tract 2010 ID': 'tract_id',
}
# tract_id is an identifier and is left out of the average.
ww_indicators = ['leaky_storage_tanks', 'wastewater_discharge']

# The mapping's key order doubles as the column selection order.
ww = comm_states[list(ww_columns)].rename(columns=ww_columns)
ww['ww_mean'] = ww[ww_indicators].mean(axis=1)

ww.head()

In [None]:
# Workforce Development: select the four workforce indicators plus tract ID,
# shorten the column names, and add the row-wise average of the indicators.
wd_columns = {
    'Linguistic isolation (percent) (percentile)': 'ling_isolation',
    'Low median household income as a percent of area median income (percentile)': 'low_income',
    'Percent of individuals below 200% Federal Poverty Line (percentile)': 'poverty',
    'Unemployment (percent) (percentile)': 'unemployment',
    'Census tract 2010 ID': 'tract_id',
}
# tract_id is an identifier and is left out of the average.
wd_indicators = ['ling_isolation', 'low_income', 'poverty', 'unemployment']

# The mapping's key order doubles as the column selection order.
wd = comm_states[list(wd_columns)].rename(columns=wd_columns)
wd['wd_mean'] = wd[wd_indicators].mean(axis=1)

wd.head()

In [None]:
# Concat without ids
# dfs = [cc, health, housing, energy, lp, transport, ww, wd]
# complete = pd.concat(dfs, axis=1)
# complete.head()

In [None]:
# Merge using tract ID
# dfs = [cc, health, housing, energy, lp, transport, ww, wd]
# df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['tract_id'],
#                                             how='outer'), 
#                                             dfs)
# df_merged = df_merged.set_index('tract_id')
# df_merged.head()
# df_merged.shape

In [None]:
# Drop NAs from mean column
# cc_clean = cc_clean.dropna(subset=['cc_mean'])
# cc_clean.isna().sum()

In [None]:
# Get the appropriate data frame
# Lookup from each burden's display name to the per-burden DataFrame built in
# the cells above (the one holding that burden's indicators and mean column).
burden_names = {
    'Climate Change': cc,
    'Energy': energy,
    'Health': health,
    'Housing': housing,
    'Legacy Pollution': lp,
    'Transportation': transport,
    'Waste and Wastewater': ww,
    'Workforce Development': wd,
}

# Short string label for each burden. This mapping used to be an indented
# second assignment to `burden_names`, which is an IndentationError at top
# level and, if dedented, would replace the DataFrame lookup with plain
# strings. It is kept under its own name so both lookups stay available.
burden_abbrevs = {
    'Climate Change': 'cc',
    'Energy': 'energy',
    'Health': 'health',
    'Housing': 'housing',
    'Legacy Pollution': 'lp',
    'Transportation': 'transport',
    'Waste and Wastewater': 'ww',
    'Workforce Development': 'wd',
}

# Pull one burden's DataFrame and display it as the cell output.
data = burden_names['Energy']
data