# Space datasets description

In [3]:
import os
import numpy as np
from spacebench import SpaceEnv, SpaceDataset, DataMaster

In [4]:
space_master = DataMaster().master
space_master = space_master.reset_index()
space_master

Unnamed: 0,dataset,exposure,collection
0,healthd_dmgrcs_mortality_disc,binary,Air Pollution and Mortality
1,cdcsvi_limteng_hburdic_cont,continuous,Social Vulnerability and Welfare
2,climate_relhum_wfsmoke_cont,continuous,Heat Exposure and Wildfires
3,climate_wfsmoke_minrty_disc,binary,Heat Exposure and Wildfires
4,healthd_hhinco_mortality_cont,continuous,Air Pollution and Mortality
5,healthd_pollutn_mortality_cont,continuous,Air Pollution and Mortality
6,county_educatn_election_cont,continuous,Welfare and Elections
7,county_phyactiv_lifexpcy_cont,continuous,Welfare and Elections
8,county_dmgrcs_election_disc,binary,Welfare and Elections
9,cdcsvi_nohsdp_poverty_cont,continuous,Social Vulnerability and Welfare


In [5]:
collections = list(space_master.collection.unique())
collections

['Air Pollution and Mortality',
 'Social Vulnerability and Welfare',
 'Heat Exposure and Wildfires',
 'Welfare and Elections']

In [6]:
space_datasets = DataMaster().list_datasets()
space_datasets

['healthd_dmgrcs_mortality_disc',
 'cdcsvi_limteng_hburdic_cont',
 'climate_relhum_wfsmoke_cont',
 'climate_wfsmoke_minrty_disc',
 'healthd_hhinco_mortality_cont',
 'healthd_pollutn_mortality_cont',
 'county_educatn_election_cont',
 'county_phyactiv_lifexpcy_cont',
 'county_dmgrcs_election_disc',
 'cdcsvi_nohsdp_poverty_cont',
 'cdcsvi_nohsdp_poverty_disc']

In [7]:
os.system("mkdir downloads")
envname = space_datasets[0]
env = SpaceEnv(envname, dir="downloads")
env.__dict__.keys()

mkdir: downloads: File exists


dict_keys(['name', 'datamaster', 'api', 'dir', 'config', 'synthetic_data', 'metadata', 'graph', 'confounding_score_dict', 'smoothness_score_dict'])

In [32]:
for s in space_datasets:
    env = SpaceEnv(s, dir="downloads")

Downloaded: filename cdcsvi_limteng_hburdic_cont.zip, id 7140138, saved to downloads/cdcsvi_limteng_hburdic_cont.zip
Downloaded: filename climate_relhum_wfsmoke_cont.zip, id 7140139, saved to downloads/climate_relhum_wfsmoke_cont.zip
Downloaded: filename climate_wfsmoke_minrty_disc.zip, id 7140140, saved to downloads/climate_wfsmoke_minrty_disc.zip
Downloaded: filename healthd_hhinco_mortality_cont.zip, id 7140141, saved to downloads/healthd_hhinco_mortality_cont.zip
Downloaded: filename healthd_pollutn_mortality_cont.zip, id 7140142, saved to downloads/healthd_pollutn_mortality_cont.zip
Downloaded: filename county_educatn_election_cont.zip, id 7140143, saved to downloads/county_educatn_election_cont.zip
Downloaded: filename county_phyactiv_lifexpcy_cont.zip, id 7140144, saved to downloads/county_phyactiv_lifexpcy_cont.zip
Downloaded: filename county_dmgrcs_election_disc.zip, id 7140145, saved to downloads/county_dmgrcs_election_disc.zip
Downloaded: filename cdcsvi_nohsdp_poverty_cont.

## Summarize number of covariates

In [35]:
num_covars = dict()
for s in space_datasets:
    env = SpaceEnv(s, dir="downloads")
    num_covars[s] = len(env.metadata["covariates"])
num_covars

{'healthd_dmgrcs_mortality_disc': 31,
 'cdcsvi_limteng_hburdic_cont': 12,
 'climate_relhum_wfsmoke_cont': 10,
 'climate_wfsmoke_minrty_disc': 22,
 'healthd_hhinco_mortality_cont': 34,
 'healthd_pollutn_mortality_cont': 35,
 'county_educatn_election_cont': 40,
 'county_phyactiv_lifexpcy_cont': 41,
 'county_dmgrcs_election_disc': 41,
 'cdcsvi_nohsdp_poverty_cont': 13,
 'cdcsvi_nohsdp_poverty_disc': 10}

In [84]:
num_covars = dict()
for c in collections:
    environments = list(space_master.dataset[space_master.collection == c])
    covars = list()
    for e in environments:
        env = SpaceEnv(e, dir="downloads")
        covars.extend(env.metadata["covariates"])
    num_covars[c] = len(set(covars))
num_covars

{'Air Pollution and Mortality': 36,
 'Social Vulnerability and Welfare': 15,
 'Heat Exposure and Wildfires': 22,
 'Welfare and Elections': 47}

## Summarize graph nodes and edges

In [48]:
num_nodes = dict()
for s in space_datasets:
    env = SpaceEnv(s, dir="downloads")
    num_nodes[s] = {
        'nodes': env.graph.number_of_nodes(), #len(list(env.graph))
        'edges': env.graph.number_of_edges() 
    }
num_nodes

{'healthd_dmgrcs_mortality_disc': {'nodes': 3109, 'edges': 9237},
 'cdcsvi_limteng_hburdic_cont': {'nodes': 6828, 'edges': 21585},
 'climate_relhum_wfsmoke_cont': {'nodes': 8616, 'edges': 26695},
 'climate_wfsmoke_minrty_disc': {'nodes': 8616, 'edges': 26695},
 'healthd_hhinco_mortality_cont': {'nodes': 3109, 'edges': 9237},
 'healthd_pollutn_mortality_cont': {'nodes': 3109, 'edges': 9237},
 'county_educatn_election_cont': {'nodes': 3108, 'edges': 9236},
 'county_phyactiv_lifexpcy_cont': {'nodes': 3107, 'edges': 9231},
 'county_dmgrcs_election_disc': {'nodes': 3108, 'edges': 9236},
 'cdcsvi_nohsdp_poverty_cont': {'nodes': 6828, 'edges': 21585},
 'cdcsvi_nohsdp_poverty_disc': {'nodes': 6828, 'edges': 21585}}

##

## Summarize feature importance

In [20]:
env.metadata["feature_importance"]

{'EP_NOINT': '3.4662152161092536',
 'EP_NOHSDP': '1.8179462766183403',
 'EP_NOVEH': '1.7656865450233283',
 'EP_DISABL': '1.7584441492227767',
 'EP_AGE17': '1.6592130847478355',
 'EP_SNGPNT': '1.4202384892311593',
 'EP_LIMENG': '1.3135713769337727',
 'EP_MINRTY': '1.2188969691309663',
 'EP_AGE65': '1.0563060053448783',
 'EP_MUNIT': '0.8957315833727277',
 'EP_UNEMP': '0.7132018763230198'}

In [18]:
env.metadata["confounding_score"]

{'EP_LIMENG': '0.17563999999999996',
 'EP_NOINT': '0.06824',
 'EP_MUNIT': '0.04083999999999997',
 'EP_DISABL': '0.024839999999999973',
 'EP_MINRTY': '0.01651999999999998',
 'EP_NOVEH': '0.01499999999999999',
 'EP_AGE65': '0.012319999999999998',
 'EP_AGE17': '0.00895999999999999',
 'EP_UNEMP': '0.005159999999999987',
 'EP_SNGPNT': '0.004799999999999982'}

In [14]:
feat_imp = dict()
for s in space_datasets:
    env = SpaceEnv(s, dir="downloads")
    feat_imp[s] = {
        'min': min(env.metadata["feature_importance"].values()), #len(list(env.graph))
        'max': max(env.metadata["feature_importance"].values())
    }
feat_imp

{'healthd_dmgrcs_mortality_disc': {'min': '0.009747280607745434',
  'max': '1.323459376395186'},
 'cdcsvi_limteng_hburdic_cont': {'min': '0.43082654688695604',
  'max': '2.741491009718477'},
 'climate_relhum_wfsmoke_cont': {'min': '0.009770981345976515',
  'max': '0.04060395491102834'},
 'climate_wfsmoke_minrty_disc': {'min': '0.429989473182664',
  'max': '5.856974278382038'},
 'healthd_hhinco_mortality_cont': {'min': '0.03502318368504191',
  'max': '1.4301374881994822'},
 'healthd_pollutn_mortality_cont': {'min': '0.010562957613506719',
  'max': '1.3896818352086515'},
 'county_educatn_election_cont': {'min': '0.0012584064992189337',
  'max': '6.039701423477329'},
 'county_phyactiv_lifexpcy_cont': {'min': '0.001964198738300915',
  'max': '7.058624054794294e-05'},
 'county_dmgrcs_election_disc': {'min': '0.0010479852944912603',
  'max': '9.304917351094648'},
 'cdcsvi_nohsdp_poverty_cont': {'min': '0.3372125854480615',
  'max': '3.513431525919548'},
 'cdcsvi_nohsdp_poverty_disc': {'min':