# Clean Environment Data

I did not merge total greenhouse gas emissions, total toxic chemicals released, or AQI.

In [1]:
import pandas as pd

In [2]:
# Temperature
# Quarterly min/max temperature extract, reduced to the winter quarter and keyed by
# GEO_ID / AREA_TYPE for the merge further down.
temperature = pd.read_csv(
    "/data/explore/dataExtracts/data/Quarterly temperature min & max 2015-2024.csv",
    dtype={'year': str, 'state_fips': str, 'census_region_code': str},
    low_memory=False,
)

# Keep only "winter" quarter.
# na=False counts a missing quarter as "not winter" (a NaN in the mask would make the
# boolean filter raise), and .copy() makes the result an independent frame so the
# .loc assignments below do not write into a slice (SettingWithCopyWarning).
temperature = temperature[temperature["quarter"].str.contains("Winter", na=False)].copy()

# Which geography codes are present on each row; computed once and reused.
temp_has_region = temperature["census_region_code"].notna()
temp_has_division = temperature["census_division_code"].notna()
temp_has_state = temperature["state_fips"].notna()

# (area_type, rows at that level, column holding that level's code).
# National rows carry no code; they receive 'us' after the formatting step.
temp_levels = [
    ("national", ~temp_has_region & ~temp_has_division & ~temp_has_state, None),
    ("region", temp_has_region & ~temp_has_division & ~temp_has_state, "census_region_code"),
    ("division", temp_has_region & temp_has_division & ~temp_has_state, "census_division_code"),
    ("state", temp_has_region & temp_has_division & temp_has_state, "state_fips"),
]

# Create GEO_ID and area_type.
# GEO_ID starts out as an object column so that the string codes (region, state) and
# the numeric division codes can all be written into it without pandas having to
# change the column's dtype on assignment.
temperature["GEO_ID"] = pd.Series(index=temperature.index, dtype="object")
for temp_area_type, temp_rows, temp_code_column in temp_levels:
    temperature.loc[temp_rows, "area_type"] = temp_area_type
    if temp_code_column is not None:
        temperature.loc[temp_rows, "GEO_ID"] = temperature[temp_code_column]

# Drop extra variables
temperature = temperature.drop(columns=["census_division_code", "census_region_code", 'Unnamed: 0',
                                        'quarter', 'state_abbr', 'state_fips', "state_name"])

# Fix GEO_ID formatting for merge: every code goes through float -> Int64 -> str so it
# becomes an integer-like string, then one-character state codes get a leading zero.
temperature["GEO_ID"] = temperature["GEO_ID"].astype('float').astype('Int64').astype('str')
temp_short_state = (temperature["area_type"] == 'state') & (temperature["GEO_ID"].str.len() == 1)
temperature.loc[temp_short_state, "GEO_ID"] = '0' + temperature["GEO_ID"]
temperature.loc[temperature["area_type"] == 'national', "GEO_ID"] = 'us'

# Make all column names uppercase
temperature.columns = temperature.columns.str.upper()

In [3]:
# Yearly rainfall
# Yearly rainfall totals, keyed by GEO_ID / AREA_TYPE for the merge further down.
rain = pd.read_csv(
    "/data/explore/dataExtracts/data/Yearly rainfall totals 2015-2024.csv",
    dtype={'year': str},
    low_memory=False,
)

# Which geography codes are absent on each row
rain_no_region = rain["census_region_code"].isna()
rain_no_division = rain["census_division_code"].isna()
rain_no_state = rain["state_fips"].isna()

# One mask per geography level
rain_is_national = rain_no_region & rain_no_division & rain_no_state
rain_is_region = ~rain_no_region & rain_no_division & rain_no_state
rain_is_division = ~rain_no_region & ~rain_no_division & rain_no_state
rain_is_state = ~rain_no_region & ~rain_no_division & ~rain_no_state

# Create GEO_ID (999 is a numeric placeholder for national rows; it becomes 'us' below)
rain.loc[rain_is_national, "GEO_ID"] = 999
rain.loc[rain_is_region, "GEO_ID"] = rain["census_region_code"]
rain.loc[rain_is_division, "GEO_ID"] = rain["census_division_code"]
rain.loc[rain_is_state, "GEO_ID"] = rain["state_fips"]

# Create area_type
rain.loc[rain_is_national, "area_type"] = "national"
rain.loc[rain_is_region, "area_type"] = "region"
rain.loc[rain_is_division, "area_type"] = "division"
rain.loc[rain_is_state, "area_type"] = "state"

# Drop extra variables
rain_unused = ["census_division_code", "census_region_code", 'Unnamed: 0',
               'state_abbr', 'state_fips', "state_name"]
rain = rain.drop(rain_unused, axis=1)

# Fix GEO_ID formatting for merge: integer-like strings, two characters for states
rain["GEO_ID"] = rain["GEO_ID"].astype('float').astype('Int64').astype('str')
rain_short_state = (rain["area_type"] == 'state') & (rain["GEO_ID"].str.len() == 1)
rain.loc[rain_short_state, "GEO_ID"] = '0' + rain["GEO_ID"]
rain.loc[rain["area_type"] == 'national', "GEO_ID"] = 'us'

# Make all column names uppercase
rain.columns = [rain_column.upper() for rain_column in rain.columns]

In [4]:
# Energy Consumption
# Total energy consumption; the file already carries an area_type column.
energy = pd.read_csv(
    "/data/explore/dataExtracts/data/total_energy_consumption.csv",
    dtype={'year': str, 'state_fip': str},
    low_memory=False,
)

# Create GEO_ID: 'us' on national rows, the state FIPS code on state rows
energy_is_national = energy["area_type"].eq("national")
energy_is_state = energy["area_type"].eq("state")
energy.loc[energy_is_national, "GEO_ID"] = "us"
energy.loc[energy_is_state, "GEO_ID"] = energy["state_fip"]

# Drop extra variables
energy = energy.drop(["state_abbreviation", "state_fip"], axis=1)

# Make all column names uppercase
energy.columns = [energy_column.upper() for energy_column in energy.columns]

In [5]:
# Lookup from two-letter postal abbreviation to two-character FIPS code,
# listed in FIPS order (the states and DC first, then GU, PR and VI).
state_codes = {
    'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06', 'CO': '08',
    'CT': '09', 'DE': '10', 'DC': '11', 'FL': '12', 'GA': '13', 'HI': '15',
    'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', 'KS': '20', 'KY': '21',
    'LA': '22', 'ME': '23', 'MD': '24', 'MA': '25', 'MI': '26', 'MN': '27',
    'MS': '28', 'MO': '29', 'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33',
    'NJ': '34', 'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39',
    'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45', 'SD': '46',
    'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50', 'VA': '51', 'WA': '53',
    'WV': '54', 'WI': '55', 'WY': '56',
    'GU': '66', 'PR': '72', 'VI': '78',
}

In [6]:
# Cold Related Deaths
# Cold-related death counts at national, region, division, state and county level,
# keyed by GEO_ID / AREA_TYPE for the merge further down.
cold = pd.read_csv(
    "/data/explore/dataExtracts/data/Cold-Related Deaths 2015-2024.csv",
    dtype={'year': str, 'county_code': str, 'census_region_code': str, 'census_division_code': str},
    low_memory=False,
)

# Map FIPS codes onto states (abbreviations missing from state_codes become NaN)
cold['state_fips'] = cold['state'].map(state_codes)

# Clean year: replace provisional labels with the plain year so YEAR matches the other
# tables. The 2024 label is included to agree with the heat-related deaths cleanup;
# it is a no-op when this file has no such rows.
cold_year_labels = {
    "2022 (provisional)": "2022",
    "2023 (provisional)": "2023",
    "2024 (provisional and partial)": "2024",
}
cold["year"] = cold["year"].replace(cold_year_labels)

# Which geography fields are present on each row; computed once and reused.
cold_has_region = cold["census_region_code"].notna()
cold_has_division = cold["census_division_code"].notna()
cold_has_state = cold["state"].notna()
cold_has_county = cold["county_code"].notna()

# (area_type, rows at that level, GEO_ID value for those rows)
cold_levels = [
    ("national",
     ~cold_has_region & ~cold_has_division & ~cold_has_state & ~cold_has_county,
     "us"),
    ("region",
     cold_has_region & ~cold_has_division & ~cold_has_state & ~cold_has_county,
     cold["census_region_code"]),
    ("division",
     cold_has_region & cold_has_division & ~cold_has_state & ~cold_has_county,
     cold["census_division_code"]),
    ("state",
     cold_has_region & cold_has_division & cold_has_state & ~cold_has_county,
     cold["state_fips"]),
    ("county",
     cold_has_region & cold_has_division & cold_has_state & cold_has_county,
     cold["county_code"]),
]

# Create GEO_ID and area_type; rows matching none of the patterns keep NaN in both
for cold_area_type, cold_rows, cold_geo_id in cold_levels:
    cold.loc[cold_rows, "GEO_ID"] = cold_geo_id
    cold.loc[cold_rows, "area_type"] = cold_area_type

# Rename death variable
cold = cold.rename(columns={'deaths': 'cold_related_deaths'})

# Drop extra variables
cold = cold.drop(columns=["census_division_code", "census_region_code", 'Unnamed: 0',
                          'state', 'county_code', "county", "population", 'state_fips'])

# Drop rows that are exact duplicates once the columns above are gone
cold = cold.drop_duplicates()

# Make all column names uppercase
cold.columns = cold.columns.str.upper()

In [7]:
# Heat Related Deaths
# Heat-related death counts at national, region, division, state and county level,
# keyed by GEO_ID / AREA_TYPE for the merge further down.
heat = pd.read_csv(
    "/data/explore/dataExtracts/data/Heat-Related Deaths 2015-2024.csv",
    dtype={'year': str, 'county_code': str, 'census_region_code': str, 'census_division_code': str},
    low_memory=False,
)

# Map FIPS codes onto states
heat['state_fips'] = heat['state'].map(state_codes)

# Clean year: swap each provisional label for the plain year
heat_year_fixes = (
    ("2022 (provisional)", "2022"),
    ("2023 (provisional)", "2023"),
    ("2024 (provisional and partial)", "2024"),
)
for heat_label, heat_year in heat_year_fixes:
    heat.loc[heat["year"] == heat_label, "year"] = heat_year

# Which geography fields are absent on each row
heat_no_region = heat["census_region_code"].isna()
heat_no_division = heat["census_division_code"].isna()
heat_no_state = heat["state"].isna()
heat_no_county = heat["county_code"].isna()

# One mask per geography level
heat_is_national = heat_no_region & heat_no_division & heat_no_state & heat_no_county
heat_is_region = ~heat_no_region & heat_no_division & heat_no_state & heat_no_county
heat_is_division = ~heat_no_region & ~heat_no_division & heat_no_state & heat_no_county
heat_is_state = ~heat_no_region & ~heat_no_division & ~heat_no_state & heat_no_county
heat_is_county = ~heat_no_region & ~heat_no_division & ~heat_no_state & ~heat_no_county

# Create GEO_ID
heat.loc[heat_is_national, "GEO_ID"] = "us"
heat.loc[heat_is_region, "GEO_ID"] = heat["census_region_code"]
heat.loc[heat_is_division, "GEO_ID"] = heat["census_division_code"]
heat.loc[heat_is_state, "GEO_ID"] = heat["state_fips"]
heat.loc[heat_is_county, "GEO_ID"] = heat["county_code"]

# Create area_type
heat.loc[heat_is_national, "area_type"] = "national"
heat.loc[heat_is_region, "area_type"] = "region"
heat.loc[heat_is_division, "area_type"] = "division"
heat.loc[heat_is_state, "area_type"] = "state"
heat.loc[heat_is_county, "area_type"] = "county"

# Rename death variable
heat = heat.rename(columns={'deaths': 'heat_related_deaths'})

# Drop extra variables
heat_unused = ["census_division_code", "census_region_code", 'Unnamed: 0',
               'state', 'county_code', "county", "population", 'state_fips']
heat = heat.drop(heat_unused, axis=1)

# Drop duplicates
heat = heat.drop_duplicates()

# Make all column names uppercase
heat.columns = [heat_column.upper() for heat_column in heat.columns]

In [8]:
# Asthma Rate
# Read from the 'Indicator' sheet; the sheet already carries an area_type column.
asthma = pd.read_excel(
    "/data/explore/dataExtracts/data/asthma_rates.xlsx",
    sheet_name='Indicator',
    dtype={'Year': str, 'StateFIPS': str, 'CountyFIPS': str},
)

# Add missing 0s for FIPS codes: four-character county codes and one-character
# state codes each get a leading zero
asthma_short_county = asthma['CountyFIPS'].str.len().eq(4)
asthma_short_state = asthma['StateFIPS'].str.len().eq(1)
asthma.loc[asthma_short_county, 'CountyFIPS'] = '0' + asthma['CountyFIPS']
asthma.loc[asthma_short_state, 'StateFIPS'] = '0' + asthma['StateFIPS']

# Create GEO_ID from the code that matches each row's area_type
asthma_geo_sources = (
    ("national", "us"),
    ("state", asthma["StateFIPS"]),
    ("county", asthma["CountyFIPS"]),
)
for asthma_area_type, asthma_geo_id in asthma_geo_sources:
    asthma.loc[asthma["area_type"] == asthma_area_type, "GEO_ID"] = asthma_geo_id

# Drop extra variables
asthma = asthma.drop(["StateFIPS", "CountyFIPS", "geo_name"], axis=1)

# Make all column names uppercase
asthma.columns = [asthma_column.upper() for asthma_column in asthma.columns]

In [9]:
# Mean Nitrate Concentration
# Read from the 'Indicator' sheet; the sheet already carries an area_type column.
nitrate = pd.read_excel(
    "/data/explore/dataExtracts/data/mean_nitrate_con.xlsx",
    sheet_name='Indicator',
    dtype={'Year': str, 'StateFIPS': str, 'CountyFIPS': str},
)

# Add missing 0s for FIPS codes
nitrate_county_needs_zero = nitrate['CountyFIPS'].str.len() == 4
nitrate_state_needs_zero = nitrate['StateFIPS'].str.len() == 1
nitrate.loc[nitrate_county_needs_zero, 'CountyFIPS'] = '0' + nitrate['CountyFIPS']
nitrate.loc[nitrate_state_needs_zero, 'StateFIPS'] = '0' + nitrate['StateFIPS']

# Create GEO_ID
nitrate_is_national = nitrate["area_type"].eq("national")
nitrate_is_state = nitrate["area_type"].eq("state")
nitrate_is_county = nitrate["area_type"].eq("county")
nitrate.loc[nitrate_is_national, "GEO_ID"] = "us"
nitrate.loc[nitrate_is_state, "GEO_ID"] = nitrate["StateFIPS"]
nitrate.loc[nitrate_is_county, "GEO_ID"] = nitrate["CountyFIPS"]

# Drop extra variables
nitrate = nitrate.drop(["StateFIPS", "CountyFIPS", "geo_name"], axis=1)

# Make all column names uppercase
nitrate.columns = [nitrate_column.upper() for nitrate_column in nitrate.columns]

In [10]:
# Merge: outer-join every cleaned table onto temperature, one after another, on the
# shared geography/year keys so no row from any table is lost.
environment_keys = ['GEO_ID', 'YEAR', 'AREA_TYPE']
environment = temperature
for environment_part in (rain, energy, cold, heat, asthma, nitrate):
    environment = environment.merge(environment_part, on=environment_keys, how='outer')

# Account for county FIPS changes: rows whose YEAR is anything other than '2015' and
# that carry the old code are moved to the new code.
recoded_counties = {'02270': '02158', '46113': '46102'}
not_2015 = environment["YEAR"] != '2015'
for old_fips, new_fips in recoded_counties.items():
    environment.loc[(environment["GEO_ID"] == old_fips) & not_2015, "GEO_ID"] = new_fips

In [14]:
# Add Geo_names
# The general measures file is used here only as a lookup from GEO_ID / area_type to
# the state and county FIPS codes and the display name.
geo = pd.read_csv(
    "/data/discover/Data/General/general_measures.csv",
    dtype={'year': str, 'state_fips': object, 'county_fips': object,
           'tract_fips': object, 'GEO_ID': str, 'area_type': str, 'Geo_name': str},
    low_memory=False,
)

# Keep just the identifier columns and collapse repeated rows
geo_lookup_columns = ['state_fips', 'county_fips', 'GEO_ID', 'area_type', 'Geo_name']
geo = geo[geo_lookup_columns].drop_duplicates()

# Make all column names uppercase
geo.columns = [geo_column.upper() for geo_column in geo.columns]

# Merge: a left join keeps every environment row, matched or not
environment_merge = environment.merge(geo, on=['GEO_ID', 'AREA_TYPE'], how='left')

In [100]:
# Write the merged table to the final data folder, with a header row and without the
# DataFrame index.
environment_merge.to_csv(
    "/data/discover/Data/Final Data/Environment and Natural Resources/environment.csv",
    index=False,
    header=True,
)

In [20]:
# AQI
# A significant number of county fips codes are missing for AQI so did not include.
# NOTE(review): county GEO_IDs used to be built as StateFIPS + CountyFIPS even when
# CountyFIPS had already been padded to a full five-character code, which produced
# seven-character IDs. That may account for some of the apparent gaps - confirm
# against the data before merging.
aqi = pd.read_excel(
    "/data/explore/dataExtracts/data/AQI.xlsx",
    sheet_name='Indicator',
    dtype={'Year': str, 'StateFIPS': str, 'CountyFIPS': str},
)

# Keep only US geographies: rows that have a state FIPS code, excluding 78
# ('VI' in state_codes). .copy() makes the result an independent frame so the .loc
# assignments below do not write into a slice (SettingWithCopyWarning).
aqi_keep = aqi["StateFIPS"].notna() & ~aqi["StateFIPS"].str.contains("78", na=False)
aqi = aqi[aqi_keep].copy()

# Add missing 0s for FIPS codes
aqi.loc[aqi['CountyFIPS'].str.len() == 4, 'CountyFIPS'] = '0' + aqi['CountyFIPS']
aqi.loc[aqi['StateFIPS'].str.len() == 1, 'StateFIPS'] = '0' + aqi['StateFIPS']

# Create GEO_ID
aqi.loc[aqi["area_type"] == "national", "GEO_ID"] = "us"
aqi.loc[aqi["area_type"] == "state", "GEO_ID"] = aqi["StateFIPS"]

# A five-character CountyFIPS is already a full state+county code and is used as is,
# matching the asthma and nitrate tables; anything shorter still gets the state code
# prepended, as before.
aqi_county_fips = aqi["CountyFIPS"]
aqi_county_geo_id = aqi_county_fips.where(
    aqi_county_fips.str.len() == 5,
    aqi["StateFIPS"] + aqi_county_fips,
)
aqi.loc[aqi["area_type"] == "county", "GEO_ID"] = aqi_county_geo_id

# Drop extra variables
aqi = aqi.drop(columns=["StateFIPS", "CountyFIPS", "geo_name"])

# Make all column names uppercase
aqi.columns = aqi.columns.str.upper()