# Runs!

This file:
- Calculates and exports five files
- US_Sovi_Score = a sovi analysis using the entire us outputs county score and rank
- FEMA_Region_Sovi_Score = a sovi analysis by fema region outputs county score and rank
- State_Sovi_Score = a sovi analysis by state for 10 states outputs county score and rank
- county_in_state_rank = a ranking of the counties of 10 states from the us, region level, and state level analysis
- variable_contributions = net contributions of each variable in each analysis above

In [1]:
# Import modules, define directories
%run 'spss_pca.ipynb'
%run 'drop1_place.ipynb'

import pandas as pd
import geopandas as gpd
from scipy.stats import spearmanr
from pyhere import here
import numpy as np
# from factor_analyzer import FactorAnalyzer

# import data_prep # testing
# from spss_pca import SPSS_PCA # testing
# from drop1_place import * # testing

pd.set_option("chained_assignment", None)

path = {
    "dscr": here("data", "scratch"),
    "drpub": here("data", "raw", "public"),
    "drpriv": here("data", "raw", "private"),
    "ddpub": here("data", "derived", "public"),
    "ddpriv": here("data", "derived", "private"),
    "rfig": here("results", "figures"),
    "roth": here("results", "other"),
    "rtab": here("results", "tables")
}

# data_prep.db1.copy() # testing
# counties['GEOID'] = counties.index.values 

skipping
skipping


In [2]:
# Import data
# counties = gpd.read_file(here(path["ddpub"], "counties.gpkg"))
counties = pd.read_csv(here(path["ddpub"], "counties.csv"), dtype = {'GEOID': object})
US_All = pd.read_csv(here("data", "raw", "public", "spielman", "output", "sovi_inputs.csv"))
# counties = pd.read_csv(here("data", "scratch", "counties_test.csv"))


In [3]:
# Select only the relevant columns

# Attribute name and expected influence on vulnerability
input_names = [['MEDAGE_ACS', 'pos', 'person', 'Median Age'],
               ['BLACK_ACS', 'pos', 'person', 'Pop African-American (%)'],
               ['QNATAM_ACS', 'pos', 'person', 'Pop Native American (%)'],
               ['QASIAN_ACS', 'pos', 'person', 'Pop Asian (%)'],
               ['QHISP_ACS', 'pos', 'person', 'Pop Hispanic (%)'],
               ['QAGEDEP_ACS', 'pos', 'person', 'Age Dependency (%)'],
               ['QPUNIT_ACS', 'pos', 'person', 'Persons Per Housing Unit'],
               ['PRENTER_ACS', 'pos', 'hu', 'Rental Housing (%)'],
               ['QNRRES_ACS', 'pos', 'person', 'Nursing Home Residents (%)'],
               ['QFEMALE_ACS', 'pos', 'person', 'Pop Female (%)'],
               ['QFHH_ACS', 'pos', 'hu', 'Female-Headed Households (%)'],
               ['QUNOCCHU_ACS', 'pos', 'hu', 'Vacant Housing (%)'],
               ['PERCAP_ALT', 'neg', 'person', 'Per-Capita Income'],
               ['QESL_ALT', 'pos', 'person', 'English as Second Language (%)'],
               ['QCVLUN', 'pos', 'person', 'Unemployment (%)'],
               ['QPOVTY', 'pos', 'person', 'Poverty (%)'],
               ['QMOHO', 'pos', 'hu', 'Mobile Homes (%)'],
               ['QED12LES_ALT', 'pos', 'person',
                   'Adults Completed <Grade 12 (%)'],
               ['QFEMLBR', 'pos', 'person', 'Female Employment (%)'],
               ['QEXTRCT_ALT', 'pos', 'person',
                   'Extractive Sector Employment (%)'],
               ['QSERV_ALT', 'pos', 'person', 'Service Sector Employment (%)'],
               ['QSSBEN', 'pos', 'hu', 'Social Security Income (%)'],
               ['QNOAUTO_ALT', 'pos', 'hu', 'No Automobile (%)'],
               ['QFAM', 'neg', 'person', 'Children in Married Families (%)'],
               ['QRICH200K', 'neg', 'hu', 'Annual Income >$200K (%)'],
               ['MDGRENT_ALT', 'neg', 'hu', 'Median Rent'],
               ['MHSEVAL_ALT', 'neg', 'hu', 'Median Home Value'],
               ['POPDENS', 'pos', 'person', 'Population Density']]

# Get attribute names
attr_names1 = [j[0] for j in input_names] + ['GEOID']
attr_names2 = [j[0] for j in input_names] + ['Geo_FIPS']

# Select only the columns needed to compute SoVI
counties = counties[attr_names1]
US_All = US_All[attr_names2]

counties["GEOID"] = "g" + counties["GEOID"]
counties['stateID'] = counties.GEOID.str.slice(0, 3, 1)
attr_names1.remove('GEOID')
counties = counties.set_index(counties["GEOID"]).sort_index()
# sovi_input = sovi_input.set_index(sovi_input["GEOID"]).drop(labels = "GEOID", axis = 1)

US_All['stateID'] = US_All.Geo_FIPS.str.slice(0, 3, 1)
attr_names2.remove('Geo_FIPS')
US_All = US_All.set_index(US_All["Geo_FIPS"]).sort_index()


In [4]:
%%script echo skipping
pd.options.display.max_rows = 150
missing = pd.DataFrame({"na": US_All.isna().sum()})
missing.loc[missing["na"] > 0]

skipping


In [5]:
%%script echo skipping
missing = pd.DataFrame({"na": counties.isna().sum()})
missing.loc[missing["na"] > 0]

skipping


In [6]:
%%script echo skipping

# Uncomment these two lines to do this check:

# counties = counties.drop(['GEOID'], axis=1, inplace=False)
# US_All = US_All.drop(['Geo_FIPS'], axis=1, inplace=False)

# Comment them again if you want to run the rest of the script

test = counties.merge(US_All, how = "inner", left_on = "GEOID", right_on = 'Geo_FIPS')

skipping


In [7]:
%%script echo skipping
test[["QFAM_x", "QFAM_y"]].loc[test.QFAM_x.isna()]

skipping


In [8]:
%%script echo skipping
# Check for missing data
for i in test.columns:
    x = test[i].isnull().sum()
    if x > 0:
        print(i, x)
        
# Check for infinities
counties_num = test.select_dtypes(include=['int64','float64'])
for i in counties_num.columns:
    xmin = counties_num[i].min()
    xmax = counties_num[i].max()
    if xmin == -np.inf:
        print(i, "contains a negative infinity")
    elif xmax == np.inf:
        print(i, "contains a positive infinity")

skipping


In [9]:
%%script echo skipping
counties.eq(US_All).sum()

skipping


In [10]:
%%script echo skipping
print(test[["POPDENS_x", "POPDENS_y"]].head(4))

counties[["POPDENS"]].round(0).eq(US_All[["POPDENS"]].round(0)).sum()
# Despite being approximately the same... I think it's throwing my results off
# approximately the same

skipping


Issue causing inconsistent results: the land area variable comes from different sources in the two analyses; while approximately equal there were slight differences. These differences propogate into the "POPDENS" variable, which in turn leads to slightly different principle components, which cause inconsistent rankings.

How to address:

Look through social explorer documentation and identify exactly where they got their data from. Get the exact same data, and then try to find an analog in ACS data if possible.

In [11]:
# Flipping Signs -- looks like this comes before calculating the Z-score

# To ensure that each variable contributes as expected to the final Sovi
# Index following Eric Tate (2012?) we flip the signs of the input data.
for name, sign, sample, hrname in input_names:
    if sign == 'neg':
        counties[name] = -counties[name].values
    elif sign == 'pos':
        pass
    else:
        print("problem in flipping signs")
        raise

In [12]:
# Build FEMA subRegions Dict values= state ID's
FEMA_subs = dict()
FEMA_subs['FEMA_1'] = ['g23g33g25', 'g50', 'g09', 'g44']
FEMA_subs['FEMA_2'] = ['g36', 'g34']
FEMA_subs['FEMA_3'] = ['g42', 'g10', 'g11', 'g24', 'g51', 'g54']
FEMA_subs['FEMA_4'] = ['g21', 'g47', 'g37', 'g28', 'g01', 'g13', 'g45', 'g12']
FEMA_subs['FEMA_5'] = ['g27', 'g55', 'g26', 'g17', 'g18', 'g39']
FEMA_subs['FEMA_6'] = ['g35', 'g48', 'g40', 'g05', 'g22']
FEMA_subs['FEMA_7'] = ['g31', 'g19', 'g20', 'g29']
FEMA_subs['FEMA_8'] = ['g30', 'g38', 'g56', 'g46', 'g49', 'g08']
FEMA_subs['FEMA_9'] = ['g06', 'g32', 'g04']
FEMA_subs['FEMA_10'] = ['g53', 'g41', 'g16']

In [13]:
####################################
# DataFrames to hold US, FEMA region, and state level results
####################################

# Dict to hold variable loadings
# key will be [USA, Fema_region, stateid] depending on level of analysis
varContrib = {}

# National Score
US_Sovi_Score = pd.DataFrame(index=counties.GEOID,
                             columns=['sovi', 'rank'])

# In the FEMA_Region_Sovi_Score data frame ranks are BY FEMA REGION.
# The data frame holds both the SOVI score and the county rank
# This means that there should be 10 counties with rank 1 (one for each
# FEMA Region)
FEMA_Region_Sovi_Score = pd.DataFrame(index=counties.GEOID,
                                      columns=['sovi', 'rank', 'fema_region'])

# Create New England conglomerate of states
# These are the FIPS codes for the states with the letter "g" appended
counties.loc[counties.stateID.isin(['g23', 'g33', 'g25']), 'stateID'] = 'g23g33g25'

# These are the states in the state level analysis
stateList = ['g23g33g25', 'g36', 'g51', 'g13',
             'g17', 'g48', 'g29', 'g46', 'g06', 'g16']

# In the State_Sovi_Score data frame ranks are BY STATE.
# The data frame holds both the SOVI score and the county rank
# This means that there should be 10 counties with rank 1 (one for each
# state in stateList)
State_Sovi_Score = pd.DataFrame(
    index=counties.index[counties.stateID.isin(stateList)],
    columns=['sovi', 'rank', 'state_id'])


In [14]:
#######################
# Compute National SoVI
#######################
# compute SoVI
inputData = counties.drop(['GEOID', 'stateID'], axis=1, inplace=False)
inputData_array = inputData.values  # Convert DataFrame to NumPy array
pca = SPSS_PCA(inputData_array, reduce=True, varimax=True)

# # FOR THE PACKAGE
# inputData_norm = (inputData - inputData.mean(axis=0)) / inputData.std(axis=0)

# # fit factor analyzer with principal components and varimax rotation
# fa = FactorAnalyzer(rotation="varimax", n_factors=28, method='principal')
# fa.fit(inputData_norm)

# # get the rotated factor pattern
# loadings = pd.DataFrame(fa.loadings_, index=inputData_norm.columns, columns=[f"Factor{i+1}" for i in range(28)])


# BACK TO TRAD
sovi_actual_us = pca.scores_rot.sum(1)
sovi_actual_us = pd.DataFrame(
    sovi_actual_us, index=counties.GEOID, columns=['sovi'])

# rank
sovi_actual_us['rank'] = sovi_actual_us.rank(
    method='average', ascending=False)
US_Sovi_Score.update(sovi_actual_us)

attrib_contribution_us = pca.weights_rot.sum(1)

# Generate dictionary for all net loadings by variable for US
varContrib['USA'] = zip(attr_names1, attrib_contribution_us.tolist())

# quick check of ranks max should equal number of counties in US
try:
    US_Sovi_Score['rank'].max() == len(counties)
except:
    print("error in ranking check")
    raise

# cleanup
del inputData
# del inputData_norm
del sovi_actual_us
del attrib_contribution_us

In [15]:
pd.DataFrame(pca.weights)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.021383,-0.137227,-0.075875,0.231525,-0.017448,0.067022,-0.051047,0.052309
1,0.05728,0.091504,-0.137632,0.024604,-0.081076,0.166414,-0.18152,-0.135693
2,0.019667,0.036169,0.025718,-0.020071,0.092454,-0.599803,-0.010716,0.300664
3,-0.082364,0.070863,0.003592,0.152712,0.053154,0.031671,-0.068794,-0.105764
4,-0.004152,0.063162,0.244533,0.123365,0.10306,0.195329,0.300371,0.100388
5,0.049233,-0.125634,-0.035984,0.172031,0.166851,0.095044,0.074983,0.2389
6,0.005883,0.111629,0.159538,-0.074656,-0.133696,-0.099928,0.01385,0.283031
7,-0.030663,0.119931,-0.050503,-0.089807,0.27448,0.056943,0.039185,-0.169887
8,0.018391,-0.077717,-0.044631,-0.014778,0.342136,0.060425,0.037383,-0.223547
9,-0.010905,0.011737,-0.161958,-0.077595,0.066364,0.251587,0.094554,0.657473


In [16]:
%%script echo skipping

# This is for using the package

# calculate eigenvectors and eigenvalues
eigenvalues, eigenvectors = np.linalg.eig(fa.corr_)
eigenvalues
# sort the eigenvalues and eigenvectors in descending order
idx = eigenvalues.argsort()[::-1]
eigenvalues = eigenvalues[idx]

# convert to dataframes
eigenvalues_df = pd.DataFrame({'Eigenvalue': eigenvalues}, index=inputData_norm.columns)
eigenvalues_df['Proportion'] = eigenvalues_df['Eigenvalue'] / eigenvalues_df['Eigenvalue'].sum()
eigenvalues_df['Cumulative Proportion'] = eigenvalues_df['Proportion'].cumsum()

# display dataframes
print("Eigenvalues:")
display(eigenvalues_df.style.format({'Eigenvalue': '{:.4f}', 'Proportion': '{:.4f}', 'Cumulative Proportion': '{:.4f}'}))

skipping


In [17]:
######################
# FEMA REGION SOVI
######################
for i in FEMA_subs:

    # Subset FEMA subregion
    FEMARegionData = counties[counties['stateID'].isin(FEMA_subs[i])]

    # compute SoVI
    inputData = FEMARegionData.drop(
        ['GEOID', 'stateID'], axis=1, inplace=False)
    # pca = SPSS_PCA(inputData, reduce=True, varimax=True)
    
    #NEW
    inputData_array = inputData.values  # Convert DataFrame to NumPy array
    pca = SPSS_PCA(inputData_array, reduce=True, varimax=True)
    
    sovi_actual_fema = pca.scores_rot.sum(1)

    # load into df for merge
    sovi_actual_fema = pd.DataFrame(
        sovi_actual_fema, index=FEMARegionData.index, columns=['sovi'])
    # add fema region to df
    sovi_actual_fema['fema_region'] = i
    # rank
    sovi_actual_fema['rank'] = sovi_actual_fema['sovi'].rank(
        method='average', ascending=False)

    FEMA_Region_Sovi_Score.update(sovi_actual_fema)

    attrib_contribution_fema = pca.weights_rot.sum(1)

    # Write attribute contribution output
    # Generate dictionary for all net loadings by variable and region
    varContrib[i] = zip(attr_names1, attrib_contribution_fema.tolist())

# cleanup
del FEMARegionData
del inputData
del sovi_actual_fema
del attrib_contribution_fema


In [18]:
pd.DataFrame(pca.weights)

Unnamed: 0,0,1,2,3,4,5,6
0,0.039141,-0.141368,-0.004473,0.182784,0.022983,0.045013,-0.013027
1,-0.096305,0.014893,0.04055,0.07546,-0.186681,0.256516,-0.085716
2,0.027965,-0.027305,0.124524,0.035601,-0.375882,-0.041309,-0.230695
3,-0.110619,0.011948,-0.00024,0.058619,-0.108585,0.222728,0.003812
4,0.017173,0.132772,0.073859,0.216721,0.127403,-0.074535,0.144827
5,0.071459,-0.107454,0.02415,0.124278,0.217567,0.064833,-0.06776
6,0.00517,0.140135,0.007041,-0.097381,-0.018434,-0.170466,-0.217839
7,-0.078058,0.054808,0.148964,-0.13402,0.097584,0.106389,0.195894
8,0.006243,-0.048844,0.082124,-0.00441,0.258264,0.357773,0.046734
9,-0.058937,-0.041232,0.055742,-0.080164,0.415614,-0.164649,-0.176633


In [19]:
#############################################
# State Analysis
#############################################
for st in stateList:
    # Subset FEMA subregion
    stateData = counties[counties.stateID == st]

    # compute SoVI
    inputData = stateData.drop(['GEOID', 'stateID'], axis=1, inplace=False)
    # pca = SPSS_PCA(inputData, reduce=True, varimax=True)
    
    # NEW:
    inputData_array = inputData.values  # Convert DataFrame to NumPy array
    pca = SPSS_PCA(inputData_array, reduce=True, varimax=True)
    
    sovi_actual = pca.scores_rot.sum(1)
    sovi_actual = pd.DataFrame(
        sovi_actual, index=stateData.index, columns=['sovi'])
    sovi_actual['state_id'] = st
    # rank w/in state
    sovi_actual['rank'] = sovi_actual['sovi'].rank(
        method='average', ascending=False)
    State_Sovi_Score.update(sovi_actual)
    attrib_contribution = pca.weights_rot.sum(1)
    varContrib[st] = zip(attr_names1, attrib_contribution.tolist())

# cleanup
del stateData
del inputData
del sovi_actual
del attrib_contribution

In [20]:
pd.DataFrame(pca.weights)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.055826,0.142423,-0.062299,0.097142,0.009424,-0.019503,0.086546,0.083678
1,-0.055752,-0.004592,0.070898,-0.050083,0.38369,-0.136172,-0.349197,0.124428
2,0.021549,0.052587,0.098739,0.004819,0.298201,0.12206,-0.077092,-0.53128
3,-0.11183,0.002837,0.063914,0.047797,0.064193,-0.270329,-0.239285,0.342498
4,0.050074,-0.132467,0.022844,0.226631,-0.00229,-0.033968,0.056487,-0.003788
5,0.100162,0.103402,0.022544,0.073429,-0.167622,0.075189,-0.089235,0.110589
6,-0.00792,-0.136536,0.051934,-0.139162,-0.032407,0.270002,0.007723,-0.162059
7,-0.070463,-0.060385,0.166028,0.006139,0.028213,-0.247902,0.250314,0.006728
8,0.00549,0.086761,0.075026,0.035333,-0.128862,-0.217133,-0.058944,-0.196075
9,-0.05324,-0.005044,0.128986,0.02029,-0.23361,0.307673,-0.027962,-0.066072


In [21]:
###################################################
# Make Var Contributions Data Frame
###################################################
variable_contributions = pd.DataFrame(index=attr_names1)
# for area in varContrib.iterkeys():
for area in varContrib.keys():
    variable_contributions[area] = [x for i, x in varContrib[area]]

In [22]:
##########################################################################
# Ranks w/ Geographic Extent
# For each county rank within state for US, state, and fema_region sovis
##########################################################################

county_in_state_rank = pd.DataFrame(index=State_Sovi_Score.index,
                                    columns=['state_sovi_rank', 'fema_region_sovi_rank', 'us_sovi_rank'])

for st in stateList:
    if st == 'g23g33g25':
        # get all counties in the three NE states and rank for us
        st_cty_scores1 = US_Sovi_Score.loc[['g23' in s for s in US_Sovi_Score.index], 'sovi']
        st_cty_scores2 = US_Sovi_Score.loc[['g33' in s for s in US_Sovi_Score.index], 'sovi']
        st_cty_scores3 = US_Sovi_Score.loc[['g25' in s for s in US_Sovi_Score.index], 'sovi']
        st_cty_scores = pd.concat([st_cty_scores1, st_cty_scores2, st_cty_scores3])

        county_in_state_rank.loc[st_cty_scores.index, 'us_sovi_rank'] = st_cty_scores.rank(method='average', ascending=False)

        # get all counties in state and rank for fema region
        st_cty_scores1 = FEMA_Region_Sovi_Score.loc[['g23' in s for s in FEMA_Region_Sovi_Score.index], 'sovi']
        st_cty_scores2 = FEMA_Region_Sovi_Score.loc[['g33' in s for s in FEMA_Region_Sovi_Score.index], 'sovi']
        st_cty_scores3 = FEMA_Region_Sovi_Score.loc[['g25' in s for s in FEMA_Region_Sovi_Score.index], 'sovi']
        st_cty_scores = pd.concat([st_cty_scores1, st_cty_scores2, st_cty_scores3])

        county_in_state_rank.loc[st_cty_scores.index, 'fema_region_sovi_rank'] = st_cty_scores.rank(method='average', ascending=False)

        # county rank in state only sovi
        county_in_state_rank.loc[st_cty_scores.index, 'state_sovi_rank'] = State_Sovi_Score.loc[State_Sovi_Score['state_id'] == 'g23g33g25', 'rank']

    else:
        st_cty_scores = US_Sovi_Score.loc[[st in s for s in US_Sovi_Score.index], 'sovi']
        county_in_state_rank.loc[st_cty_scores.index, 'us_sovi_rank'] = st_cty_scores.rank(method='average', ascending=False)
        # get all counties in state and rank for fema region
        st_cty_scores = FEMA_Region_Sovi_Score.loc[[st in s for s in FEMA_Region_Sovi_Score.index], 'sovi']
        county_in_state_rank.loc[st_cty_scores.index, 'fema_region_sovi_rank'] = st_cty_scores.rank(method='average', ascending=False)

        # county rank in state only sovi
        st_cty_scores = State_Sovi_Score.loc[State_Sovi_Score['state_id'] == st, 'rank']
        county_in_state_rank.loc[st_cty_scores.index, 'state_sovi_rank'] = st_cty_scores

In [23]:
county_in_state_rank

Unnamed: 0_level_0,state_sovi_rank,fema_region_sovi_rank,us_sovi_rank
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
g06001,48.0,54.0,51.0
g06003,58.0,4.0,36.0
g06005,20.0,15.0,38.0
g06007,26.0,23.0,24.0
g06009,52.0,38.0,37.0
...,...,...,...
g51800,121.0,105.0,96.0
g51810,118.0,112.0,108.0
g51820,74.0,67.0,25.0
g51830,93.0,92.0,69.0


START HERE

In [24]:
counties = counties.rename(columns={"GEOID": "Geo_FIPS"})

#####################################################
# Drop 1 Variable
#####################################################
USvarRanks = variable_contributions.USA.sort_values()
dropLevels = USvarRanks.index

#build multindex
geoLevels = counties.Geo_FIPS
geoLabels = []
for _ in range(len(dropLevels)):
    geoLabels.extend(range(len(geoLevels)))
dropLabels = np.repeat(range(len(dropLevels)), len(geoLevels))

US_Drop1_Multi_Index = pd.MultiIndex(levels=[dropLevels, geoLevels],
                                    codes=[dropLabels, geoLabels], # was labels but was getting error, looked at documentation and switched to a parameter that sounded similar
                                    names=['DroppedVar', 'Geo_FIPS'])

US_Drop1_NetContrib = pd.DataFrame(index=dropLevels, columns=dropLevels)

US_SoVI_Drop1_Score = pd.DataFrame(index=US_Drop1_Multi_Index, columns=['sovi'])


for j in dropLevels:
    US_dropj = counties.drop([j, 'Geo_FIPS', 'stateID'], axis=1, inplace=False)
    # pca = SPSS_PCA(US_dropj, reduce=True, varimax=True)
    # NEW
    US_dropj_array = US_dropj.values  # Convert DataFrame to NumPy array
    pca = SPSS_PCA(US_dropj_array, reduce=True, varimax=True)
    
    sovi_actual = pca.scores_rot.sum(1)
    sovi_actual = pd.DataFrame(sovi_actual, index=geoLevels, columns=['sovi'])
    US_SoVI_Drop1_Score.loc[j, 'sovi'] = sovi_actual.values
    attrib_contribution = pd.DataFrame(data=pca.weights_rot.sum(1), index=US_dropj.columns)

    attrib_contribution = attrib_contribution.transpose()
    attrib_contribution.index = [j]
    US_Drop1_NetContrib.loc[attrib_contribution.columns,j] = attrib_contribution.loc[j, :]


# sort by rank order
US_rank_order=abs(variable_contributions.USA).rank(method='average',ascending=False).sort_values().index # original rank order
US_Drop1_NetContrib=US_Drop1_NetContrib.loc[US_rank_order] # sort rows
US_Drop1_NetContrib=US_Drop1_NetContrib.loc[:,US_rank_order] # sort columns

# ranked version of the drop 1 variable table
US_Drop1_NetContrib_ranks=US_Drop1_NetContrib.copy()
US_Drop1_NetContrib_ranks=US_Drop1_NetContrib_ranks.apply(lambda x: abs(x).rank(method='average',ascending=False)) # convert absolute scores to ranks
US_Drop1_NetContrib_ranks=US_Drop1_NetContrib_ranks.loc[US_rank_order] # sort rows
US_Drop1_NetContrib_ranks=US_Drop1_NetContrib_ranks.loc[:,US_rank_order] # sort columns

######################
# CORRELATIONS
######################
state_corrs = pd.DataFrame(index = stateList, columns = ['spearman_r_st_fema', 'pvalue_st_fema', 'spearman_r_st_us', 'pvalue_st_us'])
for st in stateList:
  if st == 'g23g33g25':
    multi_state_data_tmp1 = county_in_state_rank.loc[['g23' in s for s in county_in_state_rank.index], ]
    multi_state_data_tmp2 = county_in_state_rank.loc[['g25' in s for s in county_in_state_rank.index], ]
    multi_state_data_tmp3 = county_in_state_rank.loc[['g33' in s for s in county_in_state_rank.index], ]
    multi_state_data_tmp = pd.concat([multi_state_data_tmp1, multi_state_data_tmp2, multi_state_data_tmp3])
    st_fema_spearman = spearmanr(multi_state_data_tmp[['state_sovi_rank', 'fema_region_sovi_rank']])
    st_us_spearman = spearmanr(multi_state_data_tmp[['state_sovi_rank', 'us_sovi_rank']])
    state_corrs.loc['g23g33g25', ] = [st_fema_spearman[0], st_fema_spearman[1], st_us_spearman[0], st_us_spearman[1]]
  else:
    st_fema_spearman = spearmanr(county_in_state_rank.loc[[st in s for s in county_in_state_rank.index], ['state_sovi_rank', 'fema_region_sovi_rank']])
    st_us_spearman = spearmanr(county_in_state_rank.loc[[st in s for s in county_in_state_rank.index], ['state_sovi_rank', 'us_sovi_rank']])
    state_corrs.loc[st, ] = [st_fema_spearman[0], st_fema_spearman[1], st_us_spearman[0], st_us_spearman[1]]

In [25]:
state_corrs

Unnamed: 0,spearman_r_st_fema,pvalue_st_fema,spearman_r_st_us,pvalue_st_us
g23g33g25,0.935272,0.0,0.745591,0.0
g36,0.60691,0.0,0.787716,0.0
g51,0.896188,0.0,0.675348,0.0
g13,0.796931,0.0,0.503329,0.0
g17,0.336492,0.000544,0.504419,0.0
g48,0.680187,0.0,0.615105,0.0
g29,0.816752,0.0,0.902691,0.0
g46,0.866486,0.0,0.605427,0.0
g06,0.690978,0.0,0.53133,1.8e-05
g16,0.880197,0.0,0.661311,1e-06


In [26]:
counties

Unnamed: 0_level_0,MEDAGE_ACS,BLACK_ACS,QNATAM_ACS,QASIAN_ACS,QHISP_ACS,QAGEDEP_ACS,QPUNIT_ACS,PRENTER_ACS,QNRRES_ACS,QFEMALE_ACS,...,QSERV_ALT,QSSBEN,QNOAUTO_ALT,QFAM,QRICH200K,MDGRENT_ALT,MHSEVAL_ALT,POPDENS,Geo_FIPS,stateID
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
g01001,37.0,0.179447,0.002876,0.008042,0.023997,0.187232,2.718521,0.201386,0.001942,0.513867,...,0.160870,0.292766,0.051370,-0.726847,-0.020116,-836,-137900.0,91.834949,g01001,g01
g01003,41.2,0.092356,0.005430,0.007352,0.043198,0.230693,2.484914,0.181393,0.003035,0.512787,...,0.178005,0.356091,0.030817,-0.751875,-0.034721,-874,-172900.0,115.252135,g01003,g01
g01005,38.2,0.459063,0.000910,0.002039,0.049692,0.203648,2.603099,0.264186,0.007499,0.463067,...,0.166612,0.371325,0.099756,-0.467056,-0.006686,-577,-88700.0,31.042768,g01005,g01
g01007,39.4,0.217137,0.003514,0.001098,0.018402,0.186789,2.948822,0.157066,0.003953,0.466204,...,0.134649,0.374763,0.051178,-0.629812,-0.003791,-581,-91600.0,36.571889,g01007,g01
g01009,39.1,0.012355,0.002071,0.001688,0.080848,0.211882,2.706148,0.167249,0.004020,0.504246,...,0.139538,0.370976,0.038324,-0.756860,-0.008701,-588,-115200.0,89.125526,g01009,g01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
g56037,32.9,0.004352,0.001732,0.005286,0.151606,0.166781,2.613124,0.254671,0.002620,0.478218,...,0.151213,0.203145,0.032235,-0.756602,-0.033928,-881,-174600.0,4.209406,g56037,g56
g56039,36.6,0.001876,0.003892,0.005439,0.145691,0.155819,2.838133,0.223540,0.002532,0.478571,...,0.228046,0.181384,0.023611,-0.820168,-0.084619,-1072,-692700.0,5.337666,g56039,g56
g56041,34.3,0.000860,0.002197,0.000430,0.086716,0.170184,2.854463,0.212153,0.004345,0.495416,...,0.185250,0.226790,0.028142,-0.794320,-0.031039,-634,-180200.0,10.062155,g56041,g56
g56043,41.6,0.003561,0.003561,0.002849,0.136973,0.241424,2.400116,0.275582,0.006766,0.495786,...,0.167287,0.339634,0.045944,-0.790675,-0.026461,-511,-157700.0,3.763599,g56043,g56


In [None]:
################
# DROP ONE PLACE
################

# df containing county names - no need for the geometr#
# county_names=pd.DataFrame(gpd.read_file(os.path.join(s#path,'USA_Counties_500k.shp('../data/
county_names = pd.read_csv(here(path["ddpub"], "counties.csv"), dtype = {'GEOID': object})
county_names = county_names[["GEOID", "NAME"]]
county_names["GEOID"] = "g" + county_names["GEOID"]
county_names
##### State (California)
print('\nDrop One Place: State\n')
# spearman rank correlations
ca_cors=dropCors(counties,State_Sovi_Score,'g06')

# # drop run with minimum rank correlation
cad=ca_cors[ca_cors==min(ca_cors)].index.values[0]

# rank change table with minimum rank correlation
ca_rchg=rankChgTable(inputs=counties,scores=State_Sovi_Score,obs_names=county_names,subset='g06',drop=cad,cor=True,top=10)

# rank quantile moves with minimum rank correlation
ca_quint_moves=rankQuantileMoves(inputs=US_All,scores=State_Sovi_Score,subset='g06',drop=cad)

# # ##### FEMA 9: California and surrounding states (includes Hawaii)
print('Drop One Place: FEMA\n')

f9_cors=dropCors(US_All,FEMA_Region_Sovi_Score,'FEMA_9')

# obs that decreases the correlation most when dropped
f9cd=f9_cors[f9_cors==min(f9_cors)].index.values[0]

f9_rchg=rankChgTable(inputs=counties,scores=FEMA_Region_Sovi_Score,obs_names=county_names,subset='FEMA_9',drop=f9cd,cor=True,top=10)

# rank quantile moves
f9_quint_moves=rankQuantileMoves(inputs=counties,scores=FEMA_Region_Sovi_Score,subset='FEMA_9',drop=f9cd)

# ### Full USA
print('Drop One Place: USA\n')

us_cors=dropCors(counties,US_Sovi_Score)

# obs that decreases the correlation most when dropped
uscd=us_cors[us_cors==min(us_cors)].index.values[0]

us_rchg=rankChgTable(inputs=counties,scores=US_Sovi_Score,obs_names=county_names,drop=uscd,cor=True,top=10)

# rank quantile moves
us_quint_moves=rankQuantileMoves(inputs=counties,scores=US_Sovi_Score,drop=uscd)
print('\n')

# cleanup
del multi_state_data_tmp



Drop One Place: State

Spearman Rank Correlation: 0.46137 
p-value: 0.0003


Quantiles

   Interval      Count
----------------------
[ 1.00, 13.20] |    12
(13.20, 24.40] |    11
(24.40, 35.60] |    11
(35.60, 46.80] |    11
(46.80, 58.00] |    12


Drop One Place: FEMA

Spearman Rank Correlation: 0.8595 
p-value: 0.0


Quantiles

   Interval      Count
----------------------
[ 1.00, 18.60] |    18
(18.60, 37.20] |    18
(37.20, 54.80] |    17
(54.80, 72.40] |    18
(72.40, 90.00] |    18


Drop One Place: USA



In [None]:
#####################################################
# OUTPUT TABLES
#####################################################
US_Sovi_Score.to_csv( here(path["ddpub"], 'US_Sovi_Score.csv') )

# In the FEMA_Region_Sovi_Score data frame ranks are BY FEMA REGION.
# The data frame holds both the SOVI score and the county rank
# This means that there should be 10 counties with rank 1 (one for each
# FEMA Region)
FEMA_Region_Sovi_Score.to_csv( here(path["ddpub"], 'FEMA_Region_Sovi_Score.csv') )

# In the State_Sovi_Score data frame ranks are BY STATE.
# The data frame holds both the SOVI score and the county rank
# This means that there should be 10 counties with rank 1 (one for each
# state in stateList)
State_Sovi_Score.to_csv( here(path["ddpub"], 'State_Sovi_Score.csv') )

# County rank within state for US, state, and fema_region sovis
county_in_state_rank.to_csv( here(path["ddpub"], 'County_in_State_Rank.csv') )

# Variable contributions for sovis at all geographic extents
variable_contributions.to_csv( here(path["ddpub"], 'variable_contributions.csv') )

# Net contribution of variables after dropping a variable
US_Drop1_NetContrib.to_csv( here(path["ddpub"], 'US_Drop1_NetContrib_raw.csv') )

# rank of variables after dropping a variable
US_Drop1_NetContrib_ranks.to_csv( here(path["ddpub"], 'US_Drop1_NetContrib_ranks.csv') )

# Correlation of ranks
state_corrs.to_csv( here(path["ddpub"], 'state_fema_us_rank_correlations.csv') )

# Drop 1 place
ca_rchg.to_csv( here(path["ddpub"], 'drop1_place_state_rank_change_ca.csv') )
ca_quint_moves.to_csv( here(path["ddpub"], 'drop1_place_state_quint_moves_ca.csv') )
f9_rchg.to_csv( here(path["ddpub"], 'drop1_place_fema_rank_change_fema9.csv') )
f9_quint_moves.to_csv( here(path["ddpub"], 'drop1_place_fema_quint_moves_fema9.csv') )
us_rchg.to_csv( here(path["ddpub"], 'drop1_place_usa_rank_change.csv') )
us_quint_moves.to_csv( here(path["ddpub"], 'drop1_place_usa_quint_moves.csv') )


Missing:
- county in region rank
- drop 1 place cors quintiles
- spearman x3
- us drop 1 net cont csv