In [1]:
import pandas as pd

In [23]:
# ## Task 1 - NRI Data Cleaning

# __1. Import the NRI data. Ensure that the [FIPS code]
# (https://en.wikipedia.org/wiki/Federal_Information_Processing_Standard_state_code) 
# variable ('STCOFIPS') is correctly identified as a string / character variable. 
# Otherwise, the leading zeros will be removed.__

# Read STCOFIPS as text while importing so the leading zeros of the FIPS codes are kept
fips_dtypes = {'STCOFIPS': str}
NRI = pd.read_csv('NRI_Table_Counties.csv', dtype=fips_dtypes)
NRI['STCOFIPS'].unique()

array(['01001', '01003', '01005', ..., '78010', '78020', '78030'],
      dtype=object)

In [30]:
# __2. Subset the NRI data to include only the 5-digit state/county
# FIPS code and all columns ending with '_AFREQ' and '_RISKR'.
# Each of these columns represents a different hazard type.__
# Keep every hazard column whose name ends in _AFREQ or _RISKR, then append
# the 5-digit state/county FIPS code as the last column.
hazard_columns = NRI.filter(regex='(_AFREQ|_RISKR)$')
NRI_Sub = hazard_columns.assign(STCOFIPS=NRI['STCOFIPS'])

# Sanity check: number of distinct FIPS codes carried into the subset
NRI_Sub['STCOFIPS'].nunique()

3231

In [57]:
# __3. Create a table / dataframe that, for each hazard type,
# shows the number of missing values in the '_AFREQ' and '_RISKR' columns.__
# Count missing values per hazard column in a single vectorized pass.
# Only the '_AFREQ' / '_RISKR' columns are selected, so the STCOFIPS
# identifier never enters the computation and needs no filtering afterwards.
missing_long = (
    NRI_Sub.filter(regex='_(AFREQ|RISKR)$')
    .isna()
    .sum()
    .rename_axis('Column')
    .reset_index(name='Missing')
)

# Split each column name at its LAST underscore into the hazard prefix and
# the measure suffix, instead of relying on fixed character widths.
missing_long[['Hazard_Type', 'Type']] = (
    missing_long['Column'].str.rsplit('_', n=1, expand=True)
)

# Reshape to one row per hazard type and one column per measure.
# Column names are derived from the measure suffix ('Missing_AFREQ',
# 'Missing_RISKR') rather than assigned by position.
NRI_Missing = (
    missing_long
    .pivot_table(index='Hazard_Type', columns='Type', values='Missing',
                 aggfunc='sum', fill_value=0)
    .add_prefix('Missing_')
    .rename_axis(columns=None)
    .reset_index()
)

# Last expression of the cell: rendered as a rich table by the notebook
NRI_Missing

In [59]:
# __4. Show the cross-tabulation of the 'AVLN_AFREQ' and 'AVLN_RISKR' columns
# (including missing values). What do you observe?__

# Cross-tabulate avalanche annualized frequency (rows) against the avalanche
# risk rating (columns).
# NOTE(review): whether dropna=False produces a NaN row for missing
# AVLN_AFREQ values may depend on the installed pandas version — confirm
# that a NaN row is present in the full (untruncated) table.
avln_frequency = NRI_Sub['AVLN_AFREQ']
avln_rating = NRI_Sub['AVLN_RISKR']
cross_tab = pd.crosstab(index=avln_frequency, columns=avln_rating, dropna=False)
cross_tab

AVLN_RISKR,Not Applicable,Relatively High,Relatively Low,Relatively Moderate,Very High,Very Low
AVLN_AFREQ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.01,0,0,0,0,0,24
0.016667,0,0,0,0,0,55
0.033333,0,0,14,0,0,15
0.05,0,0,10,2,0,3
0.066667,0,0,12,4,0,0
0.083333,0,0,5,3,0,0
0.1,0,0,1,2,0,0
0.116667,0,0,2,4,0,0
0.133333,0,0,1,6,0,0
0.15,0,0,4,3,0,2


In [63]:
# List the distinct avalanche risk-rating labels present in the subset
NRI_Sub.loc[:, 'AVLN_RISKR'].unique()

array(['Not Applicable', 'Very Low', 'Relatively High',
       'Relatively Moderate', 'Relatively Low', 'Very High'], dtype=object)