In [1]:
import pandas as pd

## Task 1 - NRI Data Cleaning

In [None]:
# __1. Import the NRI data. Ensure that the [FIPS code]
# (https://en.wikipedia.org/wiki/Federal_Information_Processing_Standard_state_code) 
# variable ('STCOFIPS') is correctly identified as a string / character variable. 
# Otherwise, the leading zeros will be removed.__

# Read STCOFIPS as text at load time so the leading zeros of the FIPS codes are kept
NRI = pd.read_csv(
    'NRI_Table_Counties.csv',
    dtype=dict(STCOFIPS=str),
)
# Show the distinct codes that were loaded
NRI['STCOFIPS'].unique()

In [None]:
# __2. Subset the NRI data to include only the 5-digit state/county
# FIPS code and all columns ending with '\_AFREQ' and '\_RISKR'.
# Each of these columns represents a different hazard type.__
# Keep every column whose name ends in _AFREQ or _RISKR, then append the
# 5-digit state/county FIPS code as the last column.
NRI_Sub = (
    NRI
    .filter(regex='(_AFREQ|_RISKR)$')
    .assign(STCOFIPS=NRI['STCOFIPS'])
)

# Number of distinct FIPS codes carried into the subset
NRI_Sub['STCOFIPS'].nunique()

In [57]:
# __3. Create a table / dataframe that, for each hazard type, 
# shows the number of missing values in the '\_AFREQ' and '\_RISKR' columns.
# Count missing values per hazard column. The FIPS identifier is dropped up
# front because it is not a hazard column.
missing_counts = NRI_Sub.drop(columns='STCOFIPS').isnull().sum()

# Column names have the form "<HAZARD>_<TYPE>" (e.g. "AVLN_AFREQ"). Splitting on
# the last underscore avoids relying on fixed character widths.
name_parts = missing_counts.index.to_series().str.rsplit('_', n=1, expand=True)
long_table = pd.DataFrame({
    'Hazard_Type': name_parts[0],
    'Type': 'Missing_' + name_parts[1],
    'Value': missing_counts,
})

# One row per hazard, one column per measure. The output column names are built
# from the data ("Missing_AFREQ", "Missing_RISKR") rather than assigned by
# position, so they cannot be mislabelled if the column order ever changes.
NRI_Missing = (
    long_table
    .pivot(index='Hazard_Type', columns='Type', values='Value')
    .fillna(0)            # a hazard lacking one of the two measures counts as 0
    .astype('int64')
    .rename_axis(columns=None)
    .reset_index()
)
NRI_Missing

In [None]:
# __4. Show the cross-tabulation of the 'AVLN_AFREQ' and 'AVLN_RISKR' columns
# (including missing values). What do you observe?__

# Inspect the rating categories first. This is printed explicitly because a
# notebook cell only auto-displays its final expression; a bare expression in
# the middle of the cell is evaluated and then discarded.
print(NRI_Sub['AVLN_RISKR'].unique())

# Cross-tabulate annualized frequency against the risk rating, asking pandas
# to keep missing values via dropna=False.
# NOTE(review): how dropna=False treats NaN keys has differed between pandas
# releases — confirm that a NaN row actually appears in the rendered table.
cross_tab = pd.crosstab(NRI_Sub['AVLN_AFREQ'], NRI_Sub['AVLN_RISKR'], dropna=False)
cross_tab
## Findings: As the AVLN_AFREQ frequency increases, the relative risk rating also increases.
## NOTE(review): the question asks specifically about missing values; consider also stating
## which AVLN_RISKR category the rows with a missing AVLN_AFREQ fall into.

In [67]:
# __5. Assuming that a risk that is "not applicable" to a county has an annualized frequency of 0, 
# impute the relevant missing values in the '\_AFREQ' columns with 0.
# The stated assumption only covers hazards that are "not applicable" to a
# county, so a missing frequency is set to 0 only where the matching rating
# column says so. Any other gap stays NaN instead of being turned into a
# real-looking zero.
# NOTE(review): assumes the rating label is spelled exactly 'Not Applicable' —
# confirm against NRI_Sub['AVLN_RISKR'].unique().
AFREQ_col = NRI_Sub.filter(regex='_AFREQ$')
for afreq_name in AFREQ_col.columns:
    riskr_name = afreq_name[:-len('_AFREQ')] + '_RISKR'
    if riskr_name not in NRI_Sub.columns:
        # No rating column to consult, so "not applicable" cannot be established.
        continue
    not_applicable_gap = (
        NRI_Sub[afreq_name].isna()
        & NRI_Sub[riskr_name].eq('Not Applicable')
    )
    NRI_Sub.loc[not_applicable_gap, afreq_name] = 0

# Preview the result rather than rendering the whole table
NRI_Sub.head()

## Task 2 - SVI Data Cleaning

In [None]:
# __1. Import the SVI data. Ensure that the FIPS code is correctly identified as a string / character variable.
#  Otherwise, the leading zeros will be removed.__

# FIPS is read as text so its leading zeros are kept
SVI = pd.read_csv(
    'SVI_2022_US_county.csv',
    dtype=dict(FIPS=str),
)

# __1. Subset the SVI data to include only the following columns:__
# `ST, STATE, ST_ABBR, STCNTY, COUNTY, FIPS, LOCATION, AREA_SQMI, E_TOTPOP, EP_POV150, EP_UNEMP, EP_HBURD, EP_NOHSDP, 
# EP_UNINSUR, EP_AGE65, EP_AGE17, EP_DISABL, EP_SNGPNT, EP_LIMENG, EP_MINRTY, EP_MUNIT, EP_MOBILE, EP_CROWD, EP_NOVEH, 
# EP_GROUPQ, EP_NOINT, EP_AFAM, EP_HISP, EP_ASIAN, EP_AIAN, EP_NHPI, EP_TWOMORE, EP_OTHERRACE`

# Columns requested by the task, in the order given there
columns_to_use = [
    # identifiers and geography
    'ST', 'STATE', 'ST_ABBR', 'STCNTY', 'COUNTY', 'FIPS', 'LOCATION',
    # size
    'AREA_SQMI', 'E_TOTPOP',
    # EP_* measures
    'EP_POV150', 'EP_UNEMP', 'EP_HBURD', 'EP_NOHSDP', 'EP_UNINSUR',
    'EP_AGE65', 'EP_AGE17', 'EP_DISABL', 'EP_SNGPNT', 'EP_LIMENG',
    'EP_MINRTY', 'EP_MUNIT', 'EP_MOBILE', 'EP_CROWD', 'EP_NOVEH',
    'EP_GROUPQ', 'EP_NOINT', 'EP_AFAM', 'EP_HISP', 'EP_ASIAN',
    'EP_AIAN', 'EP_NHPI', 'EP_TWOMORE', 'EP_OTHERRACE',
]
SVI_Sub = SVI.loc[:, columns_to_use]
print(SVI_Sub.columns)

In [None]:
# __2. Create a table / dataframe that shows the number of missing values in each column.
# (Hint: if you wrote a function for Task 1, you can reuse it here.)

# Function to calculate the number of missing values in each column
def missing_val(table):
    """Return a one-column frame with the number of null entries per column.

    The result is indexed by the column labels of ``table`` and has a single
    column named 'Missing_Val'.
    """
    return table.isna().sum().to_frame(name='Missing_Val')

# Null counts for every column of the SVI subset.
# NOTE(review): the CDC/ATSDR SVI files reportedly encode "no data" as -999
# rather than leaving the cell empty, in which case isnull() alone would report
# zero missing values. The sentinel is counted in a second column so both kinds
# of gap are visible — confirm the code against the SVI 2022 documentation.
SVI_Missing = missing_val(SVI_Sub).assign(
    Sentinel_Neg999=SVI_Sub.eq(-999).sum()
)

# Cross-check the helper: columns that pandas itself flags as containing nulls.
# Printed explicitly because only the last expression of a cell is displayed.
print(SVI_Sub.columns[SVI_Sub.isnull().any()])

# Display the resulting table
SVI_Missing

## Task 3 - Data Merging

In [None]:
# __1. Identify any FIPS codes that are present in the NRI data but not in the SVI data and vice versa.
# Describe any discrepancies and possible causes. What do these discrepancies, if any,
# mean for interpreting results based on the merged dataset moving forward?__

# County identifiers from each dataset
NRI_FIPS = NRI['STCOFIPS']
SVI_FIPS = SVI['FIPS']

# Codes that appear in one dataset but not in the other
FIPS_NRI_NotSVI = NRI_FIPS[~NRI_FIPS.isin(SVI_FIPS)]  # in NRI, not in SVI
FIPS_SVI_NotNRI = SVI_FIPS[~SVI_FIPS.isin(NRI_FIPS)]  # in SVI, not in NRI

# States/territories that the NRI-only codes belong to. This is printed
# explicitly: a bare expression that is not the last line of the cell is
# evaluated and discarded, so the geographies would never be shown.
FIPS_NRI_NotSVI_check = FIPS_NRI_NotSVI.to_list()
print(NRI.loc[NRI['STCOFIPS'].isin(FIPS_NRI_NotSVI_check), 'STATE'].unique())

#############################
## Answer: One state (Connecticut) and several territories such as American Samoa and Guam are not included in the SVI dataset. According to the NRI website,
## the NRI dataset includes American Samoa, the Commonwealth of the Northern Mariana Islands, Guam, Puerto Rico, and the U.S. Virgin Islands. The Connecticut counties missing from SVI are
## the traditional county names seen in other datasets. However, the Connecticut geographies in the SVI dataset are specified as planning regions (https://storymaps.arcgis.com/stories/23bc7986213547a79cb8a5dafa84d68d).
## It appears that the Census Bureau now treats the planning regions in CT as the official county equivalents, because of the change made by the state of Connecticut.

# List the SVI rows whose FIPS code has no counterpart in NRI
FIPS_SVI_NotNRI_check = FIPS_SVI_NotNRI.to_list()
SVI.loc[SVI['FIPS'].isin(FIPS_SVI_NotNRI_check)]

#############################
## Answer: NRI does not contain the special county-equivalent geographies (planning regions) that appear for Connecticut in the table. As stated above, SVI uses planning regions as the county equivalent,
## as opposed to NRI, which uses the traditional county definition.

#############################
## Answer: The results may differ because the SVI data better reflect the Census Bureau's geographic definitions and the year in which the data were collected. However, for the state of Connecticut,
## we might need to do some geographic analysis to map the traditional counties to the planning regions in order to generalize the analysis. Otherwise, the CT analysis will contain discrepancies, since the planning regions
## seem to better reflect the development and demographics of the state's population than the traditional county definition does.

In [None]:
# __2. Merge the NRI and SVI data on the FIPS code. Use an outer join to keep all counties in the final dataset.
# The outer join keeps every county found in either table.
# NOTE(review): this joins the full NRI and SVI tables rather than the
# NRI_Sub / SVI_Sub subsets prepared earlier — confirm that is intended.
Merge_SVI_NRI = NRI.merge(
    SVI,
    how='outer',
    left_on='STCOFIPS',
    right_on='FIPS',
)

In [None]:
# __3. Create a table / dataframe that shows the number of missing values in each column of the merged dataset.
# Reuse the missing-value helper on the merged table
Merge_SVI_NRI_missing = Merge_SVI_NRI.pipe(missing_val)
Merge_SVI_NRI_missing

## Task 4 - Data Analysis

In [None]:
# __1. For each numerical variable in the merged dataset, plot a histogram showing the distribution of values.
# (Hint: write a function to make the histogram for a single variable, then use a loop or apply function to make the histograms for all numerical variables.)
import matplotlib.pyplot as plt

def his_SVI_NRI(df, columns):
    """Draw a 20-bin histogram of a single column, ignoring missing values.

    Parameters
    ----------
    df : table to read from; indexed with ``df[columns]``.
    columns : label of the one column to plot (singular, despite the name).
        It is also used for the title and the x-axis label.

    Returns
    -------
    None. The figure is shown and then closed.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(df[columns].dropna(), bins=20, edgecolor='black')
    ax.set_title(f'Stats of the {columns}')
    ax.set_xlabel(columns)
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()
    # Release the figure so that calling this in a loop over many columns does
    # not keep every figure open.
    plt.close(fig)

# 'number' selects every numeric dtype, not only the 64-bit float/int ones,
# so no numeric column is skipped because of its storage width.
numerical_col = Merge_SVI_NRI.select_dtypes(include='number').columns

# One histogram per numeric column
for col in numerical_col:
    his_SVI_NRI(Merge_SVI_NRI, col)

In [133]:
from nbconvert import HTMLExporter
import nbformat

# Notebook to export. A relative path is used, like the relative output file
# below, instead of an absolute user-specific path that only exists on one machine.
# NOTE(review): assumes the kernel's working directory is the folder that
# contains HW_02.ipynb — confirm before running.
notebook_filename = "HW_02.ipynb"

# Read the notebook content
with open(notebook_filename, "r", encoding="utf-8") as f:
    notebook_content = nbformat.read(f, as_version=4)

# Create an HTML exporter
html_exporter = HTMLExporter()
html_exporter.exclude_input = False  # Include code inputs in the exported HTML

# Export the notebook to HTML format
(body, resources) = html_exporter.from_notebook_node(notebook_content)

# Define the output HTML filename
output_html_filename = "HW_02.html"

# Save the HTML output to a file
with open(output_html_filename, "w", encoding="utf-8") as f:
    f.write(body)