In [106]:
import pandas as pd
import numpy as np
from sklearn.metrics import DistanceMetric

In [107]:
# Load the raw county table. skiprows=[0] discards the first line of the file,
# so the second line is the one pandas uses for the column names.
full_county_data =  pd.read_csv("county_health_data_2025.csv", skiprows=[0])

In [108]:
# Quick structural summary of the raw frame: row range, column count, dtypes, memory.
full_county_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3210 entries, 0 to 3209
Columns: 226 entries, FIPS to Unnamed: 225
dtypes: float64(217), int64(1), object(8)
memory usage: 5.5+ MB


In [109]:
# The frame has 226 columns, so print every column name (alphabetically)
# to make it easy to find the ones worth studying.
print(sorted(full_county_data.columns))

['# Associations', '# Completed High School', '# Dentists', '# Households with Broadband Access', '# Injury Deaths', '# Mental Health Providers', '# Primary Care Physicians', '# Some College', '# Unemployed', '# Uninsured', '# Workers who Drive Alone', '% Children in Poverty', '% Children in Poverty (AIAN)', '% Children in Poverty (Asian)', '% Children in Poverty (Black)', '% Children in Poverty (Hispanic)', '% Children in Poverty (White)', '% Completed High School', '% Drive Alone (AIAN)', '% Drive Alone (AIAN) 95% CI - High', '% Drive Alone (AIAN) 95% CI - Low', '% Drive Alone (Asian)', '% Drive Alone (Asian) 95% CI - High', '% Drive Alone (Asian) 95% CI - Low', '% Drive Alone (Black)', '% Drive Alone (Black) 95% CI - High', '% Drive Alone (Black) 95% CI - Low', '% Drive Alone (Hispanic)', '% Drive Alone (Hispanic) 95% CI - High', '% Drive Alone (Hispanic) 95% CI - Low', '% Drive Alone (White)', '% Drive Alone (White) 95% CI - High', '% Drive Alone (White) 95% CI - Low', '% Drive Alo

In [110]:
# Keep only the two identifier columns plus the eight percentage indicators
# used in this study. .copy() gives an independent frame, so later edits do
# not touch full_county_data.
county_data = full_county_data.loc[
    :,
    [
        "State",
        "County",
        "% Children in Poverty",
        "% Uninsured",
        "% Households with Broadband Access",
        "% Severe Housing Problems",
        "% Vaccinated",
        "% Unemployed",
        "% With Access to Exercise Opportunities",
        "% with Annual Mammogram",
    ],
].copy()

In [111]:
# Check dtypes and non-null counts of the selected columns to see how much data is missing.
county_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3210 entries, 0 to 3209
Data columns (total 10 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   State                                    3210 non-null   object 
 1   County                                   3159 non-null   object 
 2   % Children in Poverty                    3194 non-null   float64
 3   % Uninsured                              3194 non-null   float64
 4   % Households with Broadband Access       3195 non-null   float64
 5   % Severe Housing Problems                3195 non-null   float64
 6   % Vaccinated                             3176 non-null   float64
 7   % Unemployed                             3193 non-null   float64
 8   % With Access to Exercise Opportunities  3148 non-null   float64
 9   % with Annual Mammogram                  3172 non-null   float64
dtypes: float64(8), object(2)
memory usage: 250.9+ KB

In [112]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric indicator columns.
county_data.describe()

Unnamed: 0,% Children in Poverty,% Uninsured,% Households with Broadband Access,% Severe Housing Problems,% Vaccinated,% Unemployed,% With Access to Exercise Opportunities,% with Annual Mammogram
count,3194.0,3194.0,3195.0,3195.0,3176.0,3193.0,3148.0,3172.0
mean,18.981215,10.446462,84.435681,12.856338,41.30762,3.586314,62.321792,42.861917
std,8.022613,4.645711,6.558144,4.43004,10.393706,1.215637,22.873056,8.278607
min,3.0,2.0,48.0,0.0,3.0,0.3,0.0,4.0
25%,13.0,7.0,81.0,10.0,35.0,2.8,48.0,38.0
50%,18.0,9.0,85.0,12.0,42.0,3.4,65.0,43.0
75%,23.0,13.0,89.0,15.0,49.0,4.1,80.0,49.0
max,63.0,38.0,100.0,61.0,69.0,17.3,100.0,67.0


In [113]:
# Discard every row in which at least one selected column is missing,
# so each remaining row has a complete set of values.
county_data = county_data.dropna(axis=0, how="any")

In [114]:
# Label every row as "State: County" and use that label as the row index
# (the index is named "State_and_County").
county_data = county_data.set_index(
    (county_data["State"] + ": " + county_data["County"]).rename("State_and_County")
)

In [115]:
# Remove the "State" and "County" text columns so that only numeric
# indicator columns remain.
county_data = county_data.drop(columns=["State", "County"])

In [116]:
# Preview the first rows to confirm the new index and the remaining columns.
county_data.head()

Unnamed: 0_level_0,% Children in Poverty,% Uninsured,% Households with Broadband Access,% Severe Housing Problems,% Vaccinated,% Unemployed,% With Access to Exercise Opportunities,% with Annual Mammogram
State_and_County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama: Autauga,17.0,8.0,91.0,14.0,36.0,2.2,54.0,46.0
Alabama: Baldwin,14.0,10.0,90.0,12.0,44.0,2.3,62.0,45.0
Alabama: Barbour,35.0,12.0,72.0,14.0,39.0,4.4,55.0,47.0
Alabama: Bibb,21.0,11.0,81.0,12.0,31.0,2.5,43.0,36.0
Alabama: Blount,17.0,13.0,84.0,10.0,37.0,2.1,42.0,38.0


In [117]:
# L1 normalization: divide every row by the sum of its values so that each
# county's indicators add up to 1. Distances computed on the result compare
# the relative mix of indicators rather than their absolute levels.

sum_of_rows = county_data.sum(axis=1)
# A row whose values sum to zero cannot be rescaled (division by zero), so drop it.
ignore_zeroes = sum_of_rows > 0
county_data = county_data.loc[ignore_zeroes]
# The divisor must be filtered with the same mask: DataFrame.divide aligns on
# the union of index labels, so an unfiltered divisor would re-introduce the
# dropped rows as all-NaN rows.
county_data_normalized = county_data.divide(sum_of_rows.loc[ignore_zeroes], axis=0)

county_data_normalized.describe()

Unnamed: 0,% Children in Poverty,% Uninsured,% Households with Broadband Access,% Severe Housing Problems,% Vaccinated,% Unemployed,% With Access to Exercise Opportunities,% with Annual Mammogram
count,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0,3068.0
mean,0.070544,0.038495,0.308437,0.046296,0.148759,0.013179,0.218334,0.155956
std,0.033228,0.018367,0.033386,0.015134,0.030789,0.004816,0.066224,0.029071
min,0.009497,0.005954,0.184008,0.003615,0.009521,0.004441,0.0,0.012694
25%,0.046292,0.024564,0.287097,0.037082,0.131852,0.009901,0.184335,0.139255
50%,0.064813,0.034764,0.304029,0.044342,0.152981,0.012346,0.231374,0.155409
75%,0.088767,0.048479,0.32585,0.053005,0.169935,0.015366,0.266291,0.172635
max,0.273319,0.14356,0.492401,0.250245,0.23524,0.056666,0.359765,0.280269


In [118]:
def _nearest_counties(data, county_of_interest, top_n):
    """Return the ``top_n`` closest ``(label, distance)`` pairs, nearest first."""
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if county_of_interest not in data.index:
        raise KeyError(f"{county_of_interest!r} not found in the data index")

    # Selecting with a list always yields a DataFrame, which lets us detect a
    # label that occurs more than once instead of silently mis-shaping the vector.
    target_rows = data.loc[[county_of_interest]]
    if len(target_rows) != 1:
        raise ValueError(
            f"{county_of_interest!r} matches {len(target_rows)} rows; index labels must be unique"
        )

    vectors = data.to_numpy(dtype=float)
    target_vector = target_rows.to_numpy(dtype=float)  # shape (1, n_features)
    # Broadcasting subtracts the target row from every row; the row-wise root
    # of the summed squares is the Euclidean distance to the target.
    distances = np.sqrt(((vectors - target_vector) ** 2).sum(axis=1))

    # The reference row itself (distance 0) is excluded from the candidates.
    candidates = [
        (label, distance)
        for label, distance in zip(data.index.tolist(), distances)
        if label != county_of_interest
    ]
    return sorted(candidates, key=lambda pair: pair[1])[:top_n]


def calculating_euclidean(county_of_interest, data=None, top_n=10):
    """
    Print the rows closest to ``county_of_interest`` by Euclidean distance.

    Parameters
    ----------
    county_of_interest : index label of the reference row.
    data : DataFrame of numeric features indexed by label. When omitted, the
        notebook-level ``county_data_normalized`` frame is used.
    top_n : number of nearest neighbours to print (default 10).

    Raises
    ------
    KeyError   if ``county_of_interest`` is not present in the index.
    ValueError if the label occurs more than once, or ``top_n`` is negative.

    Returns
    -------
    None; the ranking is written to standard output, one "label = distance"
    line per neighbour, nearest first.
    """
    if data is None:
        data = county_data_normalized

    closest = _nearest_counties(data, county_of_interest, top_n)

    print(f"Top {top_n} Counties most similar to {county_of_interest} using Euclidean Distance")

    for county, distance in closest:
        print(f"{county} = {distance}")

In [119]:
# Uncomment the next line to list every index label, e.g. to check the exact spelling of a "State: County" entry
# print(county_data_normalized.index.tolist())

['Alabama: Autauga', 'Alabama: Baldwin', 'Alabama: Barbour', 'Alabama: Bibb', 'Alabama: Blount', 'Alabama: Bullock', 'Alabama: Butler', 'Alabama: Calhoun', 'Alabama: Chambers', 'Alabama: Cherokee', 'Alabama: Chilton', 'Alabama: Choctaw', 'Alabama: Clarke', 'Alabama: Clay', 'Alabama: Cleburne', 'Alabama: Coffee', 'Alabama: Colbert', 'Alabama: Conecuh', 'Alabama: Coosa', 'Alabama: Covington', 'Alabama: Crenshaw', 'Alabama: Cullman', 'Alabama: Dale', 'Alabama: Dallas', 'Alabama: DeKalb', 'Alabama: Elmore', 'Alabama: Escambia', 'Alabama: Etowah', 'Alabama: Fayette', 'Alabama: Franklin', 'Alabama: Geneva', 'Alabama: Greene', 'Alabama: Hale', 'Alabama: Henry', 'Alabama: Houston', 'Alabama: Jackson', 'Alabama: Jefferson', 'Alabama: Lamar', 'Alabama: Lauderdale', 'Alabama: Lawrence', 'Alabama: Lee', 'Alabama: Limestone', 'Alabama: Lowndes', 'Alabama: Macon', 'Alabama: Madison', 'Alabama: Marengo', 'Alabama: Marion', 'Alabama: Marshall', 'Alabama: Mobile', 'Alabama: Monroe', 'Alabama: Montgomer

In [120]:
# Nearest neighbours of Fresno County, California in the normalized indicator space.
calculating_euclidean('California: Fresno')


Top 10 Counties most similar to California: Fresno using Euclidean Distance
California: Monterey = 0.02068469740132478
California: Merced = 0.02094067710710607
New Jersey: Cumberland = 0.022723714473926144
California: Sutter = 0.02804628238862843
Arizona: Coconino = 0.028173760423219677
California: Kern = 0.029495095797819924
New Jersey: Passaic = 0.030824014162568015
California: Butte = 0.03136914800515876
California: Tulare = 0.03156534887339214
Oregon: Klamath = 0.03159651908941623


In [121]:
# Nearest neighbours of the "Louisiana: Franklin" row in the normalized indicator space.
calculating_euclidean("Louisiana: Franklin")

Top 10 Counties most similar to Louisiana: Franklin using Euclidean Distance
Alabama: Macon = 0.02737831681220443
Arkansas: Lee = 0.03170883406323991
Mississippi: Sunflower = 0.03418684183045277
Georgia: Ben Hill = 0.03516796394626635
South Carolina: Colleton = 0.04264116320586921
Florida: Dixie = 0.042725153992902665
Florida: Jefferson = 0.043422080364152536
Alabama: Dallas = 0.0441174573766443
Arkansas: Woodruff = 0.045461651966622885
Arkansas: Phillips = 0.045716147109737953


In [122]:
# Nearest neighbours of Robeson County, North Carolina in the normalized indicator space.
calculating_euclidean("North Carolina: Robeson")

Top 10 Counties most similar to North Carolina: Robeson using Euclidean Distance
Georgia: Brooks = 0.02413971770063914
Arkansas: Chicot = 0.02673647701559424
Georgia: Crisp = 0.029370109041347705
Georgia: Macon = 0.03006726192973584
South Carolina: Marion = 0.03119882761231191
Louisiana: West Carroll = 0.03227996357404534
South Carolina: Marlboro = 0.03290037042158673
Georgia: Ben Hill = 0.0356251605989668
Louisiana: Avoyelles = 0.03608188608220356
Georgia: Dooly = 0.038180409914752626
