In [5]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

### K-Means Cluster Analysis of Healthcare Demand

In [26]:
# Load the health-indicator table (keyed by LocationID) and the shortage-tract geometries.
places_df = pd.read_csv('./data/places_by_location_2023.csv')
shortage_tracts_gdf = gpd.read_file('./data/shortage_tracts_gdf_1pcp220.geojson')

# Conversion factor applied to ALAND-based densities.
# NOTE(review): assumes ALAND is land area in square metres -- confirm against the data source.
M2_PER_KM2 = 1_000_000

# Keep only the tracts (rows) that have unserved population in at least one payer group.
# .copy() makes the filtered result an independent frame, so the column assignments
# below write to it directly instead of to a slice (avoids SettingWithCopyWarning).
has_unserved = (shortage_tracts_gdf['unserved_medicaid'] > 0) | (shortage_tracts_gdf['unserved_commercial'] > 0)
shortage_tracts_gdf = shortage_tracts_gdf[has_unserved].copy()

# Add unserved density columns (unserved people per km2 of land area, 1 decimal place).
shortage_tracts_gdf['unserved_medicaid/km2'] = (
    shortage_tracts_gdf['unserved_medicaid'] / shortage_tracts_gdf['ALAND'] * M2_PER_KM2
).round(1)
shortage_tracts_gdf['unserved_commercial/km2'] = (
    shortage_tracts_gdf['unserved_commercial'] / shortage_tracts_gdf['ALAND'] * M2_PER_KM2
).round(1)

# Trim to the identifier, the geometry and the four unserved measures.
columns_to_keep = ['GEOID', 'geometry', 'unserved_medicaid', 'unserved_medicaid/km2', 'unserved_commercial', 'unserved_commercial/km2']
shortage_tracts_gdf = shortage_tracts_gdf[columns_to_keep]

print(shortage_tracts_gdf.shape)
display(shortage_tracts_gdf)
print(places_df.shape)
display(places_df)

(1171, 6)


Unnamed: 0,GEOID,geometry,unserved_medicaid,unserved_medicaid/km2,unserved_commercial,unserved_commercial/km2
14,36067015701,"POLYGON ((641562.710 1063730.806, 641666.321 1...",0,0.0,1895,21.9
52,36037950302,"POLYGON ((-2528.505 1145557.896, -1513.412 114...",0,0.0,1790,12.0
53,36037950301,"POLYGON ((-3571.220 1085257.391, -3558.696 108...",0,0.0,834,12.3
85,36075020706,"POLYGON ((612722.314 1194515.027, 612880.066 1...",29,4.7,0,0.0
87,36075020502,"POLYGON ((614092.534 1234876.130, 614481.509 1...",341,2.5,297,2.2
...,...,...,...,...,...,...
3518,36067000600,"POLYGON ((613288.484 1117850.742, 613405.775 1...",1998,3238.1,0,0.0
3519,36067011300,"POLYGON ((580008.418 1164822.985, 580016.609 1...",176,3.1,2612,46.2
3525,36067003900,"POLYGON ((609093.351 1105502.779, 609141.461 1...",2139,2319.4,0,0.0
3527,36067012800,"POLYGON ((581048.316 1138540.890, 581221.121 1...",271,20.5,0,0.0


(3427, 7)


Unnamed: 0,LocationID,Cholesterol screening among adults aged >=18 years,Current lack of health insurance among adults aged 18-64 years,Taking medicine for high blood pressure control among adults aged >=18 years with high blood pressure,Visits to dentist or dental clinic among adults aged >=18 years,Visits to doctor for routine checkup within the past year among adults aged >=18 years,Physical health not good for >=14 days among adults aged >=18 years
0,36001000100,83.3,10.1,76.3,50.6,79.5,13.3
1,36001000200,81.0,11.1,74.6,46.9,79.5,13.3
2,36001000300,83.5,8.2,77.4,55.8,78.6,12.2
3,36001000401,92.8,4.2,86.6,71.7,84.9,10.9
4,36001000403,86.0,4.4,74.9,70.3,76.5,8.0
...,...,...,...,...,...,...,...
3422,36119981000,70.7,16.4,61.0,45.0,68.9,11.7
3423,36119982000,78.8,19.0,63.4,46.1,68.7,11.5
3424,36119983000,83.6,11.0,65.1,59.1,74.6,11.0
3425,36119984000,96.4,5.9,92.2,64.8,89.6,17.1


In [35]:
# Join datasets for clustering analysis.
# A left join keeps every shortage tract; tracts with no matching LocationID in
# places_df end up with NaN in all of the health-indicator columns.
merged_gdf = shortage_tracts_gdf.merge(places_df, how='left', left_on='GEOID', right_on='LocationID')

# Features used for clustering: the two unserved densities plus five health indicators.
# NOTE(review): 'Current lack of health insurance among adults aged 18-64 years' exists
# in places_df but is not selected here -- confirm that the omission is intentional.
feature_columns = [
    'unserved_medicaid/km2',
    'unserved_commercial/km2',
    'Cholesterol screening among adults aged >=18 years',
    'Taking medicine for high blood pressure control among adults aged >=18 years with high blood pressure',
    'Visits to dentist or dental clinic among adults aged >=18 years',
    'Visits to doctor for routine checkup within the past year among adults aged >=18 years',
    'Physical health not good for >=14 days among adults aged >=18 years',
]

# K-means cannot be fitted on rows containing NaN, so drop incomplete rows explicitly
# and report how many were lost. The index of merged_gdf is preserved, so cluster
# labels can later be mapped back to tracts via cluster_df.index.
cluster_df = merged_gdf[feature_columns].dropna()
n_dropped = len(merged_gdf) - len(cluster_df)
if n_dropped:
    print(f'Dropped {n_dropped} of {len(merged_gdf)} tracts with missing feature values')

cluster_df.to_csv('cluster_demand.csv', index=False)

In [30]:
# Scale our features data to range between 0 and 1 so that no single feature
# dominates the distance calculation purely because of its units.
# This cell needs cluster_df, so the join cell above must have been run first.
# (MinMaxScaler is imported in the import cell at the top of the notebook.)
scaler = MinMaxScaler()
# Transform the data; note that fit_transform returns a numpy array, not a DataFrame
X = scaler.fit_transform(cluster_df)
print(type(X))
# Re-attach the column names and the original row index for display,
# so each scaled row can still be traced back to its row in cluster_df.
display(pd.DataFrame(X, columns=cluster_df.columns, index=cluster_df.index))

NameError: name 'cluster_df' is not defined

In [31]:
# Apply the elbow method to consider appropriate cluster size:
# fit K-means for k = 1..10 and plot the inertia of each fit against k.
# random_state is fixed (same value as the final model) so that the curve is
# reproducible from one run of the notebook to the next.
# (KMeans is imported in the import cell at the top of the notebook.)
k_values = range(1, 11)
inertias = []
for i in k_values:
    kmeans_elbow = KMeans(n_clusters=i, random_state=0, n_init="auto")
    kmeans_elbow.fit(X)
    inertias.append(kmeans_elbow.inertia_)
plt.plot(k_values, inertias, marker='o')
plt.title('Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

NameError: name 'X' is not defined

In [32]:
# Conduct the K-means analysis
# First defining a variable to control number of k categories
n_k = 4
# random_state is fixed so the cluster assignment is reproducible.
# (KMeans is imported in the import cell at the top of the notebook.)
kmeans_model = KMeans(n_clusters=n_k, random_state=0, n_init="auto").fit(X)
# Display our cluster center means, one row per cluster.
# The model was fitted on X, so these values are in the 0-1 scaled units,
# not in the original units of cluster_df.
display(
    pd.DataFrame(
        np.round(kmeans_model.cluster_centers_, decimals=4),
        columns=cluster_df.columns,
    ).rename_axis('cluster')
)

NameError: name 'X' is not defined

In [33]:
# Visualize the clusters using a pair of features
# The model is already fitted on X, so reuse its training labels instead of
# calling fit_predict(X), which would run the whole fit a second time.
y_label = kmeans_model.labels_
print(y_label)

# Plot one pair of features, one scatter series per cluster.
# Change columnx / columny (positions in cluster_df.columns) to view another pair.
columnx = 0
columny = 1
fig, ax = plt.subplots()
for i in range(n_k):
    in_cluster = y_label == i
    ax.scatter(X[in_cluster, columnx], X[in_cluster, columny], s=2, label='Cluster ' + str(i))

ax.set_xlabel(cluster_df.columns[columnx])
ax.set_ylabel(cluster_df.columns[columny])
ax.legend(loc="lower right")
plt.show()

NameError: name 'kmeans_model' is not defined