In [118]:
#Import dependencies
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Clustering of US demographic data

In [119]:
#Import US Population Data
#The workbook path is relative to the notebook's working directory
us_demo_df = pd.read_excel('../US_Population_Data/US_Population_Data_Summary.xlsx')

In [120]:
#Preview the first rows of the loaded data
us_demo_df.head()

Unnamed: 0,State ID,State,Total population,Male population,Female Population,Male median age,Female median age,Median age (years),Sex ratio (males per 100 females),Population 16 years and over with earnings,Median earnings (dollars)
0,ST14,Alabama,4893186,2365734,2527452,37.7,40.6,39.2,93.6,2358550,32080
1,ST16,Alaska,736990,384653,352337,34.1,35.2,34.6,109.2,412342,40471
2,ST26,Arizona,7174064,3564979,3609085,36.7,39.3,37.9,98.8,3577957,34302
3,ST34,Arkansas,3011873,1478511,1533362,36.9,39.7,38.3,96.4,1455764,31343
4,ST3,California,39346023,19562882,19783141,35.6,37.9,36.7,98.9,20593361,38176


In [121]:
#Drop the text 'State ID' column and use the state name as the row index,
#leaving only numeric feature columns for clustering
us_demo_df_numeric = us_demo_df.drop(columns='State ID').set_index('State')

## K-means clustering

### Create elbow plot to determine optimal number of clusters

In [122]:
#Store values of K to plot
#inertia collects one value per fitted model; k holds the candidate cluster counts 1 through 10
inertia = []
k = list(range(1, 11))

In [123]:
#Loop through the different values of k
#Start from an empty list so re-running this cell cannot append a second
#set of values (which would make inertia longer than k)
inertia = []
for i in k:
    #Fixed random_state keeps the fitted clusters reproducible between runs
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(us_demo_df_numeric)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [124]:
#Pair each k with its inertia in a DataFrame and draw the Elbow Curve with hvPlot
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

### Create function to create clusters and plot them

In [125]:
def get_clusters(k, data):
    """Cluster the rows of ``data`` with K-Means and label each row.

    Parameters
    ----------
    k : int
        Number of clusters to form.
    data : pandas.DataFrame
        Feature table; every column is passed to K-Means. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with an added ``'class'`` column holding the
        cluster label assigned to each row.
    """
    #Work on a copy so the caller's DataFrame is left untouched
    clustered = data.copy()

    #Initialize the K-Means model; the fixed seed keeps runs reproducible
    model = KMeans(n_clusters=k, random_state=0)

    #Fit once and store each row's cluster label (no separate predict pass needed)
    clustered['class'] = model.fit_predict(clustered)

    return clustered

### Plot clusters for k=3

In [126]:
#Get the clusters for k=3
three_clusters = get_clusters(3, us_demo_df_numeric)

#Plot the clusters
#Median earnings against total population, with points grouped by cluster label ('class')
three_clusters.hvplot.scatter(x='Total population', y='Median earnings (dollars)', by='class')

In [127]:
#Draw the k=3 clusters as a 3D scatter of population, earnings and age
fig = px.scatter_3d(
    data_frame=three_clusters,
    x='Total population',
    y='Median earnings (dollars)',
    z='Median age (years)',
    color='class',
    symbol='class',
    width=800,
)

#Place the legend at x=0, y=1, then render the figure
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

### Plot clusters for k=4

In [128]:
#Get the clusters for k=4
four_clusters = get_clusters(4, us_demo_df_numeric)

#Plot the clusters
#Median earnings against total population, with points grouped by cluster label ('class')
four_clusters.hvplot.scatter(x='Total population', y='Median earnings (dollars)', by='class')

In [129]:
#Draw the k=4 clusters as a 3D scatter of population, earnings and age
fig = px.scatter_3d(
    data_frame=four_clusters,
    x='Total population',
    y='Median earnings (dollars)',
    z='Median age (years)',
    color='class',
    symbol='class',
    width=800,
)

#Place the legend at x=0, y=1, then render the figure
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

### Plot clusters for k=5

In [130]:
#Get the clusters for k=5
five_clusters = get_clusters(5, us_demo_df_numeric)

#Plot the clusters
#Median earnings against total population, with points grouped by cluster label ('class')
five_clusters.hvplot.scatter(x='Total population', y='Median earnings (dollars)', by='class')

In [131]:
#Draw the k=5 clusters as a 3D scatter of population, earnings and age
fig = px.scatter_3d(
    data_frame=five_clusters,
    x='Total population',
    y='Median earnings (dollars)',
    z='Median age (years)',
    color='class',
    symbol='class',
    width=800,
)

#Place the legend at x=0, y=1, then render the figure
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

### Analysis:

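#### Different k values seem to divide the data into groups depending almost entirely on population size alone. This is expected: the features were not scaled before clustering, so the population columns (with values in the millions) dominate the distances that k-means minimizes. The following Principal Component Analysis (PCA), applied to standardized data, will hopefully provide a better way to divide the data into groups. However, the large difference in population size between the states may still have a large influence on the clustering.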

## Principal Component Analysis (PCA)

In [132]:
#Standardize the data with StandardScaler()
#Scaling puts every column on a comparable scale so large-valued columns such as population do not dominate
us_demo_df_numeric_scaled = StandardScaler().fit_transform(us_demo_df_numeric)

In [133]:
#Initialize a PCA model that keeps three principal components
pca = PCA(n_components=3)

In [134]:
#Get three principal components for the standardized US demographic data
us_demo_df_pca = pca.fit_transform(us_demo_df_numeric_scaled)

In [135]:
#Wrap the PCA output in a DataFrame that reuses the state index of the numeric data
df_us_demo_pca = pd.DataFrame(
    us_demo_df_pca,
    index=us_demo_df_numeric.index,
    columns=["PC 1", "PC 2", "PC 3"],
)
df_us_demo_pca.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,-0.547122,-0.670165,-1.24854
Alaska,-0.86356,4.331625,1.953608
Arizona,0.238754,0.576061,-0.353061
Arkansas,-0.922749,0.374945,-1.20437
California,9.064634,-0.638118,0.193311


In [136]:
#Fetch the fraction of the total variance explained by each principal component
pca.explained_variance_ratio_

array([0.4613438 , 0.35897516, 0.11088435])

In [137]:
#The above output shows that the first PC explains about 46% of the variance, the second PC about 36%, and the third PC about 11%
#Together, they retain about 93% of the information

### Create elbow plot to determine optimal number of clusters

In [138]:
#Store values of K to plot
#inertia collects one value per fitted model; k holds the candidate cluster counts 1 through 10
inertia = []
k = list(range(1, 11))

In [139]:
#Loop through the different values of k
#Start from an empty list so re-running this cell cannot append a second
#set of values (which would make inertia longer than k)
inertia = []
for i in k:
    #Fixed random_state keeps the fitted clusters reproducible between runs
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_us_demo_pca)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [140]:
#Pair each k with its inertia in a DataFrame and draw the Elbow Curve with hvPlot
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

### Plot clusters for k=3

In [141]:
#Cluster the principal components into three groups
pca_three_clusters = get_clusters(3, df_us_demo_pca)

#Draw the clusters in the space of the three principal components
fig = px.scatter_3d(
    data_frame=pca_three_clusters,
    x='PC 1',
    y='PC 2',
    z='PC 3',
    color='class',
    symbol='class',
    width=800,
)

#Place the legend at x=0, y=1, then render the figure
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

### Plot clusters for k=5

In [147]:
#Cluster the principal components into five groups
pca_five_clusters = get_clusters(5, df_us_demo_pca)

#Draw the clusters in the space of the three principal components;
#partial opacity keeps overlapping markers visible
fig = px.scatter_3d(
    data_frame=pca_five_clusters,
    x='PC 1',
    y='PC 2',
    z='PC 3',
    color='class',
    symbol='class',
    opacity=0.7,
    width=800,
)

#Place the legend at x=0, y=1, then render the figure
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

In [None]:
#Merge the PCA DataFrame with the original DataFrame
#us_demo_df keeps its default integer index while pca_five_clusters is indexed by
#state name, so match the 'State' column against that index. A column-wise
#pd.concat would align on the mismatched indexes and fill the result with NaN.
pca_merged = us_demo_df.merge(pca_five_clusters, left_on='State', right_index=True)

In [None]:
#Plot the merged data on the principal components to check the merge;
#hovering over a marker shows the state it belongs to
fig = px.scatter_3d(
    data_frame=pca_merged,
    x='PC 1',
    y='PC 2',
    z='PC 3',
    color='class',
    symbol='class',
    opacity=0.7,
    hover_name='State',
    width=800,
)

#Place the legend at x=0, y=1, then render the figure
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

### Analysis:

#### I am satisfied that clustering on the standardized principal components has provided a better way to divide the data into groups. The clusters are more evenly distributed and the groups are more distinct. The large difference in population size between the states still appears to affect the clustering, but its influence is smaller than it was when k-means was applied to the raw, unscaled data. I believe that either k=3 or k=5 would be a good choice for the number of clusters when running k-means on the principal components, depending on the level of detail desired.