In [139]:
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

# IMPORT, DROP NULL/DUPLICATE VALUES

In [140]:
# Read the master dataset from a CSV located relative to the working directory.
file_path = "Master_data.csv"
M_df = pd.read_csv(file_path)
# Preview the first 100 rows as a sanity check on the load.
M_df.head(100)

Unnamed: 0,county,year,totalpopulation,unincorporatedpop,incorporatedpop,average_yearly_wage,lowtiermedianhomevalue,hightiermedianhomevalue,singlefamilymedianhomevalue,value_inches
0,Alachua,2019,267306,106299,161007,49116,123073.0,314621.0,208584.0,52.82
1,Alachua,2018,263291,104904,158387,47858,114833.0,300367.0,197879.0,66.46
2,Alachua,2017,260003,103810,156193,45705,107574.0,288320.0,187389.0,63.47
3,Alachua,2016,257062,102298,154764,44257,98319.0,268422.0,173570.0,46.39
4,Alachua,2015,254893,101621,153272,43598,92215.0,256906.0,164952.0,52.27
...,...,...,...,...,...,...,...,...,...,...
95,Brevard,2004,521422,208239,313183,37912,106765.0,295853.0,173318.0,54.77
96,Brevard,2003,507810,204689,303121,35280,88276.0,240490.0,143272.0,46.73
97,Brevard,2002,494102,198121,295981,33913,78598.0,213713.0,127895.0,56.31
98,Brevard,2001,485178,193121,292057,32798,71607.0,194480.0,117156.0,57.03


In [141]:
# Inspect the dtype pandas inferred for each column.
M_df.dtypes

county                          object
year                             int64
totalpopulation                  int64
unincorporatedpop                int64
incorporatedpop                  int64
average_yearly_wage              int64
lowtiermedianhomevalue         float64
hightiermedianhomevalue        float64
singlefamilymedianhomevalue    float64
value_inches                   float64
dtype: object

In [142]:
# Non-null count per column; a count lower than the others signals missing values.
M_df.count()

county                         1340
year                           1340
totalpopulation                1340
unincorporatedpop              1340
incorporatedpop                1340
average_yearly_wage            1340
lowtiermedianhomevalue         1304
hightiermedianhomevalue        1304
singlefamilymedianhomevalue    1304
value_inches                   1340
dtype: int64

In [143]:
# Tally missing values once for the whole frame, then report them column by column.
null_counts = M_df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column county has 0 null values
Column year has 0 null values
Column totalpopulation has 0 null values
Column unincorporatedpop has 0 null values
Column incorporatedpop has 0 null values
Column average_yearly_wage has 0 null values
Column lowtiermedianhomevalue has 36 null values
Column hightiermedianhomevalue has 36 null values
Column singlefamilymedianhomevalue has 36 null values
Column value_inches has 0 null values


In [144]:
# Drop every row that contains a null in any column (dropna defaults).
# The surviving rows keep their original index labels.
M_df = M_df.dropna()

In [145]:
# Re-check the non-null counts after dropping incomplete rows.
M_df.count()

county                         1304
year                           1304
totalpopulation                1304
unincorporatedpop              1304
incorporatedpop                1304
average_yearly_wage            1304
lowtiermedianhomevalue         1304
hightiermedianhomevalue        1304
singlefamilymedianhomevalue    1304
value_inches                   1304
dtype: int64

In [146]:
# Count rows that are exact repeats of an earlier row (all columns equal).
print(f"Duplicate entries: {M_df.duplicated().sum()}")

Duplicate entries: 0


# CREATE DF FROM IDENTIFYING COLUMNS

In [147]:
# Keep only the identifying columns. Selecting them by name (rather than listing
# every other column to drop) stays correct if the CSV later gains new columns.
# .copy() gives an independent frame, as DataFrame.drop did.
CountyYrs_df = M_df[["county", "year"]].copy()
CountyYrs_df.head(10)

Unnamed: 0,county,year
0,Alachua,2019
1,Alachua,2018
2,Alachua,2017
3,Alachua,2016
4,Alachua,2015
5,Alachua,2014
6,Alachua,2013
7,Alachua,2010
8,Alachua,2009
9,Alachua,2008


# REMOVE SUBCOLUMNS/IDENTIFYING COLUMNS

In [148]:
# Remove the identifying columns and the population sub-columns so only the
# numeric features used for clustering remain.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it no longer raises KeyError for columns that
# were already removed.
M_df = M_df.drop(
    columns=["county", "year", "unincorporatedpop", "incorporatedpop"],
    errors="ignore",
)
M_df.head(10)

Unnamed: 0,totalpopulation,average_yearly_wage,lowtiermedianhomevalue,hightiermedianhomevalue,singlefamilymedianhomevalue,value_inches
0,267306,49116,123073.0,314621.0,208584.0,52.82
1,263291,47858,114833.0,300367.0,197879.0,66.46
2,260003,45705,107574.0,288320.0,187389.0,63.47
3,257062,44257,98319.0,268422.0,173570.0,46.39
4,254893,43598,92215.0,256906.0,164952.0,52.27
5,250730,42691,88593.0,248717.0,158633.0,58.97
6,248002,41588,83601.0,235069.0,149074.0,52.13
7,247336,39688,102223.0,269990.0,172110.0,45.0
8,256232,39080,110449.0,288381.0,183571.0,53.1
9,252388,37469,125463.0,325363.0,206437.0,45.17


In [149]:
# Output clean data.
# `file_path` is rebound here to the output location; index=False keeps the
# row index out of the written CSV.
file_path = "Master_data_ML.csv"
M_df.to_csv(file_path, index=False)

In [167]:
# Show the (rows, columns) shape of M_df.
print(M_df.shape)

(1304, 6)


# K-Means Functions

In [168]:
# Function to cluster a dataset and label it in place
def test_cluster_amount(df, clusters):
    """Fit K-Means on ``df`` and write the cluster labels into ``df["class"]``.

    The frame is modified in place and nothing is returned, so callers read
    the labels from the ``class`` column afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns to cluster. An existing ``class`` column is treated as
        labels from a previous call and is excluded from the fit.
    clusters : int
        Number of clusters to fit.
    """
    # Exclude labels left by an earlier call; otherwise they would be fed back
    # into K-Means as if they were a feature and skew the new clustering.
    features = df.drop(columns=["class"], errors="ignore")

    model = KMeans(n_clusters=clusters, random_state=5)

    # Fitting model
    model.fit(features)

    # Add (or overwrite) the class column on the caller's frame
    df["class"] = model.labels_

In [169]:
def get_clusters(k, data):
    """Return a copy of ``data`` with a ``class`` column of K-Means labels.

    Parameters
    ----------
    k : int
        Number of clusters to fit.
    data : pandas.DataFrame
        Feature columns to cluster. The input frame is left untouched. An
        existing ``class`` column is treated as stale labels and is excluded
        from the fit.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``class`` holding the fitted cluster labels.
    """
    # Create a copy of the DataFrame so the caller's frame is not modified
    data = data.copy()

    # Never cluster on labels left over from a previous run
    features = data.drop(columns=["class"], errors="ignore")

    # Initialize and fit the K-Means model
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(features)

    # labels_ already holds the training assignments, so the separate
    # predict() pass the original ran (and discarded) is not needed.
    data["class"] = model.labels_

    return data

# Plotting w/Columns

In [176]:
# Use a new, independent instance for the column-based plots.
# A plain assignment would only alias M_df, so the "class" column that
# test_cluster_amount writes into Mcol_df would also appear in M_df and be
# scaled and decomposed as a feature by the PCA cells further down.
Mcol_df = M_df.copy()

#### Elbow Curve

In [177]:
# Candidate cluster counts to evaluate
k = list(range(1, 11))

# Fit one K-Means model per candidate and collect its inertia
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(Mcol_df).inertia_
    for n_clusters in k
]

# Assemble the (k, inertia) pairs and draw the Elbow Curve with hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

#### 2D Plotting

In [174]:
# Plug and Play X and Y values
# test_cluster_amount fits K-Means with 4 clusters and writes the labels into
# Mcol_df["class"] in place; the scatter below is then grouped by that column.
test_cluster_amount(Mcol_df, 4)
Mcol_df.hvplot.scatter(x="value_inches", y="totalpopulation", by="class")

#### 3D Plotting w/Columns

In [179]:
# Plot the 3D-scatter with SPECIFIC columns.
# Expects Mcol_df to already carry a "class" column (written by
# test_cluster_amount), since both the colour and the marker symbol key off it.
fig = px.scatter_3d(
    Mcol_df,
    x="value_inches",
    y="totalpopulation",
    z="average_yearly_wage",
    color="class",
    symbol="class",
    width=800,
)
# Anchor the legend at x=0, y=1 (presumably the top-left corner of the figure).
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

# Plotting w/PCA

In [151]:
# Standardize every column of M_df with StandardScaler's default settings
# before PCA; the result is a NumPy array, not a DataFrame.
data_scaler = StandardScaler()
M_df_scaled = data_scaler.fit_transform(M_df)

In [152]:
# Print the scaled array's shape, then display the array itself.
print(M_df_scaled.shape)
M_df_scaled

(1304, 6)


array([[-0.05000515,  2.05393539,  0.53976154,  0.15178838,  0.5278608 ,
        -0.12275861],
       [-0.05881124,  1.87195192,  0.37723456,  0.06205819,  0.39625442,
         1.49625724],
       [-0.06602279,  1.5604969 ,  0.23405697, -0.01377874,  0.26729123,
         1.14135567],
       ...,
       [-0.58206601, -0.55545876, -1.0473026 , -0.93291829, -1.04327722,
         1.80368034],
       [-0.5819388 , -0.83812625, -0.88400638, -0.75861359, -0.84809899,
         0.80900638],
       [-0.58426369, -0.89888385, -0.71005911, -0.55732814, -0.6235383 ,
        -1.29785076]])

In [153]:
# Reduce the standardized features to 3 principal components.
pca = PCA(n_components=3)
M_pca = pca.fit_transform(M_df_scaled)
# Show the shape of the transformed array.
M_pca.shape

(1304, 3)

In [154]:
# Wrap the PCA output in a DataFrame with named component columns.
# NOTE: this frame gets a fresh 0..n-1 index, while M_df keeps its original
# labels after dropna; align by position (not by index label) if the
# components are ever joined back to the source rows.
M_pca_df = pd.DataFrame(M_pca, columns=["PCA1", "PCA2", "PCA3"])
M_pca_df.head(10)

Unnamed: 0,PCA1,PCA2,PCA3
0,1.377433,0.67345,-0.34983
1,1.09186,1.286535,1.140871
2,0.800285,1.071487,0.836867
3,0.501036,0.299864,-1.041471
4,0.303261,0.586589,-0.408643
5,0.139807,0.867162,0.330274
6,-0.065595,0.545193,-0.419758
7,0.285916,-0.121885,-1.068815
8,0.461004,0.115147,-0.131121
9,0.796038,-0.560865,-0.875213


#### ELBOW CURVE W/PCA

In [155]:
# Reset the inertia accumulator and define the candidate cluster counts (1 to 10).
inertia = []
k = list(range(1, 11))

In [156]:
# Rebuild `inertia` from scratch instead of appending to the list created in a
# different cell: re-running this cell on its own can then never leave
# `inertia` longer than `k`, which would break the DataFrame built from both.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(M_pca_df).inertia_
    for n_clusters in k
]

In [170]:
# Pair each candidate k with its inertia and plot the elbow curve for the
# PCA-reduced data; `k` and `inertia` must have the same length.
M_elbow = {"k": k, "inertia": inertia}
M_elbow_df = pd.DataFrame(M_elbow)
M_elbow_df.hvplot.line(x="k", y="inertia", title="PCA Elbow Curve", xticks=k)

#### 3D PLOTTING W/PCA

In [159]:
# Cluster the PCA components into 5 groups; get_clusters returns a labelled
# copy, so M_pca_df itself is left without a "class" column.
M_testK = get_clusters(5, M_pca_df)
M_testK.head(10)

Unnamed: 0,PCA1,PCA2,PCA3,class
0,1.377433,0.67345,-0.34983,4
1,1.09186,1.286535,1.140871,4
2,0.800285,1.071487,0.836867,4
3,0.501036,0.299864,-1.041471,4
4,0.303261,0.586589,-0.408643,4
5,0.139807,0.867162,0.330274,4
6,-0.065595,0.545193,-0.419758,4
7,0.285916,-0.121885,-1.068815,4
8,0.461004,0.115147,-0.131121,4
9,0.796038,-0.560865,-0.875213,4


In [160]:
# 3D scatter of the three principal components, coloured and symbol-coded by
# the cluster label that get_clusters stored in "class".
fig = px.scatter_3d(
    M_testK,
    x="PCA1",
    y="PCA2",
    z="PCA3",
    color="class",
    symbol="class",
    width=900,
)
# Anchor the legend at x=0, y=1 (presumably the top-left corner of the figure).
fig.update_layout(legend=dict(x=0, y=1))
fig.show()