# Clustering
## Consumer data in the United States

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the gzip-compressed CSV into a DataFrame and preview the first rows.
df = pd.read_csv("data/SCFP2019.csv.gz")
print(f"df shape: {df.shape}")
df.head()

In [None]:
# Share of rows that are business owners: the sum of "HBUS" over the row count.
# (Presumably "HBUS" is a 0/1 indicator -- confirm against the data codebook.)
hbus = df["HBUS"]
prop_biz_owners = hbus.sum() / hbus.shape[0]
print("proportion of business owners in df:", prop_biz_owners)

In [None]:
# Build a dataframe of normalized frequencies that differentiates business
# owners from non-business owners: swap each "INCCAT" code for its category
# label, then compute the share of every category within each "HBUS" group.
income_bands = ["0-20", "21-39.9", "40-59.9", "60-79.9", "80-89.9", "90-100"]
inccat_dict = dict(enumerate(income_bands, start=1))

inccat_labels = df["INCCAT"].replace(inccat_dict)
band_shares = inccat_labels.groupby(df["HBUS"]).value_counts(normalize=True)
df_inccat = band_shares.rename("frequency").to_frame().reset_index()

df_inccat

In [None]:
# Grouped bar chart of `df_inccat`: income category on the x-axis, with one
# bar per "HBUS" group inside each category.
sns.barplot(
    x="INCCAT",
    y="frequency",
    hue="HBUS",
    data=df_inccat,
    # Pass a concrete list so the bars follow the category order of `inccat_dict`.
    order=list(inccat_dict.values()),
)
plt.xlabel("Income Category")
# "frequency" is a normalized share between 0 and 1, not a percentage, so the
# axis label must not claim "%".
plt.ylabel("Frequency (proportion)")
plt.title("Income Distribution: Business Owners vs. Non-Business Owners");

In [None]:
# Scatter plot of "HOUSES" (y) against "DEBT" (x), colored by "HBUS".
# Both columns are divided by 1e8, so the axis labels state that unit;
# without it the tick values would read as raw amounts.
# NOTE(review): assumes "DEBT" and "HOUSES" are recorded in dollars -- confirm.
sns.scatterplot(x=df["DEBT"] / 1e8, y=df["HOUSES"] / 1e8, hue=df["HBUS"])
plt.xlabel("Household Debt [$100M]")
plt.ylabel("Home Value [$100M]")
plt.title("Home Value vs. Household Debt");

In [None]:
# Subset to business owners ("HBUS" == 1) whose "INCOME" is below $500,000.
is_biz_owner = df["HBUS"] == 1
under_income_cap = df["INCOME"] < 500_000
mask = under_income_cap & is_biz_owner
df_small_biz = df.loc[mask]
print(f"df_small_biz shape: {df_small_biz.shape}")
df_small_biz.head()

In [None]:
# Histogram of "AGE" for the small-business subset, using 10 bins.
ax = df_small_biz["AGE"].plot.hist(bins=10)
ax.set_xlabel("Age")
ax.set_ylabel("Frequency (count)")
ax.set_title("Small Business Owners: Age Distribution");

In [None]:
# Variance of every column, sorted ascending; the last 10 entries are the
# 10 features with the largest variance.
variances = df_small_biz.var()
top_ten_var = variances.sort_values().iloc[-10:]
top_ten_var

In [None]:
# Same ranking with a trimmed variance: `limits=(0.1, 0.1)` cuts 10% of the
# observations from each end of a column before its variance is computed.
trimmed_variances = df_small_biz.apply(trimmed_var, limits=(0.1, 0.1))
top_ten_trim_var = trimmed_variances.sort_values().iloc[-10:]
top_ten_trim_var

In [None]:
# Bar chart of `top_ten_trim_var`: trimmed variance on the x-axis, feature
# names on the y-axis.
chart_title = "Small Business Owners: High Variance Features"
axis_titles = dict(xaxis_title="Trimmed Variance [$]", yaxis_title="Feature")

fig = px.bar(x=top_ten_trim_var, y=top_ten_trim_var.index, title=chart_title)
fig.update_layout(**axis_titles)

In [None]:
# Names of the 5 features with the highest trimmed variance (the last 5
# labels of `top_ten_trim_var`).
high_var_cols = top_ten_trim_var.index[-5:].to_list()
high_var_cols

In [None]:
# Feature matrix for the model: only the high-variance columns.
X = df_small_biz.loc[:, high_var_cols]
print(f"X shape: {X.shape}")
X.head()

In [None]:
# Sweep the number of clusters for K-Means and record two quality measures
# per k (inertia and silhouette score) to decide which k works best.
n_clusters = range(2, 13)
inertia_errors = []
silhouette_scores = []

for k in n_clusters:
    # Standardize the features, then cluster; the seed is fixed for repeatability.
    model = make_pipeline(StandardScaler(), KMeans(n_clusters=k, random_state=42))
    model.fit(X)
    kmeans = model.named_steps["kmeans"]

    inertia_errors.append(kmeans.inertia_)

    # Score the labels in the space K-Means actually clustered: the
    # standardized features. Scoring against raw `X` would measure distances
    # on a different scale from the one the clusters were fit on.
    X_scaled = model.named_steps["standardscaler"].transform(X)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

print("Inertia:", inertia_errors)
print()
# Only the first three silhouette scores are printed, as a short preview.
print("Silhouette Scores:", silhouette_scores[:3])

In [None]:
# Line plot of inertia (y) against the number of clusters (x).
inertia_title = "K-Means Model: Inertia vs Number of Clusters"
inertia_axes = dict(xaxis_title="Number of Clusters", yaxis_title="Inertia")

fig = px.line(x=n_clusters, y=inertia_errors, title=inertia_title)
fig.update_layout(**inertia_axes)

In [None]:
# Line plot of silhouette score (y) against the number of clusters (x).
silhouette_title = "K-Means Model: Silhouette Score vs Number of Clusters"
silhouette_axes = dict(
    xaxis_title="Number of Clusters",
    yaxis_title="Silhouette Score",
)

fig = px.line(x=n_clusters, y=silhouette_scores, title=silhouette_title)
fig.update_layout(**silhouette_axes)

In [None]:
# Final model: standardize the features, then run K-Means with 3 clusters
# (the number taken as optimal in this analysis).
scaler = StandardScaler()
clusterer = KMeans(n_clusters=3, random_state=42)
final_model = make_pipeline(scaler, clusterer)
final_model.fit(X)

In [None]:
# Cluster label assigned to each row of `X`, then the mean of every feature
# within each cluster.
kmeans_step = final_model.named_steps["kmeans"]
labels = kmeans_step.labels_
xgb = X.groupby(by=labels).mean()
xgb

In [None]:
# Grouped (side-by-side) bar chart of the per-cluster feature means in `xgb`.
cluster_title = "Small Business Owner Finances by Cluster"
cluster_axes = dict(xaxis_title="Cluster", yaxis_title="Value [$]")

fig = px.bar(xgb, barmode="group", title=cluster_title)
fig.update_layout(**cluster_axes)

In [None]:
# Reduce `X` to two principal components and wrap the result in a DataFrame.
# NOTE(review): PCA is fit on the unscaled `X`, while K-Means was fit on
# standardized features -- confirm this is intended.
pca = PCA(n_components=2, random_state=42)
X_t = pca.fit_transform(X)

# One column per component, in the order PCA returned them.
X_pca = pd.DataFrame({"PC1": X_t[:, 0], "PC2": X_t[:, 1]})

print(f"X_pca shape: {X_pca.shape}")
X_pca.head()

In [None]:
# Scatter plot of "PC2" (y) against "PC1" (x); points are colored by cluster
# label, cast to strings.
cluster_colors = labels.astype(str)
pca_title = "PCA Representation of Clusters"
pca_axes = dict(xaxis_title="PC1", yaxis_title="PC2")

fig = px.scatter(
    data_frame=X_pca, x="PC1", y="PC2", color=cluster_colors, title=pca_title
)

fig.update_layout(**pca_axes)