In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for plots
sns.set_style("whitegrid")

In [None]:
import os
from pathlib import Path

# Folder that holds SCFP2019.csv. The SCF_PROJECT_DIR environment variable
# overrides the default so the notebook is not tied to one machine's layout.
PROJECT_DIR = Path(
    os.environ.get(
        "SCF_PROJECT_DIR",
        "C:/Users/USER/Documents/my_DS_projects/UNSUPERVISED/credit-fear-clustering",
    )
)

# Only change directory when the folder exists; otherwise keep the current
# working directory instead of aborting the whole run with FileNotFoundError.
if PROJECT_DIR.is_dir():
    os.chdir(PROJECT_DIR)
else:
    print("Project directory not found, staying in current directory:", PROJECT_DIR)
print("Current Directory:", os.getcwd())

In [None]:
# Load the SCF CSV from the current working directory into `df`
df = pd.read_csv("SCFP2019.csv")

# Print the (rows, columns) shape, then show the first rows as the cell output
print("Dataset Shape:", df.shape)
df.head()

# Explore

In [None]:
# Filter households that are credit fearful
# (boolean mask: True for rows where the `TURNFEAR` column equals 1)
mask = df['TURNFEAR'] == 1
df_fear = df[mask]
print("Credit-Fearful Households Shape:", df_fear.shape)

## Age Analysis

In [None]:
# Lookup from the numeric codes in `AGECL` to readable age-bracket labels
agecl_dict = {
    1: "Under 35",
    2: "35-44",
    3: "45-54",
    4: "55-64",
    5: "65-74",
    6: "75 or Older",
}

# Replace each code with its label; values not present in the dict are kept as-is
age_cl = df_fear['AGECL'].replace(agecl_dict)
print("age_cl type:", type(age_cl))
print("age_cl shape:", age_cl.shape)
age_cl.head()

In [None]:
# Age group mapping
# NOTE(review): same code-to-label mapping as the `agecl_dict` in the previous cell
agecl_dict = {
    1: "Under 35", 2: "35-44", 3: "45-54",
    4: "55-64", 5: "65-74", 6: "75 or Older"
}

df_fear = df_fear.copy()  # Ensure df_fear is a separate DataFrame
# Store the readable labels in a new column next to the raw `AGECL` codes
df_fear['AGE_GROUP'] = df_fear['AGECL'].replace(agecl_dict)

print("df_fear type:", type(df_fear))
print("df_fear shape:", df_fear.shape)
df_fear.head()

In [None]:
# Count how many credit-fearful households fall in each labelled age group
age_cl_value_counts = age_cl.value_counts()

# Bar plot of `age_cl_value_counts`
# (title typo "Credict" corrected to "Credit")
age_cl_value_counts.plot(
    kind = 'bar',
    xlabel = 'Age Group',
    ylabel = 'Frequency [count]',
    title = 'Credit Fearful: Age Groups'
);

In [None]:
# Age group distribution
# Same counts as the previous chart, read from the `AGE_GROUP` column of `df_fear`
plt.figure(figsize=(8,5))
df_fear['AGE_GROUP'].value_counts().plot(kind='bar', color='skyblue')
plt.xlabel("Age Group")
plt.ylabel("Count")
plt.title("Credit Fearful: Age Groups")
# Tilt the tick labels so the age-bracket names do not overlap
plt.xticks(rotation=45)
plt.show()

Our chart is telling us that many of the people who fear being denied credit are younger. 

In [None]:
# Plot histogram of "AGE"
# Uses the `AGE` column (rather than the bracketed `AGECL`) split into 10 bins
df_fear['AGE'].hist(bins = 10)
plt.xlabel('Age')
plt.ylabel('Frequency counts')
# Trailing semicolon suppresses the text repr of the returned object
plt.title('Credit Fearful: Age Distribution');

In [None]:
# Plot histogram of "AGE"
# NOTE(review): this cell is identical to the previous one and redraws the
# same histogram — consider removing one of them
df_fear['AGE'].hist(bins = 10)
plt.xlabel('Age')
plt.ylabel('Frequency counts')
plt.title('Credit Fearful: Age Distribution');

It looks like younger people are still more concerned about being able to secure a loan than older people, but the people who are most concerned seem to be between 30 and 40.

## RACE Analysis

In [None]:
# Lookup from the numeric codes in `RACE` to readable labels
race_dict = {
    1: "White/Non-Hispanic",
    2: "Black/African-American",
    3: "Hispanic",
    5: "Other",
}
race = df_fear['RACE'].replace(race_dict)
# normalize=True returns each group's share as a fraction between 0 and 1
race_value_counts = race.value_counts(normalize = True)
# Create bar chart of race_value_counts
race_value_counts.plot(kind = 'barh')
plt.xlim((0, 1))
# The plotted values are fractions (0-1), not percentages, so label them as such
plt.xlabel("Frequency [proportion]")
plt.ylabel("Race")
plt.title("Credit Fearful: Racial Groups");

In [None]:
# Same chart as the previous cell, but for every respondent in `df`
# (relies on `race_dict` defined in the previous cell)
race = df['RACE'].replace(race_dict)
# normalize=True returns each group's share as a fraction between 0 and 1
race_value_counts = race.value_counts(normalize = True)
# Create bar chart of race_value_counts
race_value_counts.plot(kind = 'barh')
plt.xlim((0, 1))
# The plotted values are fractions (0-1), not percentages, so label them as such
plt.xlabel("Frequency [proportion]")
plt.ylabel("Race")
plt.title("SCF Respondents: Racial Groups");

In [None]:
# Code-to-label mapping for `RACE` (redefined here so the cell is self-contained)
race_dict = {1: "White/Non-Hispanic", 2: "Black/African-American", 3: "Hispanic", 5: "Other"}
# Store the readable labels in a new column of the credit-fearful subset
df_fear['RACE_GROUP'] = df_fear['RACE'].replace(race_dict)

plt.figure(figsize=(8,5))
# normalize=True plots each group's share (fraction of rows) rather than raw counts
df_fear['RACE_GROUP'].value_counts(normalize=True).plot(kind='bar', color='lightcoral')
plt.xlabel("Race")
plt.ylabel("Proportion")
plt.title("Credit Fearful: Racial Groups")
plt.show()

### Recreate the bar chart I just made, but this time use the entire dataset `df` instead of the subset `df_fear`

In [None]:
# Racial-group shares for every respondent in `df`
# (relies on `race_dict` defined in an earlier cell)
# NOTE(review): an earlier cell already draws this same chart for `df`
race = df['RACE'].replace(race_dict)
# normalize=True returns each group's share as a fraction between 0 and 1
race_value_counts = race.value_counts(normalize = True)
# Create bar chart of race_value_counts
race_value_counts.plot(kind = 'barh')
plt.xlim((0, 1))
# The plotted values are fractions (0-1), not percentages, so label them as such
plt.xlabel("Frequency [proportion]")
plt.ylabel("Race")
plt.title("SCF Respondents: Racial Groups");

## Income

In [None]:
# Lookup from the numeric codes in `INCCAT` to readable range labels
inccat_dict = {
    1: "0-20",
    2: "21-39.9",
    3: "40-59.9",
    4: "60-79.9",
    5: "80-89.9",
    6: "90-100",
}

# Share of each income category within each `TURNFEAR` group, as a tidy
# DataFrame (the grouped value counts are normalized, named 'frequency', and
# the index levels are turned back into columns)
df_inccat = (
    df['INCCAT']
    .replace(inccat_dict)
    .groupby(df['TURNFEAR'])
    .value_counts(normalize = True)
    .rename('frequency')
    .to_frame()
    .reset_index()
)

print("df_inccat type:", type(df_inccat))
print("df_inccat shape:", df_inccat.shape)
df_inccat

In [None]:
# NOTE(review): this cell is identical to the previous one and recomputes
# the same `inccat_dict` / `df_inccat` — consider removing one of them
# Lookup from the numeric codes in `INCCAT` to readable range labels
inccat_dict = {
    1: "0-20",
    2: "21-39.9",
    3: "40-59.9",
    4: "60-79.9",
    5: "80-89.9",
    6: "90-100",
}

# Share of each income category within each `TURNFEAR` group, as a tidy DataFrame
df_inccat = (
    df['INCCAT']
    .replace(inccat_dict)
    .groupby(df['TURNFEAR'])
    .value_counts(normalize = True)
    .rename('frequency')
    .to_frame()
    .reset_index()
)

print("df_inccat type:", type(df_inccat))
print("df_inccat shape:", df_inccat.shape)
df_inccat

In [None]:
# Create bar chart of `df_inccat`
# One bar per income category and `TURNFEAR` value; `order` keeps the
# categories in the order they are declared in `inccat_dict`
sns.barplot(
    x = 'INCCAT',
    y = 'frequency',
    hue = 'TURNFEAR',
    data = df_inccat,
    order = list(inccat_dict.values())
)
plt.xlabel("Income Category")
# `frequency` holds normalized value counts (fractions 0-1), not percentages
plt.ylabel("Frequency [proportion]")
plt.title("Income Distribution: Credit Fearful vs. Non-fearful");

Comparing the income categories across the fearful and non-fearful groups, we can see that credit fearful households are much more common in the lower income categories. In other words, the credit fearful have lower incomes.

In [None]:
# Code-to-label mapping for `INCCAT` (same mapping as in the earlier cells)
inccat_dict = {
    1: "0-20", 2: "21-39.9", 3: "40-59.9", 4: "60-79.9", 5: "80-89.9", 6: "90-100"
}
# Store the readable labels in a new column of the credit-fearful subset
df_fear['INCOME_CATEGORY'] = df_fear['INCCAT'].replace(inccat_dict)

plt.figure(figsize=(8,5))
# Raw household counts per income category, in the order declared in `inccat_dict`
sns.countplot(x='INCOME_CATEGORY', data=df_fear, order=inccat_dict.values(), palette='coolwarm')
plt.xlabel("Income Category")
plt.ylabel("Count")
plt.title("Income Distribution: Credit Fearful Households")
plt.xticks(rotation=45)
plt.show()

## ASSET

In [None]:
# Correlation between `ASSET` and `HOUSES` across all rows of `df`
# (`Series.corr` computes the Pearson coefficient by default)
asset_house_corr = df['ASSET'].corr(df['HOUSES'])
print("SCF: Asset Houses Correlation:", asset_house_corr)

That's a moderate positive correlation, which we would probably expect, right? 

In [None]:
# Same correlation, restricted to the credit-fearful subset `df_fear`
# (rebinds `asset_house_corr`, overwriting the full-dataset value)
asset_house_corr = df_fear['ASSET'].corr(df_fear['HOUSES'])
print("Credit Fearful: Asset Houses Correlation:", asset_house_corr)

Aha! They're different! It's still only a moderate positive correlation, but the relationship between the total value of assets and the value of the primary residence is stronger for our TURNFEAR group than it is for the population as a whole.

In [None]:
# Scatter plot of `ASSET` against `DEBT` for the credit-fearful subset
plt.figure(figsize=(8,5))
# alpha=0.5 makes overlapping points semi-transparent
sns.scatterplot(x=df_fear['DEBT'], y=df_fear['ASSET'], alpha=0.5, color='purple')
plt.xlabel("Debt")
plt.ylabel("Assets")
plt.title("Credit Fearful: Debt vs Assets")
plt.show()

## Education Analysis

In [None]:
# Household counts per value of `EDUC` in the credit-fearful subset
# (the raw `EDUC` codes are plotted; no label mapping is applied here)
plt.figure(figsize=(8,5))
sns.countplot(x='EDUC', data=df_fear, palette='magma')
plt.xlabel("Education Level")
plt.ylabel("Count")
plt.title("Education Level: Credit Fearful Households")
plt.show()

In [None]:
# Share of each `EDUC` value within each `TURNFEAR` group, as a tidy
# DataFrame with columns for the group, the education code and 'frequency'
df_educ = (
    df['EDUC']
    .groupby(df['TURNFEAR'])
    .value_counts(normalize = True)
    .rename('frequency')
    .to_frame()
    .reset_index()
)
print("df_educ type:", type(df_educ))
print("df_educ shape:", df_educ.shape)
df_educ.head()

In [None]:
# Create bar chart of `df_educ`
# One bar per education code and `TURNFEAR` value
sns.barplot(
    x = 'EDUC',
    y = 'frequency',
    hue = 'TURNFEAR',
    data = df_educ
)
plt.xlabel("Education Level")
# `frequency` holds normalized value counts (fractions 0-1), not percentages
plt.ylabel("Frequency [proportion]")
plt.title("Educational Attainment: Credit Fearful vs. Non-fearful");

In this plot, we can see that a much higher proportion of credit-fearful respondents have only a high school diploma, while university degrees are more common among the non-credit fearful.

## Debt Analysis

In [None]:
# Create scatter plot of ASSET vs DEBT, df
# (every row of `df`; the trailing semicolon hides the Axes repr)
df.plot.scatter(x = 'DEBT', y = 'ASSET');

In [None]:
# Create scatter plot of ASSET vs DEBT, df_fear (credit-fearful subset only)
df_fear.plot.scatter(x = 'DEBT', y = 'ASSET');

In [None]:
# Create scatter plot of HOUSES vs DEBT, df
# (every row of `df`; the trailing semicolon hides the Axes repr)
df.plot.scatter(x = 'DEBT', y = 'HOUSES');

## Correlation Matrix

In [None]:
# Correlation matrix for credit-fearful households (`df_fear`)
selected_cols = ["ASSET", "HOUSES", "INCOME", "DEBT", "EDUC"]
# Pairwise correlations between the selected columns
corr_matrix = df_fear[selected_cols].corr()
plt.figure(figsize=(8,5))
# annot/fmt print each coefficient in its cell, rounded to two decimals
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix: Financial Features")
plt.show()

In [None]:
# Correlation matrix for the full `df` (all households, not only the
# non-fearful ones), for comparison with the credit-fearful heatmap
# NOTE(review): the title below is identical to the credit-fearful heatmap's

selected_cols = ["ASSET", "HOUSES", "INCOME", "DEBT", "EDUC"]
# Pairwise correlations between the selected columns
corr_matrix = df[selected_cols].corr()
plt.figure(figsize=(8,5))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix: Financial Features")
plt.show()

Whoa! There are some pretty important differences here! The relationship between "DEBT" and "HOUSES" is positive for both datasets, but while the coefficient for df is fairly weak at 0.26, the same number for df_fear is 0.96.

# Clustering

In [None]:
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import plotly.express as px

In [None]:
def wranglefilepath(filepath):
    """Load the SCF CSV and keep only the households used for clustering.

    Parameters
    ----------
    filepath : str
        Location of CSV file.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``TURNFEAR`` equals 1 and whose ``NETWORTH`` is below 2e6,
        i.e. credit-fearful households with a net worth under $2 million.
        The original row index is preserved.
    """
    households = pd.read_csv(filepath)
    # Keep the two conditions separate so each filter is easy to read
    is_credit_fearful = households['TURNFEAR'] == 1
    under_net_worth_cap = households['NETWORTH'] < 2e6
    return households[is_credit_fearful & under_net_worth_cap]

In [None]:
# Reload the data through `wranglefilepath`.
# NOTE: this rebinds `df`, replacing the full dataset loaded at the top of
# the notebook with the filtered subset; every later cell sees the subset.
df = wranglefilepath("SCFP2019.csv")

print("df type:", type(df))
print("df shape:", df.shape)
df.head()

Because our dataset has a very large number of features, choosing which ones to use for the cluster analysis can be a problem. One way to choose the best features for clustering is to determine which numerical features have the largest variance.

In [None]:
# Calculate variance, get 10 largest features
# (sorted ascending, so `tail(10)` holds the ten largest variances)
top_ten_var = df.var().sort_values().tail(10)

print("top_ten_var type:", type(top_ten_var))
print("top_ten_var shape:", top_ten_var.shape)
top_ten_var

In [None]:
# Create horizontal bar chart of `top_ten_var`
# Variances go on the x axis and the feature names (the Series index) on y
fig = px.bar(
    x = top_ten_var,
    y = top_ten_var.index,
    title = 'SCF: High Variance Features'

)
fig.update_layout(xaxis_title = 'Variance', yaxis_title = 'Features')
fig.show()

In [None]:
# Create a boxplot of `NHNFIN`
# Horizontal box plot of the `NHNFIN` column of the filtered `df`
fig = px.box(
    data_frame = df,
    x = 'NHNFIN',
    title = 'Distribution of Non-home, Non-Financial Assets'
)
fig.update_layout(xaxis_title = 'Value ($)')
fig.show()

The dataset is massively right-skewed because of the huge outliers on the right side of the distribution. Even though we already excluded households with a high net worth with our wrangle function, the variance is still being distorted by some extreme outliers.
The best way to deal with this is to look at the trimmed variance, where we remove extreme values before calculating variance.

In [None]:
# Calculate trimmed variance
# `trimmed_var` is applied column by column with limits (0.1, 0.1); the result
# is sorted ascending so `tail(10)` holds the ten largest trimmed variances
top_ten_trim_var = df.apply(trimmed_var, limits = (0.1, 0.1)).sort_values().tail(10)

print("top_ten_trim_var type:", type(top_ten_trim_var))
print("top_ten_trim_var shape:", top_ten_trim_var.shape)
top_ten_trim_var

In [None]:
# Create horizontal bar chart of `top_ten_trim_var`
# Trimmed variances go on the x axis and the feature names (the Series index) on y
fig = px.bar(
    x = top_ten_trim_var,
    y = top_ten_trim_var.index,
    title = 'SCF: High Variance Features'
)
fig.update_layout(xaxis_title = 'Trimmed Variance', yaxis_title = 'Feature')
fig.show()

Here are three things to notice in this plot. First, the variances have decreased a lot. In our previous chart, the x-axis went up to $80 billion; this one goes up to $12 billion. Second, the top 10 features have changed a bit. All the features relating to business ownership ("...BUS") are gone. Finally, we can see that there are big differences in variance from feature to feature. For example, the variance for "WAGEINC" is around $500 million, while the variance for "ASSET" is nearly $12 billion. In other words, these features have completely different scales. This is something that we'll need to address before we can make good clusters.

### Generate a list of the names of the five features with the highest trimmed variance

In [None]:
# Names of the five features with the largest trimmed variance
# (`top_ten_trim_var` is sorted ascending, so the tail holds the largest)
high_var_cols = top_ten_trim_var.tail(5).index.to_list()

print("high_var_cols type:", type(high_var_cols))
# Report the length of the list itself, not of the ten-element Series
print("high_var_cols len:", len(high_var_cols))
high_var_cols

## Split

In [None]:
# Feature matrix: only the high-variance columns selected above
X = df[high_var_cols]

print("X type:", type(X))
print("X shape:", X.shape)
X.head()

# Build model

## Iterate

During our EDA, we saw that we had a scale issue among our features. That issue can make it harder to cluster the data, so we'll need to fix that to help our analysis along. One strategy we can use is standardization, a statistical method for putting all the variables in a dataset on the same scale. 

In [None]:
# Candidate cluster counts: k = 2 .. 12
n_clusters = range(2,13)
inertia_errors = []
silhouette_scores = []

# Train one scaler + KMeans pipeline per k and record inertia and silhouette score.
for k in n_clusters:
    # Build Model
    model = make_pipeline(StandardScaler(), KMeans(n_clusters = k, n_init = 10, random_state = 42))
    # Train Model
    model.fit(X)
    kmeans = model.named_steps['kmeans']
    # Cal Inertia
    inertia_errors.append(kmeans.inertia_)
    # Cal Silhouette score
    # KMeans clustered the standardized features, so the silhouette score has
    # to be measured in that same space; scoring the raw `X` would let the
    # largest-scale feature dominate the distances.
    X_scaled = model.named_steps['standardscaler'].transform(X)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

print("inertia_errors type:", type(inertia_errors))
print("inertia_errors len:", len(inertia_errors))
print("Inertia:", inertia_errors)
print()
print("silhouette_scores type:", type(silhouette_scores))
print("silhouette_scores len:", len(silhouette_scores))
print("Silhouette Scores:", silhouette_scores)

### Determining number clusters

In [None]:
# Create line plot of `inertia_errors` vs `n_clusters`
# (label typos fixed: "CLusters" -> "Clusters", "Inertial" -> "Inertia")
fig = px.line(
    x = n_clusters, y = inertia_errors, title = 'K-Means Model: Inertia vs Number of Clusters'
)
fig.update_layout(xaxis_title = 'Number of Clusters (k)', yaxis_title = 'Inertia')
fig.show()

We can see that the line starts to flatten out around 4 or 5 clusters.

In [None]:
# Create a line plot of `silhouette_scores` vs `n_clusters`
# (one point per k tried in the training loop)
fig = px.line(
    x = n_clusters, y = silhouette_scores, title = 'K-Means Model: Silhouette Score vs Number of Clusters'
)
fig.update_layout(xaxis_title = 'Number of Clusters (k)', yaxis_title = 'Silhouette Score')
fig.show()

This one's a little less straightforward, but we can see that the best silhouette scores occur when there are 3 or 4 clusters.
Putting the information from this plot together with our inertia plot, it seems like the best setting for n_clusters will be 4.

In [None]:
from sklearn.utils.validation import check_is_fitted
# Build model
# Same scaler + KMeans pipeline as in the search loop, fixed at 4 clusters
final_model = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters = 4, n_init = 10, random_state = 42)
)

# Fit model to data
final_model.fit(X)

# Assert that model has been fit to data
# (raises NotFittedError if the pipeline has not been fitted)
check_is_fitted(final_model)

In [None]:
# Cluster label assigned to each row of `X` by the fitted KMeans step
labels = final_model.named_steps['kmeans'].labels_

print("labels type:", type(labels))
print("labels len:", len(labels))
print(labels[:5])

In [None]:
# Mean of every feature in `X` within each cluster (one row per cluster label)
xgb = X.groupby(labels).mean()

print("xgb type:", type(xgb))
print("xgb shape:", xgb.shape)
xgb

In [None]:
# Create side-by-side bar chart of `xgb`
# barmode='group' places the bars for each feature next to each other per cluster
fig = px.bar(
    xgb,
    barmode = 'group',
    title = 'Mean Household Finances by Cluster'
)
fig.update_layout(xaxis_title = 'Cluster', yaxis_title = 'value [$]')
fig.show()

Remember that our clusters are based partially on NETWORTH, which means that the households in the 0 cluster have the smallest net worth, and the households in the 2 cluster have the highest. Based on that, there are some interesting things to unpack here.

First, take a look at the DEBT variable. You might think that it would scale as net worth increases, but it doesn't. The lowest amount of debt is carried by the households in cluster 2, even though the value of their houses (shown in green) is roughly the same. You can't really tell from this data what's going on, but one possibility might be that the people in cluster 2 have enough money to pay down their debts, but not quite enough money to leverage what they have into additional debts. The people in cluster 3, by contrast, might not need to worry about carrying debt because their net worth is so high.

Finally, since we started out this project looking at home values, take a look at the relationship between DEBT and HOUSES. The value of the debt for the people in cluster 0 is higher than the value of their houses, suggesting that most of the debt being carried by those people is tied up in their mortgages — if they own a home at all. Contrast that with the other three clusters: the value of everyone else's debt is lower than the value of their homes.

So all that's pretty interesting, but it's different from what we did last time, right? At this point in the last lesson, we made a scatter plot. This was a straightforward task because we only worked with two features, so we could plot the data points in two dimensions. But now X has five dimensions! How can we plot this to give stakeholders a sense of our clusters?

In [None]:
from sklearn.decomposition import PCA
# Instantiate transformer
# (two components so the result can be drawn as a 2-D scatter plot)
pca = PCA(n_components = 2, random_state = 42)

# Transform `X`
# NOTE(review): PCA is fit on the raw `X`, while `final_model` clustered
# standardized features — confirm the unscaled projection is intended
X_t = pca.fit_transform(X)

# Put `X_t` into DataFrame
# (new default integer index; it does not reuse the index of `X`)
X_pca = pd.DataFrame(X_t, columns = ['PC1', 'PC2'])

print("X_pca type:", type(X_pca))
print("X_pca shape:", X_pca.shape)
X_pca.head()

In [None]:
# Create scatter plot of `PC2` vs `PC1`
# Labels are cast to strings before being passed as the point colors
fig = px.scatter(
    data_frame = X_pca,
    x = 'PC1',
    y = 'PC2',
    color = labels.astype(str),
    title = 'PCA Representation of Clusters'
)
fig.update_layout(xaxis_title = 'PC1', yaxis_title = 'PC2')
fig.show()

# Creating Interactive Dashboard

In [135]:
from dash import Input, Output, dcc, html, Dash
import pandas as pd
import plotly.express as px
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from scipy.stats.mstats import trimmed_var

In [136]:
# Create the Dash application object that the layout and callbacks attach to
app = Dash(__name__)
print("app type:", type(app))

app type: <class 'dash.dash.Dash'>


## Build Dashboard

### App Layout

In [137]:
# Page layout. The component ids ('bar-chart', 'trim-button', 'k-slider',
# 'metrics', 'pca-scatter') are the ones referenced by the callbacks below.
app.layout = html.Div(
    [
        # Application title
        html.H1('Survey of Consumer Finances '),

        # Bar chart section
        html.H2('High Variance Features'),
        dcc.Graph(id='bar-chart'),

        # Trimmed/Not Trimmed selection
        # (the selected boolean value is passed to the callbacks as `trimmed`)
        html.Label('Select Variance Calculation Method:'),
        dcc.RadioItems(
            options=[
                {'label': 'Trimmed', 'value': True},
                {'label': 'Not Trimmed', 'value': False}
            ],
            value=False,
            id='trim-button',
            inline=True  # Display options inline
        ),

        # K-means clustering section
        # (slider range 2-12 in steps of 1; its value is passed to the callbacks as `k`)
        html.H2('K-means Clustering'),
        html.H3('Number of Clusters (k)'),
        dcc.Slider(min=2, max=12, step=1, value=2, id='k-slider'),
        html.Div(id='metrics'),

        # PCA scatter plot
        dcc.Graph(id='pca-scatter')
    ]
)


### Variance Bar Chart

In [138]:
def get_high_var_features(trimmed = True, return_feat_names = True):
    """Return the five highest-variance features of the global ``df``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, ranks features by trimmed variance (``trimmed_var``
        with its default limits, which drop the bottom and top 10% of
        observations). If ``False``, ranks by ordinary variance.

    return_feat_names : bool, default=True
        If ``True``, returns the feature names as a ``list``. If ``False``,
        returns a ``Series`` whose index is the feature names and whose
        values are the variances.

    Returns
    -------
    list or pandas.Series
        The five features with the largest variance, in ascending order
        of variance.
    """
    # Pick the variance measure first, then rank once for both branches
    if trimmed:
        variances = df.apply(trimmed_var)
    else:
        variances = df.var()
    # Ascending sort, so the last five entries are the largest
    largest_five = variances.sort_values().tail(5)
    return largest_five.index.tolist() if return_feat_names else largest_five

In [139]:
@app.callback(
    Output('bar-chart', 'figure'),
    Input('trim-button', 'value')
)
def serve_bar_chart(trimmed = True):
    """Build the horizontal bar chart of the five highest-variance features.

    Registered as the Dash callback that fills the ``bar-chart`` figure from
    the value of ``trim-button``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, the chart shows trimmed variances; otherwise ordinary
        variances.

    Returns
    -------
    plotly figure
        Bar chart with variance on the x axis and feature name on the y axis.
    """
    # Series: index = feature names, values = variances
    variances = get_high_var_features(trimmed = trimmed, return_feat_names = False)
    # `update_layout` returns the figure itself, so the calls can be chained
    return px.bar(
        x = variances, y = variances.index, orientation = 'h'
    ).update_layout(xaxis_title = 'Variance', yaxis_title = 'Feature')

### K-Means Slider and Metrics

In [140]:
def get_model_metrics(trimmed = True, k = 2, return_metrics = False):
    """Build ``KMeans`` model based on five highest-variance features in ``df``.

    The features are standardized with ``StandardScaler`` before clustering.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, selects features by trimmed variance, removing bottom
        and top 10% of observations.

    k : int, default=2
        Number of clusters.

    return_metrics : bool, default=False
        If ``False`` returns the fitted scaler + ``KMeans`` pipeline. If
        ``True`` returns ``dict`` with inertia and silhouette score.

    Returns
    -------
    sklearn.pipeline.Pipeline or dict
        The fitted pipeline, or ``{'inertia': ..., 'silhouette': ...}`` with
        the inertia rounded to an integer and the silhouette score rounded
        to three decimals.
    """
    features = get_high_var_features(trimmed = trimmed, return_feat_names = True)
    # Create feature matrix
    X = df[features]
    # Build model
    model = make_pipeline(StandardScaler(), KMeans(n_clusters = k, n_init = 'auto', random_state = 42))
    model.fit(X)
    if not return_metrics:
        return model

    kmeans = model.named_steps['kmeans']
    # KMeans clustered the standardized features, so the silhouette score has
    # to be measured in that same space; scoring the raw `X` would let the
    # largest-scale feature dominate the distances.
    X_scaled = model.named_steps['standardscaler'].transform(X)
    return {
        'inertia': round(kmeans.inertia_),
        'silhouette': round(silhouette_score(X_scaled, kmeans.labels_), 3)
    }

In [141]:
@app.callback(
    Output('metrics', 'children'),
    Input('trim-button', 'value'),
    Input('k-slider', 'value')
)
def serve_metrics(trimmed = True, k = 2):
    """Render the inertia and silhouette score of the ``KMeans`` model.

    Registered as the Dash callback that fills the ``metrics`` container from
    the values of ``trim-button`` and ``k-slider``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, the model's features are selected by trimmed variance.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    list
        Two ``H3`` elements: the inertia, then the silhouette score.
    """
    scores = get_model_metrics(trimmed = trimmed, k = k, return_metrics = True)
    inertia_heading = html.H3(f"Inertia: {scores['inertia']}")
    silhouette_heading = html.H3(f"Silhouette Score: {scores['silhouette']}")
    return [inertia_heading, silhouette_heading]

### PCA Labels

In [142]:
def get_pca_labels(trimmed = True, k = 2):
    """Project the selected features onto two principal components and
    attach the ``KMeans`` cluster labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, features are selected by trimmed variance, removing
        bottom and top 10% of observations.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    pandas.DataFrame
        Columns ``PC1``, ``PC2`` and ``labels`` (cluster label as a string),
        with rows sorted by ``labels``.
    """
    # Feature matrix built from the five highest-variance columns of `df`
    selected = get_high_var_features(trimmed = trimmed, return_feat_names = True)
    components = PCA(n_components = 2, random_state = 42).fit_transform(df[selected])
    projected = pd.DataFrame(components, columns = ['PC1', 'PC2'])
    # Labels come from a separately fitted scaler + KMeans pipeline
    pipeline = get_model_metrics(trimmed = trimmed, k = k, return_metrics = False)
    projected['labels'] = pipeline.named_steps['kmeans'].labels_.astype(str)
    return projected.sort_values('labels')

In [143]:
@app.callback(
    Output('pca-scatter', 'figure'),
    Input('trim-button', 'value'),
    Input('k-slider', 'value')
)
def serve_scatter_plot(trimmed = True, k = 2):
    """Build the 2D PCA scatter plot of ``df`` colored by ``KMeans`` label.

    Registered as the Dash callback that fills the ``pca-scatter`` figure
    from the values of ``trim-button`` and ``k-slider``.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, features are selected by trimmed variance.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    plotly figure
        Scatter plot of ``PC2`` against ``PC1``, one color per cluster label.
    """
    projected = get_pca_labels(trimmed = trimmed, k = k)
    # `update_layout` returns the figure itself, so the calls can be chained
    return px.scatter(
        data_frame = projected,
        x = 'PC1',
        y = 'PC2',
        color = 'labels',
        title = 'PCA Representation of Clusters'
    ).update_layout(xaxis_title = 'PC1', yaxis_title = 'PC2')

# Application Deployment

In [144]:
if __name__ == '__main__':
    # `Dash.run_server` is deprecated in favor of `Dash.run` in recent Dash
    # releases; prefer `run` when it exists and fall back to `run_server`
    # on older installations that do not provide it.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)