In [1]:
import pandas as pd
import zipfile
from dash import Dash
import plotly.express as px
from dash import Input, Output, dcc, html
from jupyter_dash import JupyterDash
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [4]:
def wrangle(zip_file_path):

    """Read SCF data file into ``DataFrame``.

    Returns only credit fearful individuals .

    Parameters
    ----------
    filepath : str
        Location of CSV file.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as z:
        with z.open(z.namelist()[0]) as f:
            df = pd.read_csv(f)
    return df

In [5]:
df = wrangle('scfp2022excel.zip')

In [6]:
mask = df["TURNFEAR"] == 1
df_fear = df[mask]

In [7]:
print(df_fear.shape)
df_fear.head()

(3839, 357)


Unnamed: 0,YY1,Y1,WGT,HHSEX,AGE,AGECL,EDUC,EDCL,MARRIED,KIDS,...,NWCAT,INCCAT,ASSETCAT,NINCCAT,NINC2CAT,NWPCTLECAT,INCPCTLECAT,NINCPCTLECAT,INCQRTCAT,NINCQRTCAT
20,5,51,7191.481109,2,19,1,8,2,1,0,...,1,3,1,2,1,2,5,4,2,2
21,5,52,7352.487205,2,19,1,8,2,1,0,...,1,3,1,2,1,2,5,4,2,2
22,5,53,7270.703541,2,19,1,8,2,1,0,...,1,3,1,3,1,2,5,5,2,2
23,5,54,7383.866597,2,19,1,8,2,1,0,...,1,3,1,3,1,2,5,5,2,2
24,5,55,7330.537669,2,19,1,8,2,1,0,...,1,3,1,2,1,2,5,4,2,2


# Build Dashboard

In [8]:
# Instantiate a JupyterDash
app = JupyterDash(__name__)

print("app type:", type(app))

app type: <class 'jupyter_dash.jupyter_app.JupyterDash'>



JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



In [9]:
app.layout = html.Div(
    [
        #application title
        html.H1("Survey of Consumer Finances"),
        #bar chart
        html.H2("High Variance Features"),
        #placeholder for variance graph/chart
        dcc.Graph(id="bar-chart"),
        #Radio button to switch from trimmed to non trimmed
        dcc.RadioItems(
            options=[
                {"label": "Trimmed", "value": True},
                {"label": "Not-Trimmed", "value": False}
            ],
            value=True,
            id="trim-button"
        ),
        # K-Means slider
        html.H2("K-Means Clustering"),
        html.H3("Number of Clusters (K)"),
        dcc.Slider(min=2, max=10, step=1, value=2, id="K-slider"),
        #add a section
        html.Div(id="metrics"),
        #placeholder for pca graph/chart
        dcc.Graph(id="pca-scatter")
    ]
)

## Variance bar chart

In [10]:
def get_high_var_features(trimmed=True, return_feat_name=True):
    #calculate variance
    if trimmed:
        top_five_features=(
            df_fear.apply(trimmed_var).sort_values().tail(5)
        )
    else:
        top_five_features= df_fear.var().sort_values().tail(5)
     #extract names
    if return_feat_name:
        top_five_features= top_five_features.index.tolist()    
        
    return top_five_features

In [None]:
get_high_var_features(trimmed=True, return_feat_name=True)

In [11]:
#callback for high variance
@app.callback(
    Output("bar-chart", "figure"), 
    Input("trim-button", "value")
)
def serve_bar_chart(trimmed=True):
    #Get features
    top_five_features = get_high_var_features(trimmed=trimmed, return_feat_name=False)
    #build bar chart
    fig= px.bar(x= (top_five_features)/1e6, y=top_five_features.index, orientation="h")
    fig.update_layout(xaxis_title="variance", yaxis_title="features")
    return fig

## K-means Slider and Metrics

In [12]:
def get_model_metrics(trimmed= True, K=2, return_metrics=False):
    #get high var features
    features = get_high_var_features(trimmed= trimmed, return_feat_name=True)
    #create features
    X = df_fear[features]
    
    #build model
    model = make_pipeline(
        StandardScaler(),
        KMeans(n_clusters=K, random_state=42)
    )
    model.fit(X)
    if return_metrics:
        #calculate inertia
        i = model.named_steps["kmeans"].inertia_
        #calculate silhouette score
        ss = silhouette_score(X, model.named_steps["kmeans"].labels_)
        
        #put result in a dictionary
        metrics={
            "inertia": round(i),
            "sillouette": round(ss, 3)
        }
        #return to the user
        return metrics
    
    return model

In [None]:
get_model_metrics(trimmed=True, K=5, return_metrics=True)

In [13]:
#callback for k-means
@app.callback(
    Output("metrics", "children"),
    Input("trim-button", "value"),
    Input("K-slider", "value")
)
def serve_metrics(trimmed=True, K=2):
    #get metrics
    metrics = get_model_metrics(trimmed=trimmed, K=K, return_metrics=True)
    #add metrics to html element
    text = [
        html.H3(f"inertia: {metrics['inertia']}"),
        html.H3(f"silhouette score: {metrics['sillouette']}")
    ]
    return text

In [None]:
serve_metrics()

## PCA Scatter Plot

In [14]:
def get_pca_labels(trimmed=True, K=2):
    #create feature matrics
    features= get_high_var_features(trimmed=trimmed, return_feat_name=True)
    X = df_fear[features]
    
    #transformer
    transformer = PCA(n_components=2, random_state=42)
    x_t = transformer.fit_transform(X)
    X_pca = pd.DataFrame(x_t, columns=["PC1", "PC2"])
    
    #add labels
    model = get_model_metrics(trimmed=trimmed, K=K, return_metrics=False)
    X_pca["labels"]= model.named_steps["kmeans"].labels_.astype(str)
    X_pca.sort_values("labels", inplace=True)
    return X_pca

In [None]:
get_pca_labels().tail()

In [15]:
#callback for pca scatter plot
@app.callback(
    Output("pca-scatter", "figure"),
    Input("trim-button", "value"),
    Input("K-slider", "value")
)

def serve_scatter_plot(trimmed=True, K=2):
    fig=px.scatter(
        data_frame=get_pca_labels(trimmed=trimmed, K=K),
        x="PC1",
        y="PC2",
        color="labels",
        title="PCA representation of clusters"
    )
    fig.update_layout(xaxis_title="PC1", yaxis_title="PC2")
    return fig

In [16]:
if __name__ == '__main__':
    app.run_server(debug=True)

Dash app running on http://127.0.0.1:8050/
