In [11]:
import pandas as pd
import plotly.express as px
from dash import Input, Output, dcc, html
from jupyter_dash import JupyterDash
from scipy.stats.mstats import trimmed_var
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [12]:
def wrangle(filepath):
    """Read a CSV file and return only the rows that pass two filters.

    Parameters
    ----------
    filepath : str
        Location of the CSV file to load with ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        Rows where ``TURNFEAR`` equals 1 and ``NETWORTH`` is below the
        threshold. The original row index is preserved.
    """
    raw = pd.read_csv(filepath)
    # NOTE(review): 10e6 == 1e7, so this cap is 20 million, not 2 million.
    # Confirm whether 2e6 was intended before changing it.
    keep = (raw["TURNFEAR"] == 1) & (raw["NETWORTH"] < 2 * 10e6)
    return raw.loc[keep]

In [13]:
# Load the filtered survey data into the module-level `df` that the helper
# functions below read, then show its dimensions and first rows.
df = wrangle("SCFP2019.csv")
print(df.shape)
df.head()

(4573, 351)


Unnamed: 0,YY1,Y1,WGT,HHSEX,AGE,AGECL,EDUC,EDCL,MARRIED,KIDS,...,NWCAT,INCCAT,ASSETCAT,NINCCAT,NINC2CAT,NWPCTLECAT,INCPCTLECAT,NINCPCTLECAT,INCQRTCAT,NINCQRTCAT
5,2,21,3790.476607,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
6,2,22,3798.868505,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,3,2,2
7,2,23,3799.468393,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
8,2,24,3788.076005,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2
9,2,25,3793.066589,1,50,3,8,2,1,3,...,1,2,1,2,1,1,4,4,2,2


In [14]:
# Create the Dash application object; the layout and the @app.callback
# functions in the following cells are registered on it.
app = JupyterDash(__name__)

In [22]:
# Page layout: title, variance bar chart with trim toggle, k slider,
# clustering metrics, and the PCA scatter plot.
#
# The bar chart is deliberately created without an initial `figure`. Dash
# runs every callback once when the page loads, so the callback that outputs
# to "bar_chart" fills it in. Calling `serve_bar_chart()` here would make this
# cell depend on a function defined in a later cell and raise NameError on a
# fresh-kernel "Run All".
app.layout = html.Div(
    [
        # Application title
        html.H1("Survey of Consumer Finances"),
        # Bar chart section
        html.H2("High Variance Features"),
        # Graph populated by the "bar_chart" callback
        dcc.Graph(id = "bar_chart"),
        dcc.RadioItems(
            options = [
                {"label": "Trimmed", "value": True},
                {"label": "Not Trimmed", "value": False}
            ],
            value = True,
            id = "trim-button"
        ),
        # Slider controlling the number of clusters
        html.H2("K-means Clustering"),
        html.H3("Number of Clusters (k)"),
        dcc.Slider(min = 2, max = 12, step = 1, value = 2, id = "k-slider"),
        # Metrics populated by the "metrics" callback
        html.Div(id = "metrics"),
        # PCA plot populated by the "pca-scatter" callback
        dcc.Graph(id = "pca-scatter")
    ]
)

In [16]:
def get_high_var_features(trimmed = True, return_feat_names = True):
    """Return the five columns of ``df`` with the largest variance.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, rank columns by ``trimmed_var`` (called with its default
        limits); otherwise rank them by ``DataFrame.var``.

    return_feat_names : bool, default=True
        If ``True``, return only the column names as a list. If ``False``,
        return the Series of variances indexed by column name.

    Returns
    -------
    list or pd.Series
        The five highest-variance columns, in ascending order of variance.
    """
    if trimmed:
        variances = df.apply(trimmed_var)
    else:
        variances = df.var()

    # Ascending sort, so the last five entries are the largest variances.
    largest = variances.sort_values().tail(5)

    return largest.index.to_list() if return_feat_names else largest

In [17]:
@app.callback(
    Output("bar_chart", "figure"),
    Input("trim-button", "value"),
)
def serve_bar_chart(trimmed = True):
    """Build a horizontal bar chart of the five highest-variance features.

    Parameters
    ----------
    trimmed : bool, default=True
        Passed through to ``get_high_var_features`` to choose between trimmed
        and untrimmed variance.

    Returns
    -------
    Figure returned by ``px.bar``, with variance on the x-axis and feature
    name on the y-axis.
    """
    variances = get_high_var_features(trimmed = trimmed, return_feat_names = False)

    bar = px.bar(x = variances, y = variances.index, orientation = "h")
    bar.update_layout(xaxis_title = "Variance", yaxis_title = "Feature")
    return bar


In [18]:
def get_model_metrics(trimmed = True, k = 2, return_metrics = False):
    """Fit a ``StandardScaler`` + ``KMeans`` pipeline on the high-variance features.

    Parameters
    ----------
    trimmed : bool, default=True
        Passed to ``get_high_var_features`` to choose between trimmed and
        untrimmed variance when selecting features.

    k : int, default=2
        Number of clusters.

    return_metrics : bool, default=False
        If ``False``, return the fitted pipeline. If ``True``, return a
        dictionary of evaluation metrics instead.

    Returns
    -------
    sklearn.pipeline.Pipeline or dict
        The fitted pipeline, or ``{"inertia": int, "silhouette": float}`` with
        inertia rounded to the nearest integer and the silhouette score
        rounded to three decimals.
    """
    # Feature matrix built from the five highest-variance columns.
    features = get_high_var_features(trimmed = trimmed, return_feat_names = True)
    X = df[features]

    model = make_pipeline(
        StandardScaler(),
        KMeans(n_clusters = k, random_state = 7)
    )
    model.fit(X)

    if not return_metrics:
        return model

    kmeans = model.named_steps["kmeans"]
    # K-means was fitted on the standardized features, so the silhouette score
    # must use that same matrix. Scoring the raw X would measure distances in
    # a different space from the clustering and from inertia.
    X_scaled = model.named_steps["standardscaler"].transform(X)

    return {
        "inertia": round(kmeans.inertia_),
        "silhouette": round(silhouette_score(X_scaled, kmeans.labels_), 3)
    }

In [19]:
@app.callback(
    Output("metrics", "children"),
    Input("trim-button", "value"),
    Input("k-slider", "value"),
)
def serve_metrics(trimmed = True, k = 2):
    """Render the clustering metrics as a list of heading elements.

    Parameters
    ----------
    trimmed : bool, default=True
        Passed to ``get_model_metrics`` to choose trimmed or untrimmed variance.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    list
        Two ``html.H3`` elements: one for inertia, one for silhouette score.
    """
    scores = get_model_metrics(trimmed = trimmed, k = k, return_metrics = True)

    return [
        html.H3(f"inertia: {scores['inertia']}"),
        html.H3(f"Silhouette Score: {scores['silhouette']}"),
    ]
    

In [20]:
def get_pca_labels(trimmed = True, k = 2):
    """Project the high-variance features onto two principal components and attach cluster labels.

    Parameters
    ----------
    trimmed : bool, default=True
        Passed to ``get_high_var_features`` and ``get_model_metrics`` to choose
        trimmed or untrimmed variance.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    pd.DataFrame
        Columns ``"PC1"``, ``"PC2"`` and ``"labels"`` (cluster label as a
        string), with rows ordered by numeric cluster label.
    """
    # Feature matrix
    features = get_high_var_features(trimmed = trimmed, return_feat_names = True)
    X = df[features]

    # NOTE(review): PCA is fitted on the unscaled features, while the clustering
    # pipeline standardizes them first. Confirm this is intentional.
    transformer = PCA(n_components = 2, random_state = 7)
    X_pca = pd.DataFrame(transformer.fit_transform(X), columns = ["PC1", "PC2"])

    model = get_model_metrics(trimmed = trimmed, k = k, return_metrics = False)

    # Sort while the labels are still integers, then cast to string. Sorting
    # the strings would order them "0", "1", "10", "11", "2", ... once k
    # exceeds 10, and that row order drives the legend order of the plot.
    X_pca["labels"] = model.named_steps["kmeans"].labels_
    X_pca = X_pca.sort_values(by = "labels")
    X_pca["labels"] = X_pca["labels"].astype(str)
    return X_pca

In [21]:
@app.callback(
    Output("pca-scatter", "figure"),
    Input("trim-button", "value"),
    Input("k-slider", "value"),
)
def serve_scatter_plot(trimmed = True, k = 2):
    """Build 2D scatter plot of ``df`` with ``KMeans`` labels.

    Parameters
    ----------
    trimmed : bool, default=True
        If ``True``, calculates trimmed variance, removing bottom and top 10%
        of observations.

    k : int, default=2
        Number of clusters.

    Returns
    -------
    Figure returned by ``px.scatter``, plotting ``PC1`` against ``PC2`` and
    colored by cluster label.
    """
    projected = get_pca_labels(trimmed = trimmed, k = k)

    scatter = px.scatter(
        projected,
        x = "PC1",
        y = "PC2",
        color = "labels",
        title = "PCA representation of clusters",
    )
    scatter.update_layout(xaxis_title = "PC1", yaxis_title = "PC2")
    return scatter

In [23]:
# Start the Dash server using JupyterDash's "inline" display mode.
app.run_server(mode = "inline")