# Principal component disaster

*Creator: Magnus Struckmann*
## Table of contents

- Notebook description
- Create data sets
- Visualize blob data
- Visualize elbow plots of explained variance from PCA
- Visualize PCA transformation fitted on training data
- Visualize PCA transformation applied on testing data
- Visualize decision boundaries and training data
- Visualize decision boundaries and test data
- Visualize surface probability and test data

### Notebook description

To analyze the consequences of changing the distance between blob data centers, 30 different data sets are created.
Only the distance between the centers of the Gaussian distributed blobs is changed while the standard deviation and the random initialization parameter are kept constant.

Each data set contains 1000 data points that are equally distributed among the four data classes.
A principal component analysis is performed on each data set and a logistic regression classifier fitted to its output.
All results are saved to *df_collection*, a list that holds one dictionary of results per data set.

The different data sets, PCA results and logistic regression predictions are visualized with Plotly to draw conclusions about the consequences of changing the distance between blob data centers.

In [None]:
# Import packages
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objects as go
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

### Create data sets

In [None]:
# Collect one result record (a dict) per distance step.
df_collection = []

# Sweep the blob-centre distance from 0 up to (but excluding) 3 in steps of 0.1.
for step in np.arange(0, 3, 0.1):

    # Four blob centres that move apart as the distance grows
    distance = step
    centers = [
        [distance, -distance, -distance],
        [-distance, distance, -distance],
        [-distance, distance, distance],
        [distance, -distance, distance],
    ]
    X, y = datasets.make_blobs(
        n_samples=1000,
        n_features=3,
        centers=centers,
        cluster_std=1,
        random_state=42,
    )

    # Raw features and cluster labels side by side, used for the 3D scatter plot
    df = pd.DataFrame(
        np.column_stack((X, y)),
        columns=['Feature 1', 'Feature 2', 'Feature 3', 'Cluster'],
    )

    # Stratified 70/30 train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=0
    )

    # The scaler is fitted on the training data only and then applied to the test data
    sc = StandardScaler()
    X_train_std = sc.fit_transform(X_train)
    X_test_std = sc.transform(X_test)

    # Explained variance ratio of all three components (input for the elbow plots)
    pca_var = PCA(n_components=3).fit(X_train_std).explained_variance_ratio_

    # Two-component projection that feeds the classifier
    pca = PCA(n_components=2)
    pca_X_train = pca.fit_transform(X_train_std)
    pca_X_test = pca.transform(X_test_std)
    pca_X = np.concatenate((pca_X_train, pca_X_test), axis=0)

    # Logistic regression on the projected training data
    clf = LogisticRegression()
    clf.fit(pca_X_train, y_train)

    # Mesh over the projected feature space: min/max of training and test data,
    # padded by 1 on every side
    h = .02  # step size in the mesh
    x_min, x_max = pca_X[:, 0].min() - 1, pca_X[:, 0].max() + 1
    y_min, y_max = pca_X[:, 1].min() - 1, pca_X[:, 1].max() + 1
    x_ = np.arange(x_min, x_max, h)
    y_ = np.arange(y_min, y_max, h)
    xx, yy = np.meshgrid(x_, y_)
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # Predicted class and first-column class probability at every mesh point
    Z_class_prediction = clf.predict(grid_points).reshape(xx.shape)
    Z_probability_label1 = clf.predict_proba(grid_points)[:, 0].reshape(xx.shape)

    df_collection.append({
        'distance': round(step, 1),
        'data': df,
        'pca_var': pca_var,
        'pca_X_train': pca_X_train,
        'pca_X_test': pca_X_test,
        'y_train': y_train,
        'y_test': y_test,
        'xx_meshgrid': xx,
        'y_meshgrid': y_,
        'Z_class_prediction': Z_class_prediction,
        'Z_probability_label1': Z_probability_label1,
    })

### Visualize blob data

In [None]:
# One hidden 3D scatter trace per distance step; the slider toggles which one is shown.
fig = go.Figure()

for step_dict in df_collection:
    blobs = step_dict['data']
    fig.add_trace(
        go.Scatter3d(
            x=blobs['Feature 1'],
            y=blobs['Feature 2'],
            z=blobs['Feature 3'],
            mode='markers',
            marker={'size': 12, 'color': blobs['Cluster']},
            visible=False,
        )
    )

# Show the trace at index 10 when the figure is first rendered
fig.data[10].visible = True

In [None]:
# Create and add slider
def create_sliders(df_collection, active=10, traces_per_step=1):
    """Build the Plotly slider definition that shows one distance step at a time.

    Every slider step issues a "restyle" of the "visible" attribute that hides
    all traces except the ones belonging to that step.

    Args:
        df_collection: Sequence of per-step dictionaries. Each one must provide
            a 'distance' entry, which is used (as a string) for the step label.
        active: Index of the slider step that is selected initially.
        traces_per_step: Number of consecutive figure traces that belong to
            each step. The visibility mask has
            ``len(df_collection) * traces_per_step`` entries, ordered by step.

    Returns:
        A one-element list holding the slider dictionary, ready to be passed
        to ``fig.update_layout(sliders=...)``.
    """
    n_traces = len(df_collection) * traces_per_step

    steps = []
    for i, step_dict in enumerate(df_collection):
        # Fresh mask per step: everything hidden except this step's traces
        visible = [False] * n_traces
        first = i * traces_per_step
        visible[first:first + traces_per_step] = [True] * traces_per_step

        steps.append(dict(
            method="restyle",
            args=["visible", visible],
            label=str(step_dict['distance']),
        ))

    return [dict(
        active=active,
        currentvalue={"prefix": "Distance: "},
        pad={"t": 50},
        steps=steps,
    )]

In [None]:
# Attach the distance slider, label the three feature axes and tighten the margins
fig.update_layout(
    sliders=create_sliders(df_collection),
    scene={
        'xaxis_title': 'Feature 1',
        'yaxis_title': 'Feature 2',
        'zaxis_title': 'Feature 3',
    },
    margin={'r': 20, 'b': 10, 'l': 10, 't': 10},
)

# Render the figure
fig.show()

### Visualize elbow plots of explained variance from PCA

In [None]:
# One hidden bar trace per distance step showing the explained variance ratios.
fig = go.Figure()

# The x-axis categories are the same for every step
components = ['Component 1', 'Component 2', 'Component 3']

for step_dict in df_collection:
    fig.add_trace(
        go.Bar(
            x=components,
            y=step_dict['pca_var'],
            visible=False,
        )
    )

# Show the trace at index 10 when the figure is first rendered
fig.data[10].visible = True

In [None]:
# Attach the distance slider and label the y-axis
fig.update_layout(
    sliders=create_sliders(df_collection),
    yaxis={'title': 'Explained variance ratio'},
)

# Render the figure
fig.show()

### Visualize PCA transformation fitted on training data

In [None]:
# One hidden 2D scatter trace per distance step: projected training data coloured by label.
fig = go.Figure()

for step_dict in df_collection:
    projected = step_dict['pca_X_train']
    labels = step_dict['y_train']
    fig.add_trace(
        go.Scatter(
            x=projected[:, 0],
            y=projected[:, 1],
            mode='markers',
            marker={'size': 12, 'color': labels},
            visible=False,
        )
    )

# Show the trace at index 10 when the figure is first rendered
fig.data[10].visible = True

In [None]:
# Attach the distance slider and label both principal-component axes
fig.update_layout(
    sliders=create_sliders(df_collection),
    xaxis={'title': 'Component 1'},
    yaxis={'title': 'Component 2'},
)

# Render the figure
fig.show()

### Visualize PCA transformation applied on testing data

In [None]:
# One hidden 2D scatter trace per distance step: projected test data coloured by label.
fig = go.Figure()

for step_dict in df_collection:
    projected = step_dict['pca_X_test']
    labels = step_dict['y_test']
    fig.add_trace(
        go.Scatter(
            x=projected[:, 0],
            y=projected[:, 1],
            mode='markers',
            marker={'size': 12, 'color': labels},
            visible=False,
        )
    )

# Show the trace at index 10 when the figure is first rendered
fig.data[10].visible = True

In [None]:
# Attach the distance slider and label both principal-component axes
fig.update_layout(
    sliders=create_sliders(df_collection),
    xaxis={'title': 'Component 1'},
    yaxis={'title': 'Component 2'},
)

# Render the figure
fig.show()

### Visualize decision boundaries and training data

In [None]:
# Decision regions of the classifier with the projected training data on top.
fig = go.Figure()

# Add two traces per slider step: first the heatmap, then the scatter points
for step_dict in df_collection:
    X = step_dict['pca_X_train']
    y = step_dict['y_train']
    xx = step_dict['xx_meshgrid']
    y_ = step_dict['y_meshgrid']
    Z = step_dict['Z_class_prediction']

    # Predicted class over the mesh; the first mesh row holds the x coordinates
    trace = go.Heatmap(x=xx[0], y=y_, z=Z,
                       showscale=True,
                       visible=False)

    fig.add_trace(trace)

    # Training points coloured by their true label
    trace = go.Scatter(x=X[:, 0], y=X[:, 1],
                       mode='markers',
                       showlegend=False,
                       marker=dict(size=10, color=y, line=dict(color='black', width=1)),
                       visible=False)

    fig.add_trace(trace)

# Show slider step 10 initially. With two traces per step, step i owns traces
# 2*i (heatmap) and 2*i + 1 (scatter), so step 10 is traces 20 and 21.
# (Indices 19 and 20 would pair the scatter of step 9 with the heatmap of step 10.)
fig.data[20].visible = True
fig.data[21].visible = True

In [None]:
# Create and add slider (two traces per plot)
def create_sliders_2traces(df_collection, active=10):
    """Build the Plotly slider definition for figures with two traces per step.

    Every slider step issues a "restyle" of the "visible" attribute that hides
    all traces except the pair at positions ``2*i`` and ``2*i + 1``.

    Args:
        df_collection: Sequence of per-step dictionaries. Each one must provide
            a 'distance' entry, which is used (as a string) for the step label.
        active: Index of the slider step that is selected initially.

    Returns:
        A one-element list holding the slider dictionary, ready to be passed
        to ``fig.update_layout(sliders=...)``.
    """
    n_traces = 2 * len(df_collection)

    steps = []
    for i, step_dict in enumerate(df_collection):
        # Fresh mask per step: everything hidden except this step's two traces
        visible = [False] * n_traces
        visible[2 * i] = True
        visible[2 * i + 1] = True

        steps.append(dict(
            method="restyle",
            args=["visible", visible],
            label=str(step_dict['distance']),
        ))

    return [dict(
        active=active,
        currentvalue={"prefix": "Distance: "},
        pad={"t": 50},
        steps=steps,
    )]

In [None]:
# Attach the two-traces-per-step slider and label both principal-component axes
fig.update_layout(
    sliders=create_sliders_2traces(df_collection),
    xaxis={'title': 'Component 1'},
    yaxis={'title': 'Component 2'},
)

# Render the figure
fig.show()

### Visualize decision boundaries and test data

In [None]:
# Decision regions of the classifier with the projected test data on top.
fig = go.Figure()

# Add two traces per slider step: first the heatmap, then the scatter points
for step_dict in df_collection:
    X = step_dict['pca_X_test']
    y = step_dict['y_test']
    xx = step_dict['xx_meshgrid']
    y_ = step_dict['y_meshgrid']
    Z = step_dict['Z_class_prediction']

    # Predicted class over the mesh; the first mesh row holds the x coordinates
    trace = go.Heatmap(x=xx[0], y=y_, z=Z,
                       showscale=True,
                       visible=False)

    fig.add_trace(trace)

    # Test points coloured by their true label
    trace = go.Scatter(x=X[:, 0], y=X[:, 1],
                       mode='markers',
                       showlegend=False,
                       marker=dict(size=10, color=y, line=dict(color='black', width=1)),
                       visible=False)

    fig.add_trace(trace)

# Show slider step 10 initially. With two traces per step, step i owns traces
# 2*i (heatmap) and 2*i + 1 (scatter), so step 10 is traces 20 and 21.
# (Indices 19 and 20 would pair the scatter of step 9 with the heatmap of step 10.)
fig.data[20].visible = True
fig.data[21].visible = True

In [None]:
# Attach the two-traces-per-step slider and label both principal-component axes
fig.update_layout(
    sliders=create_sliders_2traces(df_collection),
    xaxis={'title': 'Component 1'},
    yaxis={'title': 'Component 2'},
)

# Render the figure
fig.show()

### Visualize surface probability and test data

In [None]:
# Class-probability surface of the classifier with the projected test data on top.
fig = go.Figure()

# Add two traces per slider step: first the heatmap, then the scatter points
for step_dict in df_collection:
    X = step_dict['pca_X_test']
    y = step_dict['y_test']
    xx = step_dict['xx_meshgrid']
    y_ = step_dict['y_meshgrid']
    Z = step_dict['Z_probability_label1']

    # Stored class probability over the mesh; the first mesh row holds the x coordinates
    trace = go.Heatmap(x=xx[0], y=y_, z=Z,
                       showscale=True,
                       visible=False)

    fig.add_trace(trace)

    # Test points coloured by their true label
    trace = go.Scatter(x=X[:, 0], y=X[:, 1],
                       mode='markers',
                       showlegend=False,
                       marker=dict(size=10, color=y, line=dict(color='black', width=1)),
                       visible=False)

    fig.add_trace(trace)

# Show slider step 10 initially. With two traces per step, step i owns traces
# 2*i (heatmap) and 2*i + 1 (scatter), so step 10 is traces 20 and 21.
# (Indices 19 and 20 would pair the scatter of step 9 with the heatmap of step 10.)
fig.data[20].visible = True
fig.data[21].visible = True

In [None]:
# Attach the two-traces-per-step slider and label both principal-component axes
fig.update_layout(
    sliders=create_sliders_2traces(df_collection),
    xaxis={'title': 'Component 1'},
    yaxis={'title': 'Component 2'},
)

# Render the figure
fig.show()