# Demo: Principal Component Analysis (PCA)

In [None]:
# Required imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Load the Data Into a Pandas DataFrame

In [None]:
# Load the transformed credit card data from CSV into a Pandas DataFrame
ccinfo_default_df = pd.read_csv(Path("../Resources/ccinfo_transformed.csv"))

# Preview the first five rows of the loaded DataFrame
ccinfo_default_df.head()

In [None]:
# Scatter plot of "age" against "limit_bal", grouped by the "customer_segments" column
ccinfo_default_df.hvplot.scatter(x="limit_bal", y="age", by="customer_segments")

In [None]:
# Scatter plot of "pay_amt" against "bill_amt", grouped by the "customer_segments" column
ccinfo_default_df.hvplot.scatter(x="bill_amt", y="pay_amt", by="customer_segments")

## Use PCA to reduce the number of factors 

In [None]:
# Import the PCA module


In [None]:
# Instantiate the PCA instance and declare the number of PCA variables


In [None]:
# Fit the PCA model on the transformed credit card DataFrame


# Review the first 5 rows of the PCA data


## PCA explained variance ratio

In [None]:
# Calculate the PCA explained variance ratio


## Creating the PCA DataFrame

In [None]:
# Create the PCA DataFrame


# Review the PCA DataFrame


## Incorporating the PCA DataFrame into the elbow method

In [None]:
# Candidate cluster counts (1 through 10) and a list to collect the inertia for each
k = list(range(1, 11))
inertia = []

# Fit one KMeans model per candidate k on the PCA data and record the value
# of the fitted model's `inertia_` attribute
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1).fit(ccinfo_pca_df)
    inertia.append(k_model.inertia_)

# Pair each k with its inertia in a DataFrame so the elbow curve can be plotted
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

In [None]:
# Line plot of inertia against k (the elbow curve), with one x tick per k value
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

## Segmentation of the PCA data with KMeans

In [None]:
# Build a 3-cluster KMeans model and fit it to the PCA data
model = KMeans(n_clusters=3, random_state=0).fit(ccinfo_pca_df)

# Predict the cluster label for every row of the PCA data
k_3 = model.predict(ccinfo_pca_df)

# Build a new DataFrame from the PCA data with the labels attached as the
# "customer_segments" column; `assign` returns a new object, so
# ccinfo_pca_df itself is left unchanged
ccinfo_pca_predictions_df = ccinfo_pca_df.assign(customer_segments=k_3)

In [None]:
# Scatter plot of "PCA2" against "PCA1", grouped by the "customer_segments" column
ccinfo_pca_predictions_df.hvplot.scatter(x="PCA1", y="PCA2", by="customer_segments")