In [9]:
# Import the required libraries and dependencies
import pandas as pd
from pathlib import Path
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


In [10]:
# Location of the transformed shopping dataset, relative to this notebook
file_path = Path("..//02-PCA/shopping_data_transformed.csv")

# Read the CSV into a DataFrame
df_shopping = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df_shopping.head(5)


Unnamed: 0,Age,Annual Income,Spending Score,Credit,Debit
0,-1.424569,-1.738999,-0.434801,1,0
1,-1.281035,-1.738999,1.195704,1,0
2,-1.352802,-1.70083,-1.715913,0,1
3,-1.137502,-1.70083,1.040418,0,1
4,-0.563369,-1.66266,-0.39598,0,1


In [11]:
# Instantiate a PCA estimator configured to keep two components.
# Nothing is fitted here; fitting happens when fit_transform is called on the data.
pca = PCA(n_components=2)


In [12]:
# Fit PCA on the shopping features and project them onto the two components
shopping_pca_data = pca.fit_transform(X=df_shopping)

# Show the first five projected rows (all component columns)
shopping_pca_data[:5, :]


array([[-0.57572361, -1.68487363],
       [-1.62359494, -1.72648138],
       [ 0.27961435, -1.72531425],
       [-1.51106396, -1.79620462],
       [-0.09546842, -1.7073466 ]])

In [13]:
# Retrieve the explained variance *ratio*: for each retained component, the
# fraction of the total variance in the input features that it captures.
# Summing the entries gives the share of variance kept after the reduction.
pca.explained_variance_ratio_


array([0.38132566, 0.28707805])

In [14]:
# Wrap the projected array in a DataFrame with one labeled column per component
df_shopping_pca = pd.DataFrame(data=shopping_pca_data, columns=["PC1", "PC2"])

# Preview the first rows
df_shopping_pca.head()


Unnamed: 0,PC1,PC2
0,-0.575724,-1.684874
1,-1.623595,-1.726481
2,0.279614,-1.725314
3,-1.511064,-1.796205
4,-0.095468,-1.707347


In [15]:
# Initialize the K-Means model with four clusters.
# random_state pins the centroid initialization so the segment labels are the
# same on every Restart & Run All; without it the label numbering (and possibly
# the clustering itself) changes between runs.
# n_init is set explicitly because its default differs across scikit-learn
# releases (and triggers a FutureWarning in some of them).
model = KMeans(n_clusters=4, n_init=10, random_state=1)

# Fit the model and get the cluster index of every row in a single call
customer_segments = model.fit_predict(df_shopping_pca)

# Work on a copy so the PCA DataFrame itself is left unmodified
df_shopping_pca_predictions = df_shopping_pca.copy()

# Store the predicted cluster of each customer in a new column
df_shopping_pca_predictions["Customer Segment"] = customer_segments

# Display sample data
df_shopping_pca_predictions.head()


Unnamed: 0,PC1,PC2,Customer Segment
0,-0.575724,-1.684874,3
1,-1.623595,-1.726481,3
2,0.279614,-1.725314,2
3,-1.511064,-1.796205,3
4,-0.095468,-1.707347,3


In [16]:
# Create a scatter plot of the PCA data, colored by the K-Means segment.
# PC1 goes on the horizontal axis and PC2 on the vertical axis, following the
# usual convention for principal-component plots; the title lets the figure
# stand on its own when the notebook is skimmed.
df_shopping_pca_predictions.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="Customer Segment",
    title="Customer segments in PCA space (K-Means, k=4)",
)
