In [2]:
# Import the libraries and dependencies:
import pandas as pd
from pathlib import Path
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
# APPLYING THE PCA TECHNIQUE
# Usually in finance, we deal with datasets containing several columns that describe different features of the data.
# Dealing with too many features can slow down an algorithm's execution, especially when we work with large amounts of data.
# PRINCIPAL COMPONENT ANALYSIS (PCA) is a statistical technique that we use to speed up machine learning algorithms when too many features, or dimensions, exist.
# Depending on the analysis, "too many" can be ten, a hundred, or even thousands.
# PCA reduces the number of dimensions by transforming a large set of features into a smaller one that contains most of the information in the original, large set.
# This technique increases interpretability and minimizes information loss.

In [4]:
# NORMALIZE AND TRANSFORM THE DATA
# Before we start applying PCA, we need to normalize and transform our data into numerical values.
# As mentioned earlier, we can do this manually or by using `StandardScaler` and `get_dummies` as needed.
# For this example, we'll use the preprocessed shopping data that we created earlier.
# First, we'll load that data into a DataFrame:
# Read the preprocessed shopping data from the CSV file into a DataFrame:
shopping_csv_path = Path('shopping_data_transformed.csv')
df_shopping = pd.read_csv(shopping_csv_path)

# Preview both ends of the DataFrame (the first and last 5 rows):
for preview in (df_shopping.head(), df_shopping.tail()):
    display(preview)

Unnamed: 0,Age,Annual Income,Spending Score,Credit,Debit
0,-1.424569,-1.738999,-0.434801,1,0
1,-1.281035,-1.738999,1.195704,1,0
2,-1.352802,-1.70083,-1.715913,0,1
3,-1.137502,-1.70083,1.040418,0,1
4,-0.563369,-1.66266,-0.39598,0,1


Unnamed: 0,Age,Annual Income,Spending Score,Credit,Debit
195,-0.276302,2.268791,1.118061,0,1
196,0.441365,2.497807,-0.861839,0,1
197,-0.491602,2.497807,0.923953,1,0
198,-0.491602,2.917671,-1.250054,1,0
199,-0.635135,2.917671,1.273347,1,0


In [5]:
# As you can see, this DataFrame has five features:
    # 1. Age
    # 2. Annual Income
    # 3. Spending Score
    # 4. Credit
    # 5. Debit
# With PCA, we can reduce the number of features to increase the interpretability of any plots that we create.
# We can also potentially reduce the processing time of a machine learning algorithm.

In [6]:
# REDUCE THE NUMBER OF FEATURES
# For this example, we want to reduce the number of features for two purposes:
    # 1. To optimize the identification of clusters when we use the K-means algorithm.
    # 2. To ease the visualization of the clusters.
# Let's use PCA to reduce the number of features from five to two.
# First, we need to create a `PCA` model instance and pass it a parameter that specifies the number of components that we want to keep.
# We'll use `n_components=2` as the parameter:
# Build the PCA model, asking it to keep two components:
n_principal_components = 2
pca = PCA(n_components=n_principal_components)

In [8]:
# APPLY DIMENSIONALITY REDUCTION
# After creating the PCA model, we apply dimensionality reduction on the preprocessed dataset.
# DIMENSIONALITY REDUCTION means reducing the number of columns in a DataFrame yet preserving as much useful information as possible from all the original columns.

# Fit the PCA model on the shopping data and transform it in one step,
# reducing the five features to two:
shopping_pca_data = pca.fit_transform(df_shopping)

# Print the first five transformed rows as a sample:
pca_preview_rows = shopping_pca_data[:5]
print(pca_preview_rows)

[[-0.57572361 -1.68487363]
 [-1.62359494 -1.72648138]
 [ 0.27961435 -1.72531425]
 [-1.51106396 -1.79620462]
 [-0.09546842 -1.7073466 ]]


In [9]:
# MEASURE THE AMOUNT OF VARIANCE IN THE PRINCIPAL COMPONENTS
# After dimensionality reduction, we have a smaller set of dimensions called PRINCIPAL COMPONENTS.
# Each principal component is a linear combination of the original features, so it has no particular meaning assigned to it.
# With these new components, we were able to reduce the number of variables of the dataset while preserving as much of the information from the original dataset as possible.
# We now have two variables, or principal components.

# NOTE:
# Dimensionality reduction implies a loss of accuracy.
# However, the trick is to sacrifice a bit of accuracy for simplicity.
# We can more easily explore and visualize smaller datasets.
# They ease data analysis and speed up machine learning algorithms without the extra variables in the process.

# Data has variability.
# And because data ranges up and down, that variation itself contains useful information.
# We can find out how much of that information each principal component contains by examining the explained variance of a principal component.
# EXPLAINED VARIANCE is the amount of variability in the data that the PCA model has condensed into a single principal component.
# We can measure the relative amount of information that one principal component contains compared to another by examining the proportion of explained variance.
# To do that, we get the `explained_variance_ratio_` attribute from the `PCA` model:

# Look up the proportion of explained variance for each principal component
# and show it as the cell output:
explained_variance_ratios = pca.explained_variance_ratio_
explained_variance_ratios

array([0.38132566, 0.28707805])

In [10]:
# After getting the `explained_variance_ratio_` attribute, we can observe that the first principal component contains 38.13% of the variance.
# The second principal component contains 28.71% of the variance.
# Both components together thus retain about 66.84% of the variance (that is, the information) in the original data.

In [14]:
# USE THE PRINCIPAL COMPONENTS TO FIND CLUSTERS
# Once we have the principal components, we can create a DataFrame and use it to find clusters using the K-means algorithm.
# We start by converting the array of values resulting from our PCA into a DataFrame:
# Wrap the PCA output array in a DataFrame with one named column per component:
pca_column_names = ['PC1', 'PC2']
df_shopping_pca = pd.DataFrame(shopping_pca_data, columns=pca_column_names)

# Show the first rows as a sample:
df_shopping_pca.head()

Unnamed: 0,PC1,PC2
0,-0.575724,-1.684874
1,-1.623595,-1.726481
2,0.279614,-1.725314
3,-1.511064,-1.796205
4,-0.095468,-1.707347


In [15]:
# Now, we can call the K-Means algorithm to define our customer segments

# Initialize the K-means model with four clusters.
# K-means starts from randomly chosen centroids, so `random_state` is fixed to make
# the segment labels reproducible when the notebook is re-run from a fresh kernel.
# `n_init=10` pins the number of initializations explicitly instead of relying on
# the library default, which differs between scikit-learn versions.
model = KMeans(n_clusters=4, n_init=10, random_state=1)

# Fit the model on the two principal components:
model.fit(df_shopping_pca)

# Predict the cluster (customer segment) for each row:
customer_segments = model.predict(df_shopping_pca)

# Copy the PCA DataFrame so that adding the labels leaves `df_shopping_pca` unchanged:
df_shopping_pca_predictions = df_shopping_pca.copy()

# Create a new column in the copy with the predicted clusters:
df_shopping_pca_predictions['Customer Segments'] = customer_segments

# Display sample data:
df_shopping_pca_predictions.head()

Unnamed: 0,PC1,PC2,Customer Segments
0,-0.575724,-1.684874,2
1,-1.623595,-1.726481,2
2,0.279614,-1.725314,3
3,-1.511064,-1.796205,2
4,-0.095468,-1.707347,2


In [16]:
# The resulting output is a DataFrame with three columns:
    # 1. Values associated with the first principal component.
    # 2. Values associated with the second principal component.
    # 3. The customer segment numbers.
# Given that the `n_clusters` value was assigned as 4, the resulting customer segment numbers range from 0 to 3.
# Each row corresponds to a transaction in the original DataFrame.
# Note that we ran the K-means algorithm on the PCA data without any issues.
# The difference from the previous execution of the algorithm in Lesson 2, which used the full number of dimensions (or columns), is that this time, we used the principal components to find patterns among the data.

In [18]:
# VISUALIZE THE CLUSTERS
# Let's create a scatter plot to visualize the customer segments:

# Create a scatter plot to visualize the clusters.
# PC2 is plotted on the x-axis and PC1 on the y-axis, with one color per customer segment.
# The title lets the figure stand on its own when the notebook is skimmed.
df_shopping_pca_predictions.hvplot.scatter(
    x='PC2',
    y='PC1',
    by='Customer Segments',
    title='Customer Segments by Principal Component'
)

In [None]:
# Despite some accuracy loss because of dimensionality reduction, the plot still groups the four customer segments.
# In fact, they're a little easier to visualize this time, as compared to the plot generated from the K-means algorithm in Lesson 2.
# This is because we now have only two features.
# The first feature, or principal component (PC1), runs along the y-axis, and the second feature (PC2) runs along the x-axis.
# We just walked through an example of dimensionality reduction by applying a technique called PCA.
# This was useful because it allowed us to shrink the number of variables used in the K-means algorithm without sacrificing much in terms of accuracy.