## The Ultimate Step-by-Step Guide to Data Mining with PCA and KMeans

Pre-processing: Preparing your dataset for analysis.
Scaling: Why and how to scale your data.
Optimal PCA Components: Determining the right number of components.
Applying PCA: Transforming your data.
KMeans Clustering: Grouping the transformed data.
Analyzing PCA Loadings: Understanding what your components represent.
From PCA Space to Original Space: Interpreting the cluster centers.
Centroids and Means: Comparing cluster centers with the original data mean.
Deep Dive into Loadings: A closer look at the features influencing each principal component.

In [None]:
# Step 1: Pre-processing — Preparing Your Dataset for Analysis
# A. Cleaning the Data
# Impute missing values in numeric columns with that column's mean.
# `numeric_only=True` matters: the frame presumably still contains raw
# categorical columns here (they are encoded in the following cells), and
# pandas >= 2.0 raises a TypeError when asked to average non-numeric columns.
# NOTE: missing values in non-numeric columns are left untouched by this step.
df = df.fillna(df.mean(numeric_only=True))

# Remove columns that should not take part in the analysis.
df = df.drop(columns=['unnecessary_column'])

In [None]:
# B. Encoding Categorical Data
from sklearn.preprocessing import LabelEncoder

# Replace each distinct value of the column with an integer code. The fitted
# encoder is kept around so the codes can be mapped back to labels later.
label_encoder = LabelEncoder()
label_encoder.fit(df['category_column'])
df['category_column'] = label_encoder.transform(df['category_column'])

In [None]:
# `pd` is used below but is not imported anywhere earlier in this notebook.
import pandas as pd

# One-hot encode the nominal column: it is replaced by one indicator column
# per category. `dtype=float` keeps the frame purely numeric for the scaling
# and PCA steps (pandas >= 2.0 would otherwise emit boolean indicators).
df = pd.get_dummies(df, columns=['nominal_category_column'], dtype=float)

In [1]:
# Step 2: Scaling — Why and How to Scale Your Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column (StandardScaler defaults: remove the mean, scale
# to unit variance) so that no feature dominates PCA merely because of its
# units. `scaler` is reused later to map results back to the original units.
scaler = StandardScaler()
scaler.fit(df)
scaled_features = scaler.transform(df)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Alternative scaling: squeeze every column into MinMaxScaler's default
# [0, 1] range.
# The result is stored under its own name on purpose. The rest of the notebook
# feeds `scaled_features` to PCA/KMeans and later undoes the scaling with the
# StandardScaler instance `scaler`; overwriting `scaled_features` here would
# make that inverse transform use the wrong scaler and yield wrong centroids.
# To run the pipeline on min-max scaled data instead, assign this array to
# `scaled_features` AND use `min_max_scaler.inverse_transform` in Step 7.
min_max_scaler = MinMaxScaler()
min_max_scaled_features = min_max_scaler.fit_transform(df)

In [None]:
# Step 3: Optimal PCA Components — Determining the Right Number of Components

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np  # needed for np.cumsum below; not imported earlier in the notebook

# Fit PCA on the scaled data, keeping every component so the full variance
# profile can be inspected.
pca = PCA().fit(scaled_features)

# Cumulative share of variance explained by the first k components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = range(1, len(cumulative_variance) + 1)

# Plot the cumulative explained variance; the point where the curve flattens
# is a common choice for the number of components to keep.
plt.figure()
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Different Principal Components')
plt.show()

In [None]:
import numpy as np

# Share of total variance the retained components must explain. Adjust it, or
# replace the computation below with a number read off the plot above.
variance_threshold = 0.95

# `pca` is the full PCA fitted in the previous cell. The cumulative curve is
# non-decreasing, so searchsorted finds the first component count reaching
# the threshold; the min() guards against the curve ending a hair below the
# threshold because of floating-point rounding.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = int(np.searchsorted(cumulative_variance, variance_threshold)) + 1
n_components = min(n_components, len(cumulative_variance))

pca = PCA(n_components=n_components)

In [None]:
# Step 4: Applying PCA — Transforming Your Data

In [None]:
from sklearn.decomposition import PCA

# Re-create the PCA model with the number of components chosen above, fit it
# on the scaled data and project the data onto the retained components in a
# single call.
pca = PCA(n_components=n_components)
pca_result = pca.fit_transform(scaled_features)

# pca_result is a 2-D array with one row per sample and one column per
# retained principal component; column 0 is PC1, column 1 is PC2, and so on.

In [None]:
import matplotlib.pyplot as plt
# Registers the '3d' projection on old matplotlib versions; harmless on new ones.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# Each plot is only drawn when enough components were kept; indexing a
# missing column of pca_result would otherwise raise an IndexError.
n_plot_dims = pca_result.shape[1]

# 2D plot of the first two principal components
if n_plot_dims >= 2:
    plt.scatter(pca_result[:, 0], pca_result[:, 1])
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('PCA Results')
    plt.show()

# 3D plot of the first three principal components
if n_plot_dims >= 3:
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(pca_result[:, 0], pca_result[:, 1], pca_result[:, 2])
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_zlabel('PC3')
    ax.set_title('3D PCA Results')
    plt.show()

In [None]:
# Step 5: KMeans Clustering — Grouping the Transformed Data

In [None]:
from sklearn.cluster import KMeans

# Number of clusters to look for. 3 is only a runnable starting point: choose
# the real value from domain knowledge or a heuristic such as the Elbow Method
# (plot `inertia_` for a range of cluster counts and pick the bend).
n_clusters = 3

# Cluster the PCA-transformed data. `random_state` makes the run repeatable;
# `n_init` is set explicitly because its default differs between
# scikit-learn releases, which would otherwise change results (and emit a
# FutureWarning on some versions).
kmeans_pca = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans_pca.fit(pca_result)

# Cluster index assigned to each row of pca_result
cluster_labels = kmeans_pca.labels_

In [None]:
# Each plot is only drawn when enough components were kept; indexing a
# missing column of pca_result would otherwise raise an IndexError.
n_plot_dims = pca_result.shape[1]

# 2D Visualization: first two components, points colored by cluster label
if n_plot_dims >= 2:
    plt.scatter(pca_result[:, 0], pca_result[:, 1], c=cluster_labels)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('KMeans Clustering on PCA Results')
    plt.show()

# 3D Visualization: first three components, points colored by cluster label
if n_plot_dims >= 3:
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(pca_result[:, 0], pca_result[:, 1], pca_result[:, 2], c=cluster_labels)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_zlabel('PC3')
    ax.set_title('3D KMeans Clustering on PCA Results')
    plt.show()

In [None]:
# Step 6: Analyzing PCA Loadings — Understanding What Your Components Represent

In [None]:
import pandas as pd

# Get the PCA components (loadings): one row per principal component, one
# column per input feature.
pca_components = pca.components_

# Label rows PC1..PCk and columns with the feature names. The row count comes
# from the fitted array rather than `pca.n_components`, which is merely the
# constructor argument and may be None or a float.
pca_loadings_df = pd.DataFrame(
    pca_components,
    columns=df.columns,
    index=[f'PC{i + 1}' for i in range(pca_components.shape[0])],
)

# Display the loadings
pca_loadings_df

In [None]:
import seaborn as sns

# Draw the loadings table as an annotated heatmap: rows are principal
# components, columns are features, and each cell shows the loading value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(pca_loadings_df, cmap="YlGnBu", annot=True, ax=ax)
ax.set_title('PCA Loadings')
plt.show()

In [None]:
# Step 7: From PCA Space to Original Space — Interpreting the Cluster Centers

In [None]:
# Map the cluster centers back in two stages:
#   1. PCA space -> scaled feature space
#   2. scaled feature space -> original feature units
centroids_in_scaled_space = pca.inverse_transform(kmeans_pca.cluster_centers_)
original_space_centroids = scaler.inverse_transform(centroids_in_scaled_space)

# One row per cluster, one column per original feature
centroids_df = pd.DataFrame(original_space_centroids, columns=df.columns)

# Show the centroids
centroids_df

In [None]:
import pandas as pd

# Calculate the mean of every column of the original data
original_means = df.mean(axis=0)

# Add the means as one extra row below the centroids for comparison.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with the Series
# turned into a one-row frame produces the same table (the mean is the last
# row, rows are renumbered 0..k).
centroids_comparison_df = pd.concat(
    [centroids_df, original_means.to_frame().T],
    ignore_index=True,
)

# Display the comparison
centroids_comparison_df

In [None]:
# Step 8: Centroids and Means — Comparing Cluster Centers with the Original Data Mean

In [None]:
import pandas as pd

# Calculate the mean of every column of the original data
original_means = df.mean(axis=0)

# Add the means as one extra row below the centroids for comparison.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with the Series
# turned into a one-row frame produces the same table (the mean is the last
# row, rows are renumbered 0..k).
centroids_comparison_df = pd.concat(
    [centroids_df, original_means.to_frame().T],
    ignore_index=True,
)

# Display the comparison
centroids_comparison_df

In [None]:
import matplotlib.pyplot as plt

# Grouped bar chart: one group per row of the comparison table (the cluster
# centroids followed by the dataset mean), one bar per feature.
comparison_ax = centroids_comparison_df.plot(kind='bar', figsize=(15, 6))
comparison_ax.set_title('Comparison of Cluster Centroids with the Dataset Mean')
comparison_ax.set_xlabel('Cluster / Mean')
comparison_ax.set_ylabel('Feature Values')
plt.show()

In [None]:
# Step 9: Deep Dive into Loadings — A Closer Look at the Features Influencing Each Principal Component

In [None]:
import numpy as np
import pandas as pd

# How many of the strongest features to list for each principal component
n_top_features = 5

# Absolute loadings: the sign only gives the direction of a feature's
# influence, the magnitude gives its strength. The row count comes from the
# fitted array rather than `pca.n_components`, which is merely the constructor
# argument and may be None or a float.
abs_loadings = np.abs(pca.components_)
pca_loadings_analysis = pd.DataFrame(
    abs_loadings,
    columns=df.columns,
    index=[f'PC{i + 1}' for i in range(abs_loadings.shape[0])],
)

# For each principal component (row), list the names of the features with
# the largest absolute loadings, strongest first.
top_features_per_pc = pca_loadings_analysis.apply(
    lambda s: s.nlargest(n_top_features).index.tolist(), axis=1
)

top_features_per_pc

In [None]:
# Function to get top features and their loadings
def get_top_features_loadings(pca_loadings, n_features, feature_names=None):
    """Return the strongest-loading features of every principal component.

    Args:
        pca_loadings: 2-D array-like with one row per principal component and
            one column per feature (for example ``pca.components_``).
        n_features: Number of features to keep per component. ``0`` yields an
            empty mapping for every component; a value larger than the number
            of features keeps all of them.
        feature_names: Optional sequence with one name per column of
            ``pca_loadings``. When omitted, the notebook-level ``df.columns``
            is used.

    Returns:
        A dict mapping ``'PC1'``, ``'PC2'``, ... to a dict of
        ``{feature_name: signed_loading}``, ordered from the largest to the
        smallest absolute loading.

    Raises:
        ValueError: If ``n_features`` is negative.
    """
    if n_features < 0:
        raise ValueError("n_features must be non-negative")
    if feature_names is None:
        # Fall back to the notebook's global DataFrame.
        feature_names = df.columns

    loadings = np.asarray(pca_loadings)
    top_features = {}
    for i, row in enumerate(loadings):
        # Sort by descending magnitude and slice from the front: a tail slice
        # such as [-n_features:] would return every feature when n_features is 0.
        strongest = np.argsort(-np.abs(row), kind='stable')[:n_features]
        top_features[f'PC{i + 1}'] = {feature_names[j]: row[j] for j in strongest}
    return top_features

# Collect the five strongest features (with their signed loadings) for every
# principal component.
top_features_loadings = get_top_features_loadings(pca_loadings=pca.components_, n_features=5)

# The nested dict becomes a table with one column per component; transpose it
# so each principal component is a row and each selected feature a column.
# Features outside a component's top five show up as NaN in that row.
features_by_component = pd.DataFrame(top_features_loadings)
top_features_loadings_df = features_by_component.transpose()

top_features_loadings_df