## 0. Packages and Datasets Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew, boxcox, yeojohnson
from datetime import datetime
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.stats import zscore 


In [2]:

df_original = pd.read_csv('../Data/digital_marketing_campaign_dataset.csv')
df_drop = df_original.drop(columns = ['AdvertisingPlatform', 'AdvertisingTool','CustomerID'])
columns_to_encode = ['Gender', 'CampaignChannel', 'CampaignType']
df_encoded = pd.get_dummies(df_drop, columns=columns_to_encode, drop_first=False)
scalar = StandardScaler()
df_scaled = scalar.fit_transform(df_encoded)


In [3]:
pca_range = range(2, min(df_scaled.shape[1], 10) + 1)  # from 2 to either 10 or number of features

k_range = range(2, 15)
overall_best_sil = -np.inf
best_pca_n = None
best_k_for_best = None
best_scores_for_best = None
# Loop over different numbers of PCA components
for n_components in pca_range:
    # Apply PCA with n_components
    pca = PCA(n_components=n_components, random_state=42)
    X_pca_temp = pca.fit_transform(df_scaled)
    
    # Test K-Means for this PCA-transformed data
    scores_temp = []
    for k in k_range:
        kmeans_temp = KMeans(n_clusters=k, random_state=42)
        labels_temp = kmeans_temp.fit_predict(X_pca_temp)
        sil = silhouette_score(X_pca_temp, labels_temp)
        scores_temp.append((k, sil))
    
    # Find the best silhouette score for this number of PCA components
    best_for_this = max(scores_temp, key=lambda x: x[1])
    
    # Check if this configuration is better than the overall best
    if best_for_this[1] > overall_best_sil:
        overall_best_sil = best_for_this[1]
        best_pca_n = n_components
        best_k_for_best = best_for_this[0]
        best_scores_for_best = scores_temp

In [4]:
# Print the best configuration found
print("Best number of PCA components:", best_pca_n)
print("Explained variance by 2 PCA components:", pca.explained_variance_ratio_)
print("Best k (clusters) for that PCA configuration:", best_k_for_best)
print("Overall best silhouette score:", overall_best_sil)

Best number of PCA components: 2
Explained variance by 2 PCA components: [0.08034666 0.05788138 0.05388244 0.05345779 0.05144862 0.0508823
 0.05030133 0.04954703 0.04887433 0.04279948]
Best k (clusters) for that PCA configuration: 6
Overall best silhouette score: 0.6917106071593926


In [5]:
pca_best = PCA(n_components=best_pca_n, random_state=42)
df_pca = pca_best.fit_transform(df_scaled)
kmeans = KMeans(n_clusters=best_k_for_best, random_state=42)
labels = kmeans.fit_predict(df_pca)  

In [6]:
# 1. Get centroids in PCA-transformed space
pca_centroids = kmeans.cluster_centers_

# 2. Inverse transform using the correct PCA model
centroids_standardized = pca_best.inverse_transform(pca_centroids)  # Use pca_best, not pca

# 3. Reverse standardization to map back to original feature space
centroids_original = centroids_standardized * scalar.scale_ + scalar.mean_

# 4. Create DataFrame for better readability
centroids_df = pd.DataFrame(centroids_original, columns=df_encoded.columns)

# 5. Add cluster labels (0, 1, ..., k-1)
centroids_df.insert(0, 'Cluster', range(best_k_for_best))

# 6. Display results
print("Final Cluster Centroids in Original Feature Space:")
print(centroids_df.round(2))


Final Cluster Centroids in Original Feature Space:
   Cluster    Age    Income  AdSpend  ClickThroughRate  ConversionRate  \
0        0  44.10  82281.07  5610.29              0.17            0.12   
1        1  42.66  85308.10  4123.82              0.13            0.09   
2        2  43.14  80725.35  4917.23              0.15            0.10   
3        3  43.73  86868.75  4906.33              0.15            0.10   
4        4  44.70  88427.35  5600.26              0.17            0.11   
5        5  42.08  79152.35  4138.82              0.13            0.09   

   WebsiteVisits  PagesPerVisit  TimeOnSite  SocialShares  ...  Gender_Male  \
0          26.05           5.99        8.80         48.45  ...         0.99   
1          22.98           4.92        6.10         51.93  ...        -0.01   
2          24.73           5.51        7.45         50.28  ...         1.00   
3          24.48           5.47        7.62         49.88  ...         0.01   
4          25.81           5.96    