## 0. Packages and Datasets Import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew, boxcox, yeojohnson
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.stats import zscore 


In [3]:
# Load the campaign dataset into the working frame `df`.
# Later cells rebind `df` to reduced and encoded versions of this data.
df = pd.read_csv('../Data/digital_marketing_campaign_dataset.csv')

In [4]:
# Second, independent read of the same CSV under its own name.
# NOTE(review): presumably kept as an untouched reference copy, since `df`
# is rebound by the following cells — confirm intended use.
df_original = pd.read_csv('../Data/digital_marketing_campaign_dataset.csv')

In [5]:
# Exclude the two advertising columns and the customer identifier from the
# feature set.
# The drop is taken from `df_original` (a separate read of the same CSV)
# rather than from `df`: `df` is rebound to the reduced/encoded frame by the
# following cells, so reading it here would raise KeyError when this cell is
# executed a second time. On a fresh top-to-bottom run the result is the same.
df_drop = df_original.drop(
    columns=['AdvertisingPlatform', 'AdvertisingTool', 'CustomerID']
)

In [6]:
# Rebind the working name `df` to the reduced frame; from here on `df` no
# longer contains the three dropped columns.
df = df_drop

In [7]:
# One-hot encode the categorical columns listed below, keeping every level
# (drop_first=False) so each category gets its own indicator column.
columns_to_encode = ['Gender', 'CampaignChannel', 'CampaignType']

# Encode from `df_drop` rather than from `df`: this cell rebinds `df` to the
# encoded frame, so reading `df` here would raise KeyError on a re-run because
# the categorical columns would already have been replaced by their dummies.
# On a fresh top-to-bottom run `df` is `df_drop` at this point, so the result
# is unchanged.
df_encoded = pd.get_dummies(df_drop, columns=columns_to_encode, drop_first=False)

# `df` continues to be the working name used by the cells below.
df = df_encoded

In [8]:
# Standardise every column of the encoded frame.
# The fitted scaler stays available as `scalar` because its statistics are
# needed later to map cluster centroids back to original units.
scalar = StandardScaler().fit(df)
df_scaled = scalar.transform(df)

In [9]:
# Search grid for the joint PCA / K-Means scan.

# PCA sizes: 2 components up to 10, capped by the number of scaled features.
pca_range = range(2, min(df_scaled.shape[1] + 1, 11))

# K-Means sizes: k = 2 .. 10 inclusive.
k_range = range(2, 11)

# Running record of the best configuration found so far. The score starts at
# -inf so that the first evaluated configuration always replaces it.
best_pca_n = best_k_for_best = best_scores_for_best = None
overall_best_sil = -np.inf

In [10]:
# Grid search: for every PCA dimensionality, fit K-Means for each k and keep
# the (n_components, k) pair with the highest silhouette score.
for n_components in pca_range:
    # A loop-local name is used for the PCA object so that running this cell
    # never overwrites the notebook-level `pca` (the 2-component model) that
    # the centroid back-projection cell depends on.
    pca_temp = PCA(n_components=n_components, random_state=42)
    X_pca_temp = pca_temp.fit_transform(df_scaled)

    # Silhouette score for each k on this projection, stored as (k, score).
    scores_temp = []
    for k in k_range:
        # n_init is set explicitly: the library default differs between
        # scikit-learn releases, and the cluster-count scan further down
        # uses n_init=10 as well.
        kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels_temp = kmeans_temp.fit_predict(X_pca_temp)
        sil = silhouette_score(X_pca_temp, labels_temp)
        scores_temp.append((k, sil))

    # Best (k, score) pair for this number of components; on ties max()
    # returns the first maximum, i.e. the smaller k.
    best_for_this = max(scores_temp, key=lambda x: x[1])

    # Strict '>' keeps the earlier (smaller) n_components when scores tie.
    if best_for_this[1] > overall_best_sil:
        overall_best_sil = best_for_this[1]
        best_pca_n = n_components
        best_k_for_best = best_for_this[0]
        best_scores_for_best = scores_temp

In [11]:
# Report the winning configuration from the grid search, one line per item.
print(
    "\n".join(
        f"{caption} {value!s}"
        for caption, value in (
            ("Best number of PCA components:", best_pca_n),
            ("Best k (clusters) for that PCA configuration:", best_k_for_best),
            ("Overall best silhouette score:", overall_best_sil),
        )
    )
)

Best number of PCA components: 2
Best k (clusters) for that PCA configuration: 6
Overall best silhouette score: 0.6917106071593926


In [12]:

# Refit PCA on the scaled data with the component count selected by the grid
# search; `df_pca_best` holds the projected data used by the follow-up
# silhouette check.
pca_best = PCA(n_components=best_pca_n, random_state=42)
df_pca_best = pca_best.fit_transform(df_scaled)

In [13]:
# Fixed 2-component PCA. `pca` and `df_pca` are the objects used by the
# clustering and centroid cells further down.
pca = PCA(n_components=2, random_state=42)
df_pca = pca.fit_transform(df_scaled)

# Share of total variance captured by each component.
# NOTE(review): the recorded output is about 0.08 and 0.06, i.e. roughly 14%
# combined, so this 2-D projection discards most of the variance in the
# scaled features — keep that in mind when interpreting the clusters.
print("Explained variance by 2 PCA components:", pca.explained_variance_ratio_)

Explained variance by 2 PCA components: [0.08034666 0.05788138]


In [14]:
# Re-evaluate K-Means silhouette scores on the data projected with the
# selected number of PCA components; results are stored as (k, score) pairs.
scores_best = []
for k in k_range:
    # n_init is set explicitly so the result does not depend on the installed
    # scikit-learn version's default, and so this scan matches the n_init=10
    # used by the cluster-count scan on `df_pca`.
    kmeans_best = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_best = kmeans_best.fit_predict(df_pca_best)
    sil_best = silhouette_score(df_pca_best, labels_best)
    scores_best.append((k, sil_best))

In [17]:
# Scan candidate cluster counts (2 .. 9) on the 2-D PCA projection and record
# the silhouette score and inertia for each, keyed by the number of clusters.
range_n_clusters = range(2, 10)
silhouette_scores, inertia_values = {}, {}

for n_clusters in range_n_clusters:
    # Fit once, then read the labels and inertia off the fitted estimator.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(df_pca)
    labels = kmeans.labels_
    inertia = kmeans.inertia_
    sil_score = silhouette_score(df_pca, labels)

    inertia_values[n_clusters] = inertia
    silhouette_scores[n_clusters] = sil_score

    print(f"Clusters: {n_clusters}, Silhouette Score: {sil_score:.4f}, Inertia: {inertia:.4f}")

Clusters: 2, Silhouette Score: 0.6042, Inertia: 11657.0408
Clusters: 3, Silhouette Score: 0.5942, Inertia: 7247.3669
Clusters: 4, Silhouette Score: 0.6018, Inertia: 4592.8428
Clusters: 5, Silhouette Score: 0.6518, Inertia: 2657.7026
Clusters: 6, Silhouette Score: 0.6917, Inertia: 1437.0185
Clusters: 7, Silhouette Score: 0.6138, Inertia: 1078.5732
Clusters: 8, Silhouette Score: 0.5596, Inertia: 846.2990
Clusters: 9, Silhouette Score: 0.5353, Inertia: 703.9258


In [None]:


# Final model: 6 clusters on the 2-D PCA projection.
# n_init=10 is the setting used by the cluster-count scan over `df_pca`, so
# this fit uses identical parameters to the scanned k=6 run instead of
# depending on the installed scikit-learn version's default n_init.
kmeans = KMeans(n_clusters=6, random_state=42, n_init=10)

# Fit KMeans and keep one cluster label per row of `df_pca`.
labels = kmeans.fit_predict(df_pca)

# The labels are attached to the dataframe in a later cell.




In [19]:

# Map the K-Means centroids from PCA space back to original feature units.
#
# NOTE: the centroids live in a 2-component PCA space, so the inverse PCA
# transform is only a low-rank reconstruction of the per-cluster feature
# values (one-hot columns can come out slightly outside [0, 1]). Grouping the
# unscaled data by cluster label and taking the mean gives exact per-cluster
# averages if those are needed.

# 1. Set number of clusters explicitly
n_clusters = 6

# 2. Verify the KMeans estimator is configured for the expected cluster count
assert kmeans.n_clusters == n_clusters, "KMeans was not fitted with 6 clusters!"

# 3. Get centroids in PCA space (shape: [6, n_components])
pca_centroids = kmeans.cluster_centers_

# 4. Inverse transform to standardized feature space
centroids_standardized = pca.inverse_transform(pca_centroids)

# 5. Verify dimensions before proceeding.
#    The feature count is taken from the fitted scaler rather than from
#    `df.columns`: a later cell appends 'Cluster_Label' to `df`, which would
#    make a column count read from `df` fail this assertion on a re-run.
n_features = scalar.mean_.shape[0]
assert centroids_standardized.shape == (n_clusters, n_features), \
    f"Dimension mismatch! Expected {(n_clusters, n_features)}, got {centroids_standardized.shape}"

# 6. Reverse standardization using the scaler's own inverse
#    (equivalent to x * scale_ + mean_).
centroids_original = scalar.inverse_transform(centroids_standardized)

# 7. Create DataFrame. Column names come from the scaler when it recorded
#    them at fit time; otherwise fall back to the leading feature columns of
#    `df`, which excludes any column appended after scaling.
feature_names = getattr(scalar, "feature_names_in_", df.columns[:n_features])
centroids_original_df = pd.DataFrame(
    centroids_original,
    columns=feature_names
)

# 8. Add cluster labels (0-5 for 6 clusters)
centroids_original_df.insert(0, 'Cluster', range(n_clusters))

# 9. Display results
print("Final Cluster Centroids in Original Feature Space:")
print(centroids_original_df.round(2))

Final Cluster Centroids in Original Feature Space:
   Cluster    Age    Income  AdSpend  ClickThroughRate  ConversionRate  \
0        0  44.10  82281.07  5610.29              0.17            0.12   
1        1  42.66  85308.10  4123.82              0.13            0.09   
2        2  43.14  80725.35  4917.23              0.15            0.10   
3        3  43.73  86868.75  4906.33              0.15            0.10   
4        4  44.70  88427.35  5600.26              0.17            0.11   
5        5  42.08  79152.35  4138.82              0.13            0.09   

   WebsiteVisits  PagesPerVisit  TimeOnSite  SocialShares  ...  Gender_Male  \
0          26.05           5.99        8.80         48.45  ...         0.99   
1          22.98           4.92        6.10         51.93  ...        -0.01   
2          24.73           5.51        7.45         50.28  ...         1.00   
3          24.48           5.47        7.62         49.88  ...         0.01   
4          25.81           5.96    

In [22]:
# Attach each row's cluster label to the working frame.
# NOTE(review): this mutates `df` in place, and `df_encoded` is the same
# object, so both gain the column; re-running the scaling cell afterwards
# would treat 'Cluster_Label' as a feature. `labels` is assumed to be the
# final 6-cluster assignment — confirm cell execution order.
df['Cluster_Label'] = labels  