In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew, boxcox, yeojohnson
from datetime import datetime
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.stats import zscore 


In [2]:
# Load the campaign dataset from CSV; the path is relative to the notebook's working directory.
df_original = pd.read_csv('../Data/digital_marketing_campaign_dataset.csv')

In [3]:
def process_data(df_original, columns_to_drop=None, columns_to_encode=None):
    """
    Processes the dataset by:
    1. Dropping unnecessary columns
    2. Encoding categorical features using one-hot encoding
    3. Standardizing every remaining column (one-hot dummies included)

    The input frame is not modified; each step works on a new object.

    Parameters:
    - df_original (DataFrame): Raw dataset
    - columns_to_drop (iterable of str, optional): Columns removed before
      encoding. Defaults to ['AdvertisingPlatform', 'AdvertisingTool', 'CustomerID'].
    - columns_to_encode (iterable of str, optional): Categorical columns to
      one-hot encode (all levels kept). Defaults to
      ['Gender', 'CampaignChannel', 'CampaignType'].

    Returns:
    - df_scaled (numpy array): Standardized dataset
    - df_encoded (DataFrame): Encoded dataset before scaling (for column reference)
    - scaler (StandardScaler): Fitted scaler for inverse transformation

    Raises:
    - KeyError: If a column named in columns_to_drop or columns_to_encode
      is missing from the frame.
    """
    # Fall back to the columns this notebook was written for; None sentinels
    # avoid sharing a mutable default list between calls.
    if columns_to_drop is None:
        columns_to_drop = ['AdvertisingPlatform', 'AdvertisingTool', 'CustomerID']
    if columns_to_encode is None:
        columns_to_encode = ['Gender', 'CampaignChannel', 'CampaignType']

    # Drop unwanted columns
    df_drop = df_original.drop(columns=list(columns_to_drop))

    # Encode categorical variables (drop_first=False keeps every level)
    df_encoded = pd.get_dummies(df_drop, columns=list(columns_to_encode), drop_first=False)

    # Standardize the data. The scaler is fitted on all columns of df_encoded,
    # so any column left non-numeric at this point will make the fit fail.
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df_encoded)

    return df_scaled, df_encoded, scaler

In [4]:
# Build the clustering inputs: standardized matrix, encoded frame (column reference) and fitted scaler.
df_scaled, df_encoded, scaler = process_data(df_original)

In [5]:

def find_best_pca_kmeans(df_scaled, max_pca=10, k_min=2, k_max=15):
    """
    Finds the optimal number of PCA components and K-Means clusters 
    using silhouette scores as the evaluation metric.

    For each candidate number of PCA components the data is projected, K-Means
    is fitted for each candidate k, and the silhouette score is computed in
    the projected space. The (components, k) pair with the highest score wins;
    on ties the first pair encountered is kept (fewer components, smaller k).

    Parameters:
    - df_scaled (numpy array): Standardized dataset, shape (n_samples, n_features)
    - max_pca (int): Maximum number of PCA components to consider (default: 10).
      The search starts at 2 and is capped at min(n_samples, n_features).
    - k_min (int): Minimum number of clusters to test (default: 2). Must be >= 2.
    - k_max (int): Maximum number of clusters to test, inclusive (default: 15).
      Capped at n_samples - 1.

    Returns:
    - best_pca_n (int): Best number of PCA components
    - best_k_for_best (int): Best number of clusters
    - best_scores_for_best (list): (k, silhouette score) tuples for every k
      tested with the best number of PCA components

    All three values are None when fewer than two PCA components can be
    tested (e.g. max_pca < 2 or a single-feature dataset).

    Raises:
    - ValueError: If k_min < 2, or if no k between k_min and
      min(k_max, n_samples - 1) exists.
    """
    n_samples, n_features = df_scaled.shape

    # The silhouette score needs at least 2 clusters and at most n_samples - 1.
    if k_min < 2:
        raise ValueError(f"k_min must be at least 2, got {k_min}")
    k_upper = min(k_max, n_samples - 1)
    if k_upper < k_min:
        raise ValueError(
            f"No cluster count to test: k_min={k_min}, k_max={k_max}, "
            f"n_samples={n_samples}"
        )

    # Define PCA and KMeans search ranges (both upper bounds are inclusive).
    # PCA cannot extract more components than min(n_samples, n_features).
    pca_range = range(2, min(n_samples, n_features, max_pca) + 1)
    k_range = range(k_min, k_upper + 1)

    overall_best_sil = -np.inf
    best_pca_n = None
    best_k_for_best = None
    best_scores_for_best = None

    # Loop over different numbers of PCA components
    for n_components in pca_range:
        pca = PCA(n_components=n_components, random_state=42)
        X_pca_temp = pca.fit_transform(df_scaled)

        scores_temp = []
        for k in k_range:
            kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels_temp = kmeans_temp.fit_predict(X_pca_temp)
            sil = silhouette_score(X_pca_temp, labels_temp)
            scores_temp.append((k, sil))

        # max() returns the first maximum, so ties resolve to the smaller k.
        best_for_this = max(scores_temp, key=lambda x: x[1])

        # Strict '>' keeps the earlier (smaller) component count on ties.
        if best_for_this[1] > overall_best_sil:
            overall_best_sil = best_for_this[1]
            best_pca_n = n_components
            best_k_for_best = best_for_this[0]
            best_scores_for_best = scores_temp

    return best_pca_n, best_k_for_best, best_scores_for_best

In [6]:
# Search PCA component counts and K-Means cluster counts, scored by silhouette.
# Every (components, k) pair refits K-Means with n_init=10, so this cell can take a while.
best_pca_n, best_k_for_best, best_scores_for_best = find_best_pca_kmeans(df_scaled, max_pca=10, k_min=2, k_max=15)

In [7]:
def apply_pca_kmeans(df_scaled, best_pca_n, best_k_for_best):
    """
    Projects the data with PCA, then clusters the projection with K-Means.

    Both estimators are created with random_state=42, and K-Means uses
    n_init=10 restarts.

    Parameters:
    - df_scaled (numpy array): Standardized dataset
    - best_pca_n (int): Number of PCA components to keep
    - best_k_for_best (int): Number of K-Means clusters

    Returns (in this order):
    - df_pca (numpy array): PCA-transformed dataset
    - labels (numpy array): Cluster label of each row, from K-Means
    - kmeans (KMeans object): Fitted K-Means model
    - pca_best (PCA object): Fitted PCA model
    """
    # Step 1: dimensionality reduction
    reducer = PCA(random_state=42, n_components=best_pca_n)
    projected = reducer.fit_transform(df_scaled)

    # Step 2: clustering in the reduced space
    clusterer = KMeans(random_state=42, n_init=10, n_clusters=best_k_for_best)
    assignments = clusterer.fit_predict(projected)

    return projected, assignments, clusterer, reducer

In [8]:
# Refit PCA and K-Means once with the selected settings and keep the fitted models.
df_pca, labels, kmeans, pca_best = apply_pca_kmeans(df_scaled, best_pca_n, best_k_for_best)

In [9]:
# Display the K-Means cluster labels returned for the rows of df_pca.
labels

array([3, 2, 3, ..., 3, 3, 3], dtype=int32)