In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

EDA

In [3]:
# Load the Global Superstore export from a path relative to the notebook's working directory.
# The file is decoded as latin-1; presumably it contains non-UTF-8 characters — TODO confirm.
data = pd.read_csv('./Global_Superstore2.csv', encoding='latin-1')

In [6]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51290 entries, 0 to 51289
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Row ID          51290 non-null  int64  
 1   Order ID        51290 non-null  object 
 2   Order Date      51290 non-null  object 
 3   Ship Date       51290 non-null  object 
 4   Ship Mode       51290 non-null  object 
 5   Customer ID     51290 non-null  object 
 6   Customer Name   51290 non-null  object 
 7   Segment         51290 non-null  object 
 8   City            51290 non-null  object 
 9   State           51290 non-null  object 
 10  Country         51290 non-null  object 
 11  Postal Code     9994 non-null   float64
 12  Market          51290 non-null  object 
 13  Region          51290 non-null  object 
 14  Product ID      51290 non-null  object 
 15  Category        51290 non-null  object 
 16  Sub-Category    51290 non-null  object 
 17  Product Name    51290 non-null 

In [7]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
0,32298,CA-2012-124891,31-07-2012,31-07-2012,Same Day,RH-19495,Rick Hansen,Consumer,New York City,New York,...,TEC-AC-10003033,Technology,Accessories,Plantronics CS510 - Over-the-Head monaural Wir...,2309.65,7,0.0,762.1845,933.57,Critical
1,26341,IN-2013-77878,05-02-2013,07-02-2013,Second Class,JR-16210,Justin Ritter,Corporate,Wollongong,New South Wales,...,FUR-CH-10003950,Furniture,Chairs,"Novimex Executive Leather Armchair, Black",3709.395,9,0.1,-288.765,923.63,Critical
2,25330,IN-2013-71249,17-10-2013,18-10-2013,First Class,CR-12730,Craig Reiter,Consumer,Brisbane,Queensland,...,TEC-PH-10004664,Technology,Phones,"Nokia Smart Phone, with Caller ID",5175.171,9,0.1,919.971,915.49,Medium
3,13524,ES-2013-1579342,28-01-2013,30-01-2013,First Class,KM-16375,Katherine Murray,Home Office,Berlin,Berlin,...,TEC-PH-10004583,Technology,Phones,"Motorola Smart Phone, Cordless",2892.51,5,0.1,-96.54,910.16,Medium
4,47221,SG-2013-4320,05-11-2013,06-11-2013,Same Day,RH-9495,Rick Hansen,Consumer,Dakar,Dakar,...,TEC-SHA-10000501,Technology,Copiers,"Sharp Wireless Fax, High-Speed",2832.96,8,0.0,311.52,903.04,Critical


In [None]:
class CustomerSegmentation:
    """
    Scale selected DataFrame columns, cluster the rows with K-Means or DBSCAN,
    and score / plot the resulting segments.

    Typical order of use: ``preprocess_data`` -> (optionally)
    ``find_optimal_clusters`` -> ``apply_clustering`` -> ``evaluate_clustering``
    / ``visualize_clusters``. Every method that needs the scaled matrix runs
    ``preprocess_data`` itself when it has not been called yet.

    Side effect: ``apply_clustering`` writes the cluster labels into
    ``data[segment_col]`` on the DataFrame that was passed in (no copy is made).
    With the default name 'Segment' an existing column of that name is
    overwritten; pass a different ``segment_col`` to keep it.
    """

    def __init__(self, data, features, clustering_method='kmeans', n_clusters=4,
                 segment_col='Segment', **kwargs):
        """
        Initialize the segmentation pipeline.
        :param data: Pandas DataFrame containing customer data.
        :param features: List of feature column names to use for segmentation.
        :param clustering_method: Clustering algorithm to use ('kmeans' or 'dbscan').
        :param n_clusters: Default number of clusters (used for K-Means).
        :param segment_col: Name of the column that receives the cluster labels.
        :param kwargs: Additional parameters for clustering algorithms.
        """
        self.data = data
        self.features = features
        self.method = clustering_method
        self.n_clusters = n_clusters
        self.segment_col = segment_col
        self.kwargs = kwargs
        self.model = None
        self.scaler = None
        self.scaled_features = None

    def _get_scaled_features(self):
        """Return the scaled feature matrix, computing it on first use."""
        if self.scaled_features is None:
            self.preprocess_data()
        return self.scaled_features

    def _build_kmeans(self, n_clusters):
        """Create an unfitted K-Means model with the pipeline's settings."""
        # Caller-supplied kwargs take precedence over the default seed, so
        # passing random_state=... does not collide with a hard-coded keyword.
        params = {'random_state': 42, **self.kwargs}
        return KMeans(n_clusters=n_clusters, **params)

    def preprocess_data(self):
        """
        Scale features for clustering.
        The fitted scaler is kept on ``self.scaler`` so it can be reused later.
        :return: Scaled feature array.
        """
        self.scaler = StandardScaler()
        self.scaled_features = self.scaler.fit_transform(self.data[self.features])
        return self.scaled_features

    def apply_clustering(self):
        """
        Apply the chosen clustering algorithm and store the labels in
        ``data[segment_col]``.
        :raises ValueError: If the clustering method is not 'kmeans' or 'dbscan'.
        """
        # Validate the method before doing any scaling work.
        if self.method == 'kmeans':
            model = self._build_kmeans(self.n_clusters)
        elif self.method == 'dbscan':
            model = DBSCAN(**self.kwargs)
        else:
            raise ValueError("Unsupported clustering method. Choose 'kmeans' or 'dbscan'.")

        features = self._get_scaled_features()
        self.model = model
        self.data[self.segment_col] = self.model.fit_predict(features)

    def evaluate_clustering(self):
        """
        Evaluate clustering using silhouette score (for methods where applicable).
        For DBSCAN, points labelled -1 (noise) are left out of the score because
        they do not form a cluster.
        :return: Message string containing the silhouette score, or a notice
                 that the score is not applicable.
        """
        if hasattr(self.model, 'labels_'):
            labels = np.asarray(self.model.labels_)
        else:
            # No fitted model available: fall back to the stored label column.
            labels = np.asarray(self.data[self.segment_col])

        features = np.asarray(self._get_scaled_features())

        if self.method == 'dbscan':
            clustered = labels != -1
            labels = labels[clustered]
            features = features[clustered]

        if len(np.unique(labels)) > 1:
            score = silhouette_score(features, labels)
            return f"Silhouette Score: {score:.3f}"
        return "Silhouette Score not applicable for single cluster."

    def visualize_clusters(self, reduce_dim=True):
        """
        Visualize clusters in 2D using PCA or raw features.
        Points are coloured by the values currently stored in ``data[segment_col]``.
        :param reduce_dim: Whether to use PCA for dimensionality reduction;
                           otherwise the first two scaled features are plotted.
        """
        features = self._get_scaled_features()
        if reduce_dim:
            coords = PCA(n_components=2).fit_transform(features)
        else:
            coords = features[:, :2]

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.scatterplot(x=coords[:, 0], y=coords[:, 1], hue=self.data[self.segment_col],
                        palette='Set2', ax=ax)
        ax.set_title('Customer Segments')
        ax.set_xlabel('Dimension 1')
        ax.set_ylabel('Dimension 2')
        ax.legend(title='Segment', loc='best')
        plt.show()

    def find_optimal_clusters(self, max_clusters=10):
        """
        Automates optimal cluster selection for K-Means using the Elbow Method and Silhouette Score.
        Candidate models use the same settings (seed and extra kwargs) as
        ``apply_clustering`` so the search matches the final fit.
        :param max_clusters: Maximum number of clusters to evaluate (at least 2).
        :return: Optimal number of clusters based on silhouette score.
        :raises ValueError: If the method is not K-Means or ``max_clusters`` < 2.
        """
        if self.method != 'kmeans':
            raise ValueError("Optimal cluster search is only supported for K-Means.")
        if max_clusters < 2:
            # The search starts at k=2; a smaller bound leaves nothing to evaluate.
            raise ValueError("max_clusters must be at least 2.")

        features = self._get_scaled_features()
        candidate_ks = range(2, max_clusters + 1)
        inertias = []
        silhouette_scores = []

        for k in candidate_ks:
            kmeans = self._build_kmeans(k)
            labels = kmeans.fit_predict(features)
            inertias.append(kmeans.inertia_)
            silhouette_scores.append(silhouette_score(features, labels))

        # Plot results
        fig, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(12, 5))
        ax_elbow.plot(candidate_ks, inertias, marker='o')
        ax_elbow.set_title("Elbow Method")
        ax_elbow.set_xlabel("Number of Clusters")
        ax_elbow.set_ylabel("Inertia")

        ax_silhouette.plot(candidate_ks, silhouette_scores, marker='o')
        ax_silhouette.set_title("Silhouette Scores")
        ax_silhouette.set_xlabel("Number of Clusters")
        ax_silhouette.set_ylabel("Score")

        fig.tight_layout()
        plt.show()

        # Return the optimal number based on silhouette scores
        # (index 0 of the score list corresponds to k=2).
        optimal_k = int(np.argmax(silhouette_scores)) + 2
        print(f"Optimal number of clusters: {optimal_k}")
        return optimal_k
