In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score


# Load the raw transaction export into the working frame used by the cells below.
df = pd.read_excel('data (2).xlsx')

print(df.head())

# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print(df.describe())

In [None]:
# Column types exactly as read from the spreadsheet.
print(df.dtypes)

# Normalise the three columns whose loaded types are awkward to work with:
#   - 'Customer ID': missing values become the sentinel 0 so the column can be integer.
#   - 'InvoiceDate': parsed to datetime; unparseable values become NaT instead of raising.
#   - 'Country': stored as a categorical.
df = df.assign(**{
    'Customer ID': df['Customer ID'].fillna(0).astype(int),
    'InvoiceDate': pd.to_datetime(df['InvoiceDate'], errors='coerce'),
}).astype({'Country': 'category'})

# Column types after conversion.
print(df.dtypes)


In [None]:
import pandas as pd
from datetime import datetime


# Rows without a customer cannot be attributed to a segment. The dtype-conversion
# cell replaces missing IDs with the sentinel 0 (fillna(0)), so dropna() alone
# would keep them and lump every anonymous purchase into one "customer 0".
df = df.dropna(subset=['Customer ID'])
df = df[df['Customer ID'] != 0]

# Keep only genuine purchases: positive quantity at a positive price.
df = df[(df['Quantity'] > 0) & (df['Price'] > 0)]

# Spend per invoice line is quantity x price; summing 'Price' alone ignores how
# many units were bought.
# NOTE(review): assumes 'Price' is the per-unit price - confirm against the data dictionary.
# The helper column lives on a temporary frame so `df` keeps its original columns.
line_items = df.assign(LineTotal=df['Quantity'] * df['Price'])

# Measure recency against a fixed snapshot (the day after the latest invoice in
# the data) instead of the wall clock, so re-running the notebook reproduces
# the same values.
snapshot_date = line_items['InvoiceDate'].max() + pd.Timedelta(days=1)

# One row per customer: monetary value, number of distinct invoices, and days
# since the most recent invoice.
customer_df = line_items.groupby('Customer ID').agg(
    TotalSpend=pd.NamedAgg(column='LineTotal', aggfunc='sum'),
    PurchaseFrequency=pd.NamedAgg(column='Invoice', aggfunc='nunique'),
    Recency=pd.NamedAgg(column='InvoiceDate', aggfunc=lambda x: (snapshot_date - x.max()).days),
)

# Average value of a single invoice for each customer.
customer_df['AverageBasketValue'] = customer_df['TotalSpend'] / customer_df['PurchaseFrequency']




In [None]:
def categorize_product(description):
    """Assign a coarse product category based on keywords in the description.

    Parameters
    ----------
    description : object
        Product description. Usually a string, but any scalar is accepted;
        non-string values are converted with ``str()`` before matching.

    Returns
    -------
    str
        ``'Unknown'`` for missing values, otherwise the first matching category
        in this order: ``'Lighting'`` (contains "light"), ``'Frames'``
        (contains "frame"), ``'Ceramics'`` (contains "ceramic"), else ``'Other'``.
    """
    if pd.isna(description):
        return 'Unknown'
    # Spreadsheet cells are not guaranteed to be text (a purely numeric
    # description loads as int/float), so coerce before using string methods.
    description = str(description).lower()
    # Substring matching, checked in priority order: the first hit wins.
    if 'light' in description:
        return 'Lighting'
    elif 'frame' in description:
        return 'Frames'
    elif 'ceramic' in description:
        return 'Ceramics'
    else:
        return 'Other'

# Label every transaction line with its keyword-derived category.
df['ProductCategory'] = df['Description'].map(categorize_product)


In [None]:
# Calendar features derived from the invoice timestamp:
# month number, day of week as an integer, and hour of day.
df = df.assign(
    MonthOfPurchase=df['InvoiceDate'].dt.month,
    DayOfWeek=df['InvoiceDate'].dt.dayofweek,
    TimeOfDay=df['InvoiceDate'].dt.hour,
)


In [None]:
# Replace the two categorical columns with one indicator column per level.
df = pd.get_dummies(data=df, columns=['Country', 'ProductCategory'])


In [None]:
# Variance is only defined for numeric columns. The frame still contains text
# and datetime columns, and recent pandas raises instead of silently skipping
# them, so restrict the computation explicitly.
variances = df.var(numeric_only=True)

# Cut-off is 1% of the largest column variance.
# NOTE(review): raw variances are scale-dependent, so a single large-magnitude
# column sets the bar for every other column - confirm this is intended.
threshold = 0.01 * variances.max()

low_variance_cols = variances[variances < threshold].index

df = df.drop(columns=low_variance_cols)

print("Columns removed due to low variance:", low_variance_cols)


In [None]:
# Summary statistics followed by the first rows of the engineered frame.
for preview in (df.describe(), df.head()):
    print(preview)


Importing Libraries
import pandas as pd: Imports the pandas library, a powerful tool for data manipulation and analysis, and gives it the alias pd.
import seaborn as sns: Imports the seaborn library, used for statistical data visualization, with the alias sns.
import matplotlib.pyplot as plt: Imports the pyplot module from the matplotlib library, which provides a MATLAB-like plotting framework.
from sklearn.metrics import silhouette_score: Imports the silhouette_score function from scikit-learn's metrics module, used to evaluate the quality of clusters in clustering algorithms.
Data Loading and Initial Exploration
df = pd.read_excel('data (2).xlsx'): Loads an Excel file into a pandas DataFrame df.
print(df.head()): Displays the first five rows of the DataFrame for a quick overview.
print(df.info()): Provides a concise summary of the DataFrame, including the number of non-null entries for each column.
print(df.describe()): Generates descriptive statistics that summarize the central tendency, dispersion, and shape of the dataset’s distribution, excluding NaN values.
Data Type Conversions and Cleaning
df['Customer ID'] = df['Customer ID'].fillna(0).astype(int): Fills missing values in the 'Customer ID' column with 0 and converts the column to integer data type.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce'): Converts the 'InvoiceDate' column to datetime format, setting invalid parsing as Not a Time (NaT).
df['Country'] = df['Country'].astype('category'): Converts the 'Country' column to a categorical data type for memory efficiency.
Data Filtering and Aggregation
Removing Rows: Rows with missing 'Customer ID' and those with non-positive 'Quantity' or 'Price' values are removed to clean the dataset.
Aggregating Data: The dataset is aggregated at the customer level to calculate total spend, purchase frequency, and recency of purchase. A new column 'AverageBasketValue' is also calculated.
Feature Engineering
Categorizing Products: A function categorize_product is defined and applied to the 'Description' column to categorize products into predefined categories.
Extracting Date Features: New columns are created to capture the month of purchase, day of the week, and time of day from the 'InvoiceDate' column.
One-Hot Encoding: The 'Country' and 'ProductCategory' columns are transformed into dummy/indicator variables for machine learning readiness.
Variance Thresholding: Columns with low variance are identified and removed, as they might not be informative for predictive modeling.

In [None]:
# (rows, columns) of the transaction-level frame at this point in the notebook.
df.shape

In [None]:
# Summary statistics followed by the first rows of the transaction-level frame.
for preview in (df.describe(), df.head()):
    print(preview)


In [None]:
import numpy as np  

# Correlation is only defined for numeric columns. The frame may still hold
# text/datetime columns, which recent pandas refuses to skip silently.
corr_matrix = df.corr(numeric_only=True).abs()

# Keep the strict upper triangle so each pair of columns is inspected once and
# a column is never compared with itself.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is flagged when it correlates above 0.8 (in absolute value) with any
# column that precedes it.
to_drop = [column for column in upper.columns if (upper[column] > 0.8).any()]

# The flagged columns are reported only; nothing is removed from `df` here.
print("Features to drop due to high correlation:", to_drop)




In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise: every float64/int64 column of the customer table.
numeric_cols = customer_df.select_dtypes(include=['float64', 'int64']).columns

# Standardise those columns and wrap the result back into a DataFrame that
# keeps the customer index and the original column names.
scaler = StandardScaler()
customer_df_scaled = pd.DataFrame(
    scaler.fit_transform(customer_df[numeric_cols]),
    columns=numeric_cols,
    index=customer_df.index,
)



In [None]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 95%) of the variance.
pca = PCA(n_components=0.95)
customer_df_reduced = pca.fit_transform(customer_df_scaled)

n_retained = customer_df_reduced.shape[1]
print(f"PCA reduced the feature space to {n_retained} dimensions.")


In [None]:
from sklearn.cluster import KMeans


# Partition the PCA-reduced customers into five groups (seeded for repeatability)
# and store each customer's label on the customer table.
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(customer_df_reduced)
customer_df['Cluster'] = clusters

# Centroid coordinates, expressed in the reduced feature space.
print(kmeans.cluster_centers_)

# Summary statistics of the members of each cluster.
for i in range(5):
    print(f"\nCluster {i} characteristics:")
    cluster_members = customer_df.loc[customer_df['Cluster'].eq(i)]
    print(cluster_members.describe())


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

from scipy.cluster.hierarchy import fcluster

# Agglomerative merge tree over the reduced features, using Ward linkage.
linked = linkage(customer_df_reduced, method='ward')

# Draw the merge tree.
fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True, ax=ax)
ax.set_title('Hierarchical Clustering Dendrogram')
plt.show()

# Cut the tree into at most five flat clusters and store the labels.
clusters_hc = fcluster(linked, t=5, criterion='maxclust')
customer_df['Cluster_HC'] = clusters_hc


In [None]:
from sklearn.cluster import DBSCAN


# Density-based clustering on the reduced features; the label -1 marks noise.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(customer_df_reduced)
clusters_dbscan = dbscan.labels_

customer_df['Cluster_DBSCAN'] = clusters_dbscan



In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Ward-linkage merge tree over the reduced features (Euclidean distances).
linked = linkage(customer_df_reduced, method='ward', metric='euclidean')



In [None]:
# Dendrogram of the Ward merge tree, with labelled axes.
dendrogram_options = dict(
    orientation='top',
    distance_sort='descending',
    show_leaf_counts=True,
)

fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(linked, ax=ax, **dendrogram_options)
ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Distance')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

# Candidate mixture sizes: 1 through 9 components.
n_components_range = range(1, 10)
bic_scores = []

# Fit one Gaussian mixture per candidate size and record its BIC.
for n_components in n_components_range:
    gmm = GaussianMixture(n_components=n_components, random_state=42).fit(customer_df_reduced)
    bic_scores.append(gmm.bic(customer_df_reduced))

fig, ax = plt.subplots()
ax.plot(n_components_range, bic_scores, marker='o')
ax.set_title('BIC Scores by Number of Components')
ax.set_xlabel('Number of Components')
ax.set_ylabel('BIC Score')
plt.show()

# The candidate with the lowest BIC is reported as the preferred size.
best_position = int(np.argmin(bic_scores))
optimal_n_components = n_components_range[best_position]
print(f"Optimal number of components: {optimal_n_components}")


Feature Selection
High Correlation Feature Removal: Identifies highly correlated features (absolute correlation > 0.8) as candidates for removal, because multicollinearity can distort the results of some models. The code reports these features but does not drop them.
Scaling
Standard Scaling: Applies StandardScaler to normalize the numeric columns of customer_df, ensuring that each feature contributes equally to the analysis.
Dimensionality Reduction
PCA (Principal Component Analysis): Reduces the dimensionality of the scaled data while retaining 95% of the variance, making the dataset easier to work with and less prone to overfitting.
Clustering with KMeans
KMeans Clustering: Partitions the data into 5 clusters using the PCA-reduced features, then analyzes the characteristics of each cluster.
Hierarchical Clustering
Dendrogram: Visualizes the results of hierarchical clustering, providing insights into how the dataset could be grouped at various levels of granularity.
DBSCAN Clustering
DBSCAN (Density-Based Spatial Clustering of Applications with Noise): Applies DBSCAN to identify core samples of high density and expand clusters from them, useful for data with clusters of similar density.
Gaussian Mixture Models (GMM)
BIC for GMM: Uses the Bayesian Information Criterion (BIC) to determine the optimal number of components for a Gaussian Mixture Model, balancing model complexity with goodness of fit.

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Five-cluster agglomerative model; change n_clusters to suit the analysis.
agglo = AgglomerativeClustering(n_clusters=5)
agglo.fit(customer_df_reduced)
clusters_agglo = agglo.labels_

customer_df['Cluster_Agglo'] = clusters_agglo

# Score the partition in the same space it was fitted on.
silhouette_agglo = silhouette_score(X=customer_df_reduced, labels=clusters_agglo)
print(f"Silhouette Score for Agglomerative Clustering: {silhouette_agglo}")


In [None]:
from sklearn.cluster import AffinityPropagation

# Affinity propagation chooses the number of clusters itself; only the seed is fixed.
affinity = AffinityPropagation(random_state=42)
affinity.fit(customer_df_reduced)
clusters_affinity = affinity.labels_

customer_df['Cluster_Affinity'] = clusters_affinity

silhouette_affinity = silhouette_score(X=customer_df_reduced, labels=clusters_affinity)
print(f"Silhouette Score for Affinity Propagation: {silhouette_affinity}")


In [None]:
from sklearn.cluster import MeanShift

# Mean shift with library defaults.
mean_shift = MeanShift()
mean_shift.fit(customer_df_reduced)
clusters_mean_shift = mean_shift.labels_

customer_df['Cluster_MeanShift'] = clusters_mean_shift

silhouette_mean_shift = silhouette_score(X=customer_df_reduced, labels=clusters_mean_shift)
print(f"Silhouette Score for Mean Shift: {silhouette_mean_shift}")


In [None]:
from scipy.cluster.hierarchy import fcluster


# Cut the existing merge tree at a fixed linkage distance rather than at a
# fixed number of clusters.
distance_threshold = 50
clusters = fcluster(linked, t=distance_threshold, criterion='distance')

customer_df['Cluster_Labels'] = clusters


In [None]:
from sklearn.metrics import silhouette_score

# First two reduced dimensions, coloured by K-Means label.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    customer_df_reduced[:, 0],
    customer_df_reduced[:, 1],
    c=customer_df['Cluster'],
    cmap='viridis',
)
ax.set_title('Clusters visualization in 2D')
plt.show()

# Silhouette of the K-Means and the hierarchical partitions.
silhouette_kmeans = silhouette_score(X=customer_df_reduced, labels=customer_df['Cluster'])
silhouette_hc = silhouette_score(X=customer_df_reduced, labels=customer_df['Cluster_HC'])

print(f"Silhouette Score for K-Means: {silhouette_kmeans}")
print(f"Silhouette Score for Hierarchical Clustering: {silhouette_hc}")


In [None]:
# First two reduced dimensions, coloured by DBSCAN label.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    customer_df_reduced[:, 0],
    customer_df_reduced[:, 1],
    c=customer_df['Cluster_DBSCAN'],
    cmap='viridis',
    s=50,
)
ax.set_title('DBSCAN Clusters visualization in 2D')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()


In [None]:
from sklearn.neighbors import NearestNeighbors

# k-distance curve used to eyeball a DBSCAN eps: for every point take the
# distance in the last of its five neighbour slots, then sort ascending.
# NOTE(review): the query set is the fitted set, so each point is presumably
# returned as its own nearest neighbour - confirm the axis label's "5th".
nearest_neighbors = NearestNeighbors(n_neighbors=5)
neighbors = nearest_neighbors.fit(customer_df_reduced)
distances, indices = neighbors.kneighbors(customer_df_reduced)
farthest_slot = distances[:, -1]
distances = np.sort(farthest_slot)

fig, ax = plt.subplots()
ax.plot(distances)
ax.set_title('k-Distance Graph')
ax.set_xlabel('Points sorted by distance')
ax.set_ylabel('5th Nearest Neighbor Distance')
plt.show()




In [None]:
from sklearn.decomposition import PCA

# Project only the behavioural features. Dropping just 'Cluster' would feed
# every other label column added to customer_df so far (Cluster_HC,
# Cluster_DBSCAN, ...) into PCA as if they were customer features.
feature_columns = ['TotalSpend', 'PurchaseFrequency', 'Recency', 'AverageBasketValue']

# NOTE(review): this rebinds `pca` and `customer_df_reduced` to a 2-D projection
# of the *unscaled* features. Every later cell that reads customer_df_reduced
# works in this space, not in the scaled space K-Means was fitted on.
pca = PCA(n_components=2)
customer_df_reduced = pca.fit_transform(customer_df[feature_columns])

plt.figure(figsize=(10, 7))
plt.scatter(customer_df_reduced[:, 0], customer_df_reduced[:, 1], c=customer_df['Cluster'], cmap='viridis', s=50, alpha=0.6)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('Customer Segments')
plt.colorbar(label='Cluster Label')
plt.show()


In [None]:
from sklearn.cluster import DBSCAN


# Re-run DBSCAN with a much larger neighbourhood radius.
dbscan = DBSCAN(eps=4200, min_samples=5)
dbscan.fit(customer_df_reduced)
clusters_dbscan = dbscan.labels_

customer_df['Cluster_DBSCAN'] = clusters_dbscan

# The label -1 marks noise; it is not counted as a cluster.
is_noise = clusters_dbscan == -1
n_clusters = len(set(clusters_dbscan) - {-1})
n_noise = int(is_noise.sum())

print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")

# Silhouette needs at least two clusters; score the non-noise points only.
if n_clusters > 1:
    silhouette_dbscan = silhouette_score(customer_df_reduced[~is_noise], clusters_dbscan[~is_noise])
    print(f"Silhouette Score for DBSCAN: {silhouette_dbscan}")
else:
    print("Not enough clusters for Silhouette Score.")


In [None]:
from sklearn.metrics import silhouette_score

# Silhouette of every stored partition, all measured on the current
# customer_df_reduced coordinates.
silhouette_kmeans = silhouette_score(X=customer_df_reduced, labels=customer_df['Cluster'])
print(f"Silhouette Score for K-Means: {silhouette_kmeans}")

silhouette_hc = silhouette_score(X=customer_df_reduced, labels=customer_df['Cluster_HC'])
print(f"Silhouette Score for Hierarchical Clustering: {silhouette_hc}")

# DBSCAN: ignore noise (-1) and only score when two or more clusters remain.
dbscan_labels = customer_df['Cluster_DBSCAN']
is_clustered = dbscan_labels != -1
if dbscan_labels[is_clustered].nunique() > 1:
    silhouette_dbscan = silhouette_score(
        customer_df_reduced[is_clustered.to_numpy()],
        dbscan_labels[is_clustered],
    )
    print(f"Silhouette Score for DBSCAN: {silhouette_dbscan}")
else:
    print("DBSCAN did not form distinct clusters, so Silhouette Score is not applicable.")

silhouette_agglo = silhouette_score(X=customer_df_reduced, labels=customer_df['Cluster_Agglo'])
print(f"Silhouette Score for Agglomerative Clustering: {silhouette_agglo}")

silhouette_affinity = silhouette_score(X=customer_df_reduced, labels=customer_df['Cluster_Affinity'])
print(f"Silhouette Score for Affinity Propagation: {silhouette_affinity}")

silhouette_mean_shift = silhouette_score(X=customer_df_reduced, labels=customer_df['Cluster_MeanShift'])
print(f"Silhouette Score for Mean Shift: {silhouette_mean_shift}")


Feature Selection
High Correlation Feature Removal: Identifies highly correlated features (absolute correlation > 0.8) as candidates for removal, because multicollinearity can distort the results of some models. The code reports these features but does not drop them.
Scaling
Standard Scaling: Applies StandardScaler to normalize the numeric columns of customer_df, ensuring that each feature contributes equally to the analysis.
Dimensionality Reduction
PCA (Principal Component Analysis): Reduces the dimensionality of the scaled data while retaining 95% of the variance, making the dataset easier to work with and less prone to overfitting.
Clustering with KMeans
KMeans Clustering: Partitions the data into 5 clusters using the PCA-reduced features, then analyzes the characteristics of each cluster.
Hierarchical Clustering
Dendrogram: Visualizes the results of hierarchical clustering, providing insights into how the dataset could be grouped at various levels of granularity.
DBSCAN Clustering
DBSCAN (Density-Based Spatial Clustering of Applications with Noise): Applies DBSCAN to identify core samples of high density and expand clusters from them, useful for data with clusters of similar density.
Gaussian Mixture Models (GMM)
BIC for GMM: Uses the Bayesian Information Criterion (BIC) to determine the optimal number of components for a Gaussian Mixture Model, balancing model complexity with goodness of fit.

In [None]:
# Centroids as fitted by K-Means, in the feature space the model was trained on.
centroids = kmeans.cluster_centers_
print("Cluster centroids:\n", centroids)

# customer_df_reduced is reassigned between the K-Means fit and this cell, so
# the fitted centroids are not guaranteed to share the plotted coordinate
# system. Place the markers at the mean plotted position of each cluster's
# members, which is correct whatever projection customer_df_reduced holds.
plot_xy = pd.DataFrame(
    customer_df_reduced[:, :2],
    columns=['pc1', 'pc2'],
    index=customer_df.index,
)
plot_centroids = plot_xy.groupby(customer_df['Cluster']).mean()

plt.figure(figsize=(10, 8))
plt.scatter(customer_df_reduced[:, 0], customer_df_reduced[:, 1], c=customer_df['Cluster'], cmap='viridis', s=50)
plt.scatter(plot_centroids['pc1'], plot_centroids['pc2'], c='red', s=200, alpha=0.5, marker='X')  # Mark centroids
plt.title('K-Means Clusters Visualization in 2D PCA Space')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

for i in range(kmeans.n_clusters):
    cluster_members = customer_df[customer_df['Cluster'] == i]
    print(f"\nCluster {i} characteristics:")
    print(cluster_members[numeric_cols].describe())  # Summary statistics for numerical features in each cluster


Cluster 0
Size: Large (2904 members)
Total Spend: Moderate average spend, with a wide range, indicating a mix of spending behaviors.
Purchase Frequency: Moderate, suggesting occasional to regular purchasers.
Recency: A wider range in recency, with some customers not having purchased recently.
Average Basket Value: Varied, with some high-value purchases.
This cluster might represent a "General" segment with a broad mix of behaviors but leaning towards moderate spend and frequency.

Cluster 1
Size: Very small (1 member)
Total Spend: Extremely high, suggesting this might be an outlier or a very high-value customer.
Purchase Frequency: Extremely high, indicating frequent transactions.
Recency: Purchased very recently.
Average Basket Value: High.
Given its size, this cluster could be an "Outlier" or "High-Value" segment, possibly representing bulk purchases or institutional buying.

Cluster 2
Size: Medium (815 members)
Total Spend: Lower average spend, indicating smaller transactions.
Purchase Frequency: Lower, suggesting less frequent purchases.
Recency: More recent purchases.
Average Basket Value: Moderate, with some higher-value baskets.
This cluster might be "Occasional Shoppers" with recent, less frequent, and smaller transactions.

Cluster 3
Size: Very small (2 members)
Total Spend: High, likely indicating significant transactions.
Purchase Frequency: Very low, suggesting one-off or rare purchases.
Recency: Least recent purchasers.
Average Basket Value: Extremely high, suggesting premium or bulk purchases.
Like Cluster 1, this is likely an "Outlier" or "Premium" segment, possibly representing large, infrequent transactions.

Cluster 4
Size: Medium (591 members)
Total Spend: Low, indicating smaller transaction sizes.
Purchase Frequency: Low, suggesting infrequent purchases.
Recency: Least recent, indicating a lapse since the last purchase.
Average Basket Value: Relatively low, consistent with smaller, infrequent transactions.

In [None]:
from sklearn.manifold import TSNE

# Seeded 2-D t-SNE embedding of the reduced features, coloured by K-Means label.
tsne = TSNE(n_components=2, random_state=42)
customer_df_tsne = tsne.fit_transform(customer_df_reduced)

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    customer_df_tsne[:, 0],
    customer_df_tsne[:, 1],
    c=customer_df['Cluster'],
    cmap='viridis',
    s=50,
)
ax.set_title('Clusters Visualization with t-SNE')
plt.show()


In [None]:
# Rich-display summary statistics for the members of each K-Means cluster.
for i in range(kmeans.n_clusters):
    print(f"\nProfile for Cluster {i}:")
    cluster_members = customer_df.loc[customer_df['Cluster'].eq(i)]
    display(cluster_members.describe())

In [None]:
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score

# Two internal validity indices for the K-Means partition.
kmeans_labels = customer_df['Cluster']

db_index = davies_bouldin_score(customer_df_reduced, kmeans_labels)
print(f"Davies-Bouldin Index: {db_index}")

ch_index = calinski_harabasz_score(customer_df_reduced, kmeans_labels)
print(f"Calinski-Harabasz Index: {ch_index}")


In [None]:
from sklearn.model_selection import KFold

# Five shuffled, seeded folds: fit K-Means on the training rows, assign the
# held-out rows to the fitted centroids, and score that assignment.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(customer_df_reduced):
    X_train = customer_df_reduced[train_index]
    X_test = customer_df_reduced[test_index]

    kmeans_cv = KMeans(n_clusters=5, random_state=42).fit(X_train)
    test_clusters = kmeans_cv.predict(X_test)

    silhouette_cv = silhouette_score(X=X_test, labels=test_clusters)
    print(f"Silhouette Score on test split: {silhouette_cv}")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

key_features = ['TotalSpend', 'PurchaseFrequency', 'Recency', 'AverageBasketValue']

# Per cluster: a rounded summary table, then one histogram (with KDE overlay)
# per key feature on a shared y-axis.
for i in range(customer_df['Cluster'].nunique()):
    print(f"\n--- Profile for Cluster {i} ---")
    cluster_data = customer_df.loc[customer_df['Cluster'].eq(i)]

    display(cluster_data[key_features].describe().round(2))

    fig, axes = plt.subplots(1, len(key_features), figsize=(20, 5), sharey=True)
    fig.suptitle(f'Distribution of Key Features in Cluster {i}')

    for position, feature in enumerate(key_features):
        ax = axes[position]
        sns.histplot(cluster_data[feature], kde=True, ax=ax)
        ax.set_title(feature)

    plt.tight_layout()
    plt.show()


In [None]:
# List the customer-table columns, including the label columns added by the clustering cells.
print(customer_df.columns)


In [None]:
n_clusters = customer_df['Cluster'].nunique()

# Population-wide 75th percentiles used as the "high" cut-offs. They do not
# depend on the cluster, so compute them once.
spend_q75 = customer_df['TotalSpend'].quantile(0.75)
frequency_q75 = customer_df['PurchaseFrequency'].quantile(0.75)
recency_q75 = customer_df['Recency'].quantile(0.75)

for i in range(n_clusters):
    cluster_data = customer_df.loc[customer_df['Cluster'].eq(i)]

    average_spend = cluster_data['TotalSpend'].mean()
    average_frequency = cluster_data['PurchaseFrequency'].mean()
    average_recency = cluster_data['Recency'].mean()
    average_basket_value = cluster_data['AverageBasketValue'].mean()

    print(f"\n--- Cluster {i} Insights ---")
    print(f"Average Spend: {average_spend:.2f}")
    print(f"Average Purchase Frequency: {average_frequency:.2f}")
    print(f"Average Recency: {average_recency:.2f} days ago")
    print(f"Average Basket Value: {average_basket_value:.2f}")

    # First matching rule wins: spend, then frequency, then recency.
    if average_spend > spend_q75:
        action = "Action: Consider premium offers or loyalty programs for high spenders."
    elif average_frequency > frequency_q75:
        action = "Action: Reward frequent shoppers with a loyalty program or exclusive deals."
    elif average_recency > recency_q75:
        action = "Action: Re-engage customers who haven't shopped recently with a 'We Miss You' campaign."
    else:
        action = "Action: Engage with standard promotions and aim to increase the basket value."
    print(action)


In [None]:
# Members of K-Means cluster 0 and their key-feature summary.
# NOTE(review): the segment name in the heading is hard-coded - confirm it matches this cluster's profile.
cluster_0 = customer_df.loc[customer_df['Cluster'].eq(0)]

print("General Shopper Characteristics:")
print(cluster_0.loc[:, ['TotalSpend', 'PurchaseFrequency', 'Recency', 'AverageBasketValue']].describe())

# Customers above the cluster's own 75th percentile of spend.
cluster_0_spend_q75 = cluster_0['TotalSpend'].quantile(0.75)
high_spend_customers = cluster_0.loc[cluster_0['TotalSpend'].gt(cluster_0_spend_q75)]
print(f"\nNumber of High Spend Customers in Cluster 0: {len(high_spend_customers)}")




In [None]:
# Members of K-Means cluster 1 and their key-feature summary.
# NOTE(review): the segment name in the heading is hard-coded - confirm it matches this cluster's profile.
cluster_1 = customer_df.loc[customer_df['Cluster'].eq(1)]

print("Occasional Shopper Characteristics:")
print(cluster_1.loc[:, ['TotalSpend', 'PurchaseFrequency', 'Recency', 'AverageBasketValue']].describe())



In [None]:
# Members of K-Means cluster 2 and their key-feature summary.
# NOTE(review): the segment name in the heading is hard-coded - confirm it matches this cluster's profile.
cluster_2 = customer_df.loc[customer_df['Cluster'].eq(2)]

print("High Roller Characteristics:")
print(cluster_2.loc[:, ['TotalSpend', 'PurchaseFrequency', 'Recency', 'AverageBasketValue']].describe())




In [None]:
# Members of K-Means cluster 3 and their key-feature summary.
# NOTE(review): the segment name in the heading is hard-coded - confirm it matches this cluster's profile.
cluster_3 = customer_df.loc[customer_df['Cluster'].eq(3)]

print("Bulk Buyer Characteristics:")
print(cluster_3.loc[:, ['TotalSpend', 'PurchaseFrequency', 'Recency', 'AverageBasketValue']].describe())




In [None]:
# Members of K-Means cluster 4 and their key-feature summary.
# NOTE(review): the segment name in the heading is hard-coded - confirm it matches this cluster's profile.
cluster_4 = customer_df.loc[customer_df['Cluster'].eq(4)]

print("Infrequent Shopper Characteristics:")
print(cluster_4.loc[:, ['TotalSpend', 'PurchaseFrequency', 'Recency', 'AverageBasketValue']].describe())




GPT
This code block continues with the analysis of the customer dataset, focusing on visualizing the clusters formed by the K-Means algorithm, using dimensionality reduction techniques for visualization, and characterizing and evaluating the clusters through various metrics. Let's break it down:

Visualizing Clusters
Cluster Centroids: Prints the coordinates of the centroids of the clusters formed by K-Means.
2D Visualization using PCA: Plots the clusters in a 2D space defined by the first two principal components, marking cluster centroids for visual reference.
t-SNE for Cluster Visualization
t-SNE (t-Distributed Stochastic Neighbor Embedding): A tool to visualize high-dimensional data in a lower-dimensional space. It's used here to further visualize the clusters in a 2D space, potentially revealing the structure at the global and local level.
Cluster Characterization
Key Features Summary: Prints descriptive statistics for numerical features within each cluster, helping to understand the defining characteristics of each cluster.
Cluster Evaluation
Davies-Bouldin Index: A metric evaluating intra-cluster similarity and inter-cluster differences. Lower values indicate better clustering.
Calinski-Harabasz Index: Measures the cluster validity based on the ratio of between-cluster variance to within-cluster variance. Higher values typically indicate better-defined clusters.
Cross-Validation with KMeans
K-Fold Cross-Validation: Splits the dataset into k shuffled folds. For each split, K-Means is fitted on the training folds and used to assign the held-out fold to clusters, and that assignment is evaluated with the silhouette score.
Distribution of Key Features in Clusters
Histograms: Visualizes the distribution of key features like 'TotalSpend', 'PurchaseFrequency', 'Recency', and 'AverageBasketValue' across each cluster using histograms.
Cluster Insights and Actions
Cluster Insights: Calculates average values of key features for each cluster and suggests actions based on these insights, such as targeting high spenders with premium offers or re-engaging infrequent shoppers.
Detailed Cluster Analysis
General Characteristics: Provides descriptive statistics for each cluster, identifying patterns such as high spend, frequent purchases, recent activity, and average basket value.
Specific Cluster Analysis
Explores characteristics of specific clusters, identifying unique traits such as high spending, occasional shopping, bulk buying, and infrequent shopping behaviors, and suggests targeted strategies for each group.

Market Basket Analysis

In [None]:
!pip install mlxtend

In [None]:
!pip install openpyxl

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules



In [None]:
# Fresh, unmodified copy of the transaction export for the basket analysis.
df2 = pd.read_excel(io='data (2).xlsx')


In [None]:
# First five rows of the raw transactions.
print(df2.head(n=5))

In [None]:
from mlxtend.preprocessing import TransactionEncoder

# Lines with no description carry no product information. Left in, str() would
# turn each missing value into the literal text 'nan', which the encoder would
# then treat as a real product appearing in many baskets.
# NOTE(review): returns/cancellations are not filtered here - confirm whether
# they should count as purchases in the basket analysis.
basket_lines = df2.dropna(subset=['Description'])

# One basket per invoice: the list of item descriptions on that invoice,
# each coerced to text because descriptions are not guaranteed to be strings.
transactions_str = (
    basket_lines.groupby('Invoice')['Description']
    .apply(lambda items: [str(item) for item in items])
    .tolist()
)

# Encode the baskets as a boolean invoice x item matrix.
te = TransactionEncoder()
te_ary = te.fit(transactions_str).transform(transactions_str)

one_hot_df = pd.DataFrame(te_ary, columns=te.columns_)



In [None]:
from mlxtend.frequent_patterns import apriori
# Itemsets present in at least 1% of invoices, reported by item name.
frequent_itemsets = apriori(one_hot_df, use_colnames=True, min_support=0.01)


In [None]:
from mlxtend.frequent_patterns import association_rules
# Keep every rule whose lift is at least 1, then show the headline metrics.
rules = association_rules(frequent_itemsets, min_threshold=1, metric="lift")
print(rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']])


In [None]:
from mlxtend.frequent_patterns import apriori

from mlxtend.frequent_patterns import association_rules

# Thresholds for this (stricter) pass.
min_support = 0.05
min_confidence = 0.5
min_lift = 1.2

# Frequent itemsets at the support threshold.
frequent_itemsets = apriori(one_hot_df, use_colnames=True, min_support=min_support)

print("Frequent Itemsets:")
print(frequent_itemsets)

# Rules meeting the confidence threshold, then restricted to lift above min_lift.
rules = association_rules(frequent_itemsets, min_threshold=min_confidence, metric="confidence")
rules = rules.loc[rules['lift'].gt(min_lift)]

print("\nAssociation Rules:")
print(rules)


In [None]:
# Relax the support threshold to 3% of invoices.
min_support = 0.03
frequent_itemsets = apriori(one_hot_df, use_colnames=True, min_support=min_support)


In [None]:
# Relaxed rule thresholds: confidence of at least 0.2 and lift strictly above 1.0.
min_confidence = 0.2
min_lift = 1.0

rules = association_rules(frequent_itemsets, min_threshold=min_confidence, metric="confidence")
rules = rules.loc[rules['lift'].gt(min_lift)]


In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules


In [None]:
# Itemsets present in at least 1% of invoices ...
frequent_itemsets = apriori(one_hot_df, use_colnames=True, min_support=0.01)

# ... and every rule derived from them with lift of at least 1.
rules = association_rules(frequent_itemsets, min_threshold=1, metric="lift")


In [None]:
# Lift the column-width cap so long itemsets are printed in full.
pd.options.display.max_colwidth = None
print(rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']])


Initial Setup and Data Loading
Importing Libraries: Necessary libraries like pandas and mlxtend (Machine Learning Extensions) are imported. mlxtend is particularly useful for its implementation of the Apriori algorithm and functions to generate association rules.
Data Loading: Transactional data is loaded from an Excel file using pd.read_excel. This data likely includes transactions where each row represents an item in a transaction, identified by an 'Invoice' number and described by a 'Description'.
Data Preprocessing for MBA
Transaction List Creation: The transactions are grouped by 'Invoice', and the 'Description' of items in each transaction is aggregated into lists.
Transaction Encoding: The TransactionEncoder from mlxtend.preprocessing is applied to these lists to create a one-hot encoded matrix, where each column represents an item, and each row represents a transaction, with 1 indicating the presence of the item in the transaction.
Conversion to DataFrame: The encoded matrix is converted back to a pandas DataFrame, creating a binary matrix suitable for the Apriori algorithm.
Applying the Apriori Algorithm
Frequent Itemsets Generation: Using mlxtend.frequent_patterns.apriori, frequent itemsets are identified based on a specified min_support threshold. These itemsets are combinations of items that appear together in transactions with frequency above the threshold.
Association Rules Generation: From the frequent itemsets, mlxtend.frequent_patterns.association_rules generates association rules that meet specified thresholds for metrics like confidence and lift. These rules indicate potential associations between items, with metrics providing insight into the strength and significance of these associations.
Iterative Analysis
The process is iteratively refined by adjusting parameters like min_support, min_confidence, and min_lift to explore different levels of item association and rule strength.
Output and Interpretation
Rules Display: The resulting association rules, along with their support, confidence, and lift metrics, are displayed. These metrics provide insights into the prevalence of rule antecedents and consequents in the dataset, the reliability of the rules, and the strength of the association compared to random chance.

In [None]:
# Rules that are both reliable (confidence > 0.5) and clearly better than
# chance (lift > 1.2).
is_confident = rules['confidence'].gt(0.5)
is_lifted = rules['lift'].gt(1.2)
high_value_rules = rules.loc[is_confident & is_lifted]

print(high_value_rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']])


In [None]:
# Spell out the first high-value rule field by field.
rule = high_value_rules.iloc[0]
antecedents, consequents = rule['antecedents'], rule['consequents']

print(f"Antecedents: {antecedents}")
print(f"Consequents: {consequents}")
print(f"Support: {rule['support']}")
print(f"Confidence: {rule['confidence']}")
print(f"Lift: {rule['lift']}")


In [None]:
from IPython.display import display
# Render the filtered rules with the notebook's rich (HTML table) display.
display(high_value_rules)


In [None]:
# Print every rule as a small readable record. Rule numbers are the row's
# index label plus one.
for index, rule in rules.iterrows():
    antecedents = ', '.join(map(str, rule['antecedents']))
    consequents = ', '.join(map(str, rule['consequents']))

    print(f"Rule #{index + 1}")
    print(f"Antecedents: {antecedents}")
    print(f"Consequents: {consequents}")
    print(f"Support: {rule['support']:.4f}")
    print(f"Confidence: {rule['confidence']:.4f}")
    print(f"Lift: {rule['lift']:.4f}")
    print("-" * 50)


In [None]:
pip install networkx matplotlib


In [None]:
import altair as alt
import pandas as pd

# Plot-friendly copy of the rules: itemsets flattened to comma-separated text,
# plus a combined "A -> B" label for tooltips.
simple_rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].copy()
for side in ('antecedents', 'consequents'):
    simple_rules[side] = simple_rules[side].map(', '.join)
simple_rules['rule'] = simple_rules['antecedents'] + " -> " + simple_rules['consequents']

# The twenty rules with the highest lift.
top_rules = simple_rules.nlargest(20, 'lift')

# Bubble chart: lift against confidence, bubble size encodes support.
bubbles = alt.Chart(top_rules).mark_circle(size=100)
chart = bubbles.encode(
    x='lift',
    y='confidence',
    size='support',
    color='lift',
    tooltip=['rule', 'lift', 'confidence', 'support'],
).properties(
    width=700,
    height=400,
    title='Top 20 Association Rules by Lift',
)

chart.display()


In [None]:
import altair as alt

# The ten highest-lift rules, reduced to label and lift.
top_lift_rules = simple_rules.nlargest(10, 'lift').loc[:, ['rule', 'lift']]

# Horizontal bars ordered by descending lift.
bars = alt.Chart(top_lift_rules).mark_bar()
chart = bars.encode(
    x=alt.X('lift', sort=None),
    y=alt.Y('rule', sort='-x'),
    color='lift',
    tooltip=['rule', 'lift'],
).properties(
    width=700,
    height=300,
    title='Top 10 Association Rules by Lift',
)

chart.display()


In [None]:
!pip install plotly

In [None]:
import plotly.express as px


In [None]:


# Interactive support-vs-confidence scatter of the rules, coloured by lift.
#
# The item sets are converted to comma-separated text on a *copy* of `rules`.
# Overwriting the columns of `rules` itself made this cell non-re-runnable: on
# a second run the already-joined strings were split into single characters.
scatter_rules = rules.copy()
for side in ('antecedents', 'consequents'):
    scatter_rules[side] = scatter_rules[side].map(
        # Leave values that are already text untouched; str() guards against
        # non-string item labels.
        lambda items: items if isinstance(items, str) else ', '.join(map(str, items))
    )

fig = px.scatter(scatter_rules, x='support', y='confidence', color='lift',
                 hover_data=['antecedents', 'consequents'])
fig.update_layout(title='Interactive Plot of Association Rules',
                  xaxis_title='Support',
                  yaxis_title='Confidence')
fig.show()

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Mine frequent itemsets with FP-Growth (minimum support 0.01) from the
# one-hot encoded frame built earlier, keeping column names as item labels.
frequent_itemsets = fpgrowth(one_hot_df, min_support=0.01, use_colnames=True)

# Derive association rules, keeping those with lift >= 1.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Narrow down to rules with lift >= 3 and confidence >= 0.3.
strong_lift = rules['lift'] >= 3
strong_confidence = rules['confidence'] >= 0.3
filtered_rules = rules[strong_lift & strong_confidence]

report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(filtered_rules[report_columns])


In [None]:
import pandas as pd


# Keep only rules whose lift exceeds 10 and rank them strongest first.
high_lift_rules = rules[rules['lift'] > 10]

sorted_high_lift_rules = high_lift_rules.sort_values(by='lift', ascending=False)

print("Top Association Rules by Lift Value:")
print(sorted_high_lift_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())

if sorted_high_lift_rules.empty:
    # Without this guard `.iloc[0]` raises an opaque IndexError whenever the
    # lift filter leaves no rows.
    print("\nNo association rules with lift > 10 were found; nothing to interpret.")
else:
    # Narrate the single strongest rule as a worked example.
    top_rule = sorted_high_lift_rules.iloc[0]
    # str() keeps the join working even if an item label is not a string.
    antecedents = ', '.join(map(str, top_rule['antecedents']))
    consequents = ', '.join(map(str, top_rule['consequents']))
    print("\nInterpretation Example:")
    print(f"Rule: {antecedents} -> {consequents}")
    print(f"Lift: {top_rule['lift']:.2f} suggests a very strong association between these items.")
    print("This could indicate a thematic connection appealing to customer preferences for cohesive, themed purchases.")
    print("Considering this rule's support and confidence, it strikes a balance between being a common enough pattern (support) and a reliable predictor (confidence).")

# NOTE: global pandas option; it does not affect the output printed above,
# only DataFrames displayed after this point are shown without row truncation.
pd.set_option('display.max_rows', None)


In [None]:
import plotly.graph_objects as go

# Add display-friendly text versions of the item sets next to the originals.
for side in ('antecedents', 'consequents'):
    sorted_high_lift_rules[f'{side}_str'] = sorted_high_lift_rules[side].apply(
        lambda items: ', '.join(list(items))
    )

# Only the first five (highest-lift) rules are shown in the table.
table_rows = sorted_high_lift_rules.head()

table_header = dict(
    values=['Antecedents', 'Consequents', 'Support', 'Confidence', 'Lift'],
    fill_color='paleturquoise',
    align='left',
)
table_cells = dict(
    values=[
        table_rows['antecedents_str'],
        table_rows['consequents_str'],
        table_rows['support'].map("{:.3f}".format),
        table_rows['confidence'].map("{:.3f}".format),
        table_rows['lift'].map("{:.2f}".format),
    ],
    fill_color='lavender',
    align='left',
)

fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

# The strongest rule is used for the explanatory note below the table.
top_rule = sorted_high_lift_rules.iloc[0]

note_lines = [
    "Interpretation Example:",
    f"Rule: {top_rule['antecedents_str']} -> {top_rule['consequents_str']}",
    f"Lift: {top_rule['lift']:.2f} suggests a very strong association between these items.",
    "This could indicate a thematic connection appealing to customer preferences for cohesive, themed purchases.",
    "Considering this rule's support and confidence, it strikes a balance between being a common enough pattern (support) "
    "and a reliable predictor (confidence).",
]

# Anchor the note in paper coordinates, below the table (negative y).
fig.add_annotation(dict(
    showarrow=False,
    xref='paper', yref='paper',
    x=0, y=-0.2,
    text="<br>".join(note_lines),
    align='left'
))

# Extra bottom margin leaves room for the annotation.
fig.update_layout(
    margin=dict(t=30, b=100),
    title_text="Top Association Rules by Lift Value"
)

fig.show()


In [None]:
# Table of the five highest-lift rules, formatting the columns inline.
# Relies on `go`, `sorted_high_lift_rules` and `top_rule` from earlier cells.
fig = go.Figure(data=[go.Table(
    header=dict(values=['Antecedents', 'Consequents', 'Support', 'Confidence', 'Lift'],
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[
                       sorted_high_lift_rules.antecedents.head().apply(lambda x: ', '.join(list(x))),
                       sorted_high_lift_rules.consequents.head().apply(lambda x: ', '.join(list(x))),
                       sorted_high_lift_rules.support.head().apply(lambda x: f"{x:.3f}"),
                       sorted_high_lift_rules.confidence.head().apply(lambda x: f"{x:.3f}"),
                       sorted_high_lift_rules.lift.head().apply(lambda x: f"{x:.2f}")
               ],
               fill_color='lavender',
               align='left'))
])

# Build the rule text from the top rule's item sets. The module-level
# `antecedents` / `consequents` variables are already comma-joined strings, so
# joining `list(antecedents)` spelled the rule out one character at a time.
# The ', ' separator matches the table cells above.
rule_lhs = ', '.join(map(str, top_rule['antecedents']))
rule_rhs = ', '.join(map(str, top_rule['consequents']))

fig.add_annotation(dict(
    showarrow=False,
    xref='paper', yref='paper',
    x=0, y=-0.2,
    text=("Interpretation Example:<br>"
          f"Rule: {rule_lhs} -> {rule_rhs}<br>"
          f"Lift: {top_rule['lift']:.2f} suggests a very strong association between these items.<br>"
          "This could indicate a thematic connection appealing to customer preferences for cohesive, themed purchases.<br>"
          "Considering this rule's support and confidence, it strikes a balance between being a common enough pattern (support) "
          "and a reliable predictor (confidence)."),
    align='left'
))

# Extra bottom margin leaves room for the annotation below the table.
fig.update_layout(
    margin=dict(t=30, b=100), 
    title_text="Top Association Rules by Lift Value"
)

fig.show()


In [None]:
# `plotly.graph_objects` is the module name used by the earlier table cell;
# `graph_objs` is its legacy alias.
import plotly.graph_objects as go
import pandas as pd


# Build the text labels on a copy. Overwriting the item-set columns of `rules`
# in place made this cell non-re-runnable (already-joined strings were split
# into characters) and silently changed the data seen by earlier cells on a
# re-run.
rules_3d = rules.copy()
for side in ('antecedents', 'consequents'):
    rules_3d[side] = rules_3d[side].map(
        # Leave values that are already text untouched; str() guards against
        # non-string item labels.
        lambda items: items if isinstance(items, str) else ', '.join(map(str, items))
    )
rules_3d['label'] = rules_3d['antecedents'] + " -> " + rules_3d['consequents']

# One marker per rule: support, confidence and lift on the three axes, with
# lift also mapped to colour and the "A -> B" label shown on hover.
fig = go.Figure(data=[go.Scatter3d(
    x=rules_3d['support'],
    y=rules_3d['confidence'],
    z=rules_3d['lift'],
    text=rules_3d['label'],
    mode='markers',
    marker=dict(
        size=7,
        color=rules_3d['lift'],
        colorscale='Viridis',
        opacity=0.8
    )
)])


fig.update_layout(
    title='3D Scatter Plot of Association Rules',
    scene=dict(
        xaxis_title='Support',
        yaxis_title='Confidence',
        zaxis_title='Lift'
    ),
    # A zero top margin leaves no room for the title set above.
    margin=dict(l=0, r=0, b=0, t=40)
)

fig.show()


High-Value Rules Filtering
Filters association rules to retain those with high confidence and lift, indicating strong and potentially useful relationships between itemsets.
Rule Analysis
Selects the first high-value rule for detailed analysis, breaking it down into its antecedents (the "if" part), its consequents (the "then" part), and its support, confidence, and lift, to give a deeper understanding of an individual rule.
Visualization with Altair
Uses Altair to visualize the top association rules by lift, highlighting the most significant relationships discovered in the dataset.
Visualization with Plotly
Employs Plotly to create interactive visualizations, such as a scatter plot and a 3D scatter plot, enabling a dynamic exploration of the rules based on support, confidence, and lift.
Generates a table visualization of the top association rules for a clear and concise presentation of the findings.
FP-Growth Algorithm
Applies the FP-Growth algorithm, an efficient alternative to the Apriori algorithm for finding frequent itemsets, to the one-hot encoded dataset, followed by the generation of association rules from these itemsets.
Detailed Rule Interpretation
Provides a detailed interpretation of a top rule by analyzing its lift value and the implications of the antecedents leading to the consequents, aiding in the understanding of customer purchase behavior and thematic connections between items.
Visualization Enhancements
Enhances visualizations with annotations and table formats for better interpretation of the top association rules, making it easier to communicate findings to a non-technical audience.


References

Altair Development Team (2024) Altair: Interactive Statistical Visualizations for Python. Available at: https://altair-viz.github.io (Accessed: 02 April 2024).

Han, J., Pei, J., and Kamber, M. (2011) Data Mining: Concepts and Techniques. 3rd edn. Morgan Kaufmann Publishers.

Harris, C.R., Millman, K.J., van der Walt, S.J. et al. (2020) 'Array programming with NumPy', Nature, 585, pp. 357–362. Available at: https://doi.org/10.1038/s41586-020-2649-2 (Accessed: 02 April 2024).

Hunter, J.D. (2007) 'Matplotlib: A 2D Graphics Environment', Computing in Science & Engineering, 9(3), pp. 90-95. Available at: https://doi.org/10.1109/MCSE.2007.55 (Accessed: 02 April 2024).

McKinney, W. (2010) Data Structures for Statistical Computing in Python. Proceedings of the 9th Python in Science Conference, pp. 51-56. Available at: https://conference.scipy.org/proceedings/scipy2010/mckinney.html (Accessed: 02 April 2024).

Pedregosa, F., Varoquaux, G., Gramfort, A., Michel, V., Thirion, B., Grisel, O., Blondel, M., Prettenhofer, P., Weiss, R., Dubourg, V., Vanderplas, J., Passos, A., Cournapeau, D., Brucher, M., Perrot, M., and Duchesnay, E. (2011) 'Scikit-learn: Machine Learning in Python', Journal of Machine Learning Research, 12, pp. 2825-2830. Available at: http://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html (Accessed: 02 April 2024).

Raschka, S. (2015) Python Machine Learning. Packt Publishing Ltd.

Seaborn Development Team (2024) Seaborn: Statistical Data Visualization. Available at: https://seaborn.pydata.org/ (Accessed: 02 April 2024).

Waskom, M. et al. (2024) mwaskom/seaborn: v0.11.0 (September 2020), Zenodo. Available at: https://doi.org/10.5281/zenodo.4019143 (Accessed: 02 April 2024).

