In [15]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from pathlib import Path
from mpl_toolkits.mplot3d import Axes3D

# Shared look-and-feel
# A fixed colour per cluster id plus one global seaborn theme, so every chart
# produced by this notebook is styled the same way and stays easy to compare.

PALETTE = {
    0: "#1f77b4",
    1: "#ff7f0e",
    2: "#2ca02c",
}  # cluster id -> hex colour, reused by every plot

# Global seaborn theme applied to all subsequent plots
sns.set_style("whitegrid")

import matplotlib.ticker as mtick
def fmt_dollars(ax, axis="x"):
    """Show major tick labels of one axis as whole dollars with thousands separators.

    Parameters
    ----------
    ax : matplotlib Axes whose ticks should be reformatted.
    axis : "x" (default, the original behaviour) or "y".

    Raises
    ------
    ValueError
        If ``axis`` is neither "x" nor "y".
    """
    if axis not in ("x", "y"):
        raise ValueError(f"axis must be 'x' or 'y', got {axis!r}")
    target = ax.xaxis if axis == "x" else ax.yaxis
    target.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))
def fmt_int(ax, axis="y"):
    """Show major tick labels of one axis as integers with thousands separators.

    Parameters
    ----------
    ax : matplotlib Axes whose ticks should be reformatted.
    axis : "y" (default, the original behaviour) or "x".

    Raises
    ------
    ValueError
        If ``axis`` is neither "x" nor "y".
    """
    if axis not in ("x", "y"):
        raise ValueError(f"axis must be 'x' or 'y', got {axis!r}")
    target = ax.yaxis if axis == "y" else ax.xaxis
    target.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))

# Segment "centers" in ORIGINAL units
def segment_centers(df):
    """Return the per-cluster mean of Age, Annual Income and Spending Score.

    The result is indexed by "Cluster" and has one column per feature, with the
    values left in the units of the input frame (no scaling is applied).
    """
    feature_cols = ["Age", "Annual Income ($)", "Spending Score (1-100)"]
    per_cluster = df.groupby("Cluster")[feature_cols]
    centers = per_cluster.mean()
    return centers.rename_axis("Cluster")

# Set up paths
# NOTE(review): figures are written relative to the current working directory
# (reports/figures) while the data is read from ../data — confirm this is intended.
FIGDIR = Path("reports/figures")
FIGDIR.mkdir(parents=True, exist_ok=True)

# Build input paths with pathlib rather than "..\data\..." string literals:
# backslashes in a normal string are escape sequences (e.g. "\d" is an invalid
# escape that newer Python versions warn about) and only work as separators on
# Windows. Path objects resolve correctly on every platform.
DATA_DIR = Path("..") / "data"

# Load data
raw = pd.read_csv(DATA_DIR / "Customers.csv")
assign = pd.read_csv(DATA_DIR / "processed" / "cluster_assignments.csv")

# Merge clustering results with original data
df = raw.merge(assign, on="CustomerID", how="left")

print("Data loaded and merged successfully.")
print("Shape:", df.shape)

# A left merge keeps customers that have no row in the assignments file; they
# end up with Cluster = NaN, so surface that instead of letting it pass silently.
n_unassigned = int(df["Cluster"].isna().sum())
if n_unassigned:
    print(f"Warning: {n_unassigned} customers have no cluster assignment.")

df.head()


Data loaded and merged successfully.
Shape: (2000, 9)


Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size,Cluster
0,1,Male,19,15000,39,Healthcare,1,4,2
1,2,Male,21,35000,81,Engineer,3,3,2
2,3,Female,20,86000,6,Engineer,1,1,2
3,4,Female,23,59000,77,Lawyer,0,2,2
4,5,Female,31,38000,40,Entertainment,2,6,2


In [16]:
# 2D Scatterplots
# Visual 1: Annual Income vs Spending Score
# Classic customer segmentation view showing spending behavior by income bracket.
# Cluster centers (X markers) represent average values per group.

# Explicit figure/axes so every call below targets this figure only.
fig, ax = plt.subplots(figsize=(8.5, 5))
sns.scatterplot(data=df, x="Annual Income ($)", y="Spending Score (1-100)",
                hue="Cluster", palette=PALETTE, alpha=0.8, s=30, ax=ax)
ax.set_title("Customer Segments: Annual Income vs Spending Score")
ax.set_xlabel("Annual Income ($)")
ax.set_ylabel("Spending Score (1-100)")

# Overlay segment "centers" (per-cluster means from segment_centers)
cent = segment_centers(df)
ax.scatter(
    cent["Annual Income ($)"], cent["Spending Score (1-100)"],
    marker="X", s=220,
    c=[PALETTE[k] for k in cent.index], edgecolor="white", linewidth=1.2,
    label="Segment center"
)

# Format & layout
# The margin must be set while x autoscaling is still active: set_xlim()
# switches autoscaling off, after which margins() no longer changes the view.
ax.margins(x=0.02)                     # tiny right margin
ax.set_xlim(left=0)                    # start x at $0
ax.set_ylim(0, 100)                    # fixed y-range for consistency across plots
plt.xticks(rotation=45, ha='right')
ax.locator_params(axis='x', nbins=6)   # fewer, cleaner x ticks
fmt_dollars(ax)                        # $ and comma formatting
ax.grid(alpha=0.25)                    # subtle grid

# Legend outside the plotting area
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
fig.savefig(FIGDIR / "clusters_income_spending.png", bbox_inches="tight")
plt.close(fig)

In [17]:
# Visual 2: Age vs Spending Score
# Shows how spending score varies with age, coloured by cluster.
# A light grid is kept for readability in presentations.

fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(
    data=df,
    x="Age",
    y="Spending Score (1-100)",
    hue="Cluster",
    palette=PALETTE,
    alpha=0.8,
    ax=ax,
)
ax.set(
    title="Customer Segments: Age vs Spending Score",
    xlabel="Age",
    ylabel="Spending Score (1-100)",
)

# Grid and legend are configured before the figure is written to disk
ax.grid(alpha=0.3)
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc="upper left")

fig.tight_layout()
fig.savefig(FIGDIR / "clusters_age_spending.png", bbox_inches="tight")
plt.close(fig)

In [18]:
# PCA Visualization (Dimensionality Reduction)
# Visual 3: PCA projection
# Projects three numerical dimensions (Age, Income, Spending Score) onto two
# principal components; the share of variance each one explains is shown in
# the plot title. Useful to visually check separation between customer clusters.

# PCA is already imported in the notebook's first cell, so it is not
# re-imported here; only the scaler is new to this cell.
from sklearn.preprocessing import StandardScaler

features = ["Age", "Annual Income ($)", "Spending Score (1-100)"]
X = df[features].dropna()

# Standardise the features so the dollar-scale income column does not
# dominate the components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit PCA
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Combine PCA results with clusters; X keeps df's index labels, so the
# cluster column is looked up by the same labels.
pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2"], index=X.index)
pca_df["Cluster"] = df.loc[X.index, "Cluster"].to_numpy()

var = pca.explained_variance_ratio_
total = var.sum()

# Explicit figure/axes so every call below targets this figure only.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="Cluster",
                palette=PALETTE, alpha=0.8, ax=ax)
ax.grid(alpha=0.3)
ax.axhline(0, color='grey', lw=0.7, alpha=0.5)   # reference line at PC2 = 0
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
ax.set_title(f"PCA 2D Projection (PC1 {var[0]:.0%}, PC2 {var[1]:.0%}, Total {total:.0%})")
fig.tight_layout()
fig.savefig(FIGDIR / "clusters_pca.png", bbox_inches="tight")
plt.close(fig)

In [19]:
# 3D Scatterplot
# Visual 4: 3D view of Age, Income, and Spending
# Provides an interactive-looking visualization for slides and storytelling.
# Camera adjusted for balanced perspective (elev=25, azim=40).

import matplotlib.patches as mpatches

# Rows without a cluster assignment have no PALETTE entry (PALETTE[nan] would
# raise KeyError and a NaN colour cannot be drawn), so leave them out.
plot_df = df.dropna(subset=["Cluster"])

fig = plt.figure(figsize=(9, 7))
ax = fig.add_subplot(111, projection="3d")

# One colour per point, looked up from the shared palette
colors = plot_df["Cluster"].map(PALETTE)

ax.scatter(
    plot_df["Age"],
    plot_df["Annual Income ($)"],
    plot_df["Spending Score (1-100)"],
    c=colors, s=40, alpha=0.6
)

ax.set_xlabel("Age")
ax.set_ylabel("Annual Income ($)")
ax.set_zlabel("Spending Score (1-100)")
ax.set_title("3D Visualization of Customer Clusters")

ax.view_init(elev=25, azim=40)

# The scatter call draws all clusters as one artist, so the legend is built
# by hand: one colour patch per cluster present in the data. int() keeps the
# label as "Cluster 0" even if the column was upcast to float.
patches = [mpatches.Patch(color=PALETTE[i], label=f"Cluster {int(i)}")
           for i in sorted(plot_df["Cluster"].unique())]
ax.legend(handles=patches, title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')

ax.grid(alpha=0.3)
ax.set_facecolor("#fafafa")

# NOTE(review): this subtitle is hard-coded and describes only clusters 0 and 1;
# verify it against the per-cluster averages whenever the clustering is re-run.
fig.suptitle(
    "Cluster 0: Young, high-income, high-spending; Cluster 1: Older, high-income, low-spending",
    fontsize=9, y=0.98, color="#555")
fig.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.9)
fig.savefig(FIGDIR / "clusters_3D_age_income_spending.png", bbox_inches="tight", pad_inches=0.3)
plt.close(fig)

In [20]:
# Cluster Profiles & Distribution

# Average attribute values per cluster, rounded to one decimal for display.
# Reuses segment_centers() so the bar charts and the scatter-plot centers
# come from the same aggregation instead of two copies of the groupby.
cluster_summary = segment_centers(df).round(1)

# Number of customers in each cluster, ordered by cluster id
counts = df["Cluster"].value_counts().sort_index()

# Visual 5: Per-metric averages
# Breaks down how clusters differ by each key numeric attribute.
# Individual charts prevent income scale from overshadowing other features.

for col, label in [("Annual Income ($)", "Average Annual Income ($)"),
                   ("Spending Score (1-100)", "Average Spending Score (1-100)"),
                   ("Age", "Average Age (years)")]:
    # Explicit figure/axes so each iteration draws on its own figure
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(x=cluster_summary.index, y=cluster_summary[col],
                hue=cluster_summary.index, dodge=False,
                palette=[PALETTE[i] for i in cluster_summary.index],
                ax=ax)

    # hue only drives the bar colours here, so the legend is redundant.
    # get_legend() returns None when no legend was drawn, so guard the removal
    # instead of calling .remove() unconditionally.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title(label)
    ax.set_xlabel("Cluster")
    ax.set_ylabel(label)

    # Format Income axis with $ and comma separators
    if "Income" in col:
        ax.yaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))

    # Add value labels above bars (bar i sits at x position i)
    for i, v in enumerate(cluster_summary[col].values):
        txt = f"{v:,.0f}" if "Income" in col else f"{v:.1f}"
        ax.text(i, v, txt, ha="center", va="bottom")

    # Add a bit of headroom so labels don't clip the frame
    ymax = cluster_summary[col].max()
    ax.set_ylim(0, ymax * 1.15)

    fig.tight_layout()
    fig.savefig(FIGDIR / f"avg_{col.replace(' ','_').replace('($)','dollars')}.png", bbox_inches="tight")
    plt.close(fig)

# Visual 6: Customer counts per cluster
# Every bar is annotated with its absolute count and its share of all customers.

fig, ax = plt.subplots(figsize=(6, 4))
counts.plot(
    kind="bar",
    ax=ax,
    rot=0,  # keep the cluster ids upright on the x-axis
    color=[PALETTE[cluster_id] for cluster_id in counts.index],
)

# Count + percentage label on top of each bar
total_customers = counts.sum()
for position, n_customers in enumerate(counts.values):
    share = n_customers / total_customers
    ax.text(position, n_customers, f"{n_customers} ({share:.1%})",
            ha="center", va="bottom")

# Headroom above the tallest bar so its label is not clipped
ax.set_ylim(0, counts.max() * 1.15)

ax.set(
    title="Customers per Cluster",
    xlabel="Cluster",
    ylabel="Number of Customers",
)
fmt_int(ax)

fig.tight_layout()
fig.savefig(FIGDIR / "cluster_counts.png", bbox_inches="tight")
plt.close(fig)

In [21]:
# Confirmation

print("All visualizations saved to:", FIGDIR.resolve())
# glob() yields entries in an arbitrary, OS-dependent order; sort them so the
# listing is stable from run to run and across machines.
sorted(FIGDIR.glob("*.png"))

All visualizations saved to: C:\Users\Olenka\DSI_2\notebooks\reports\figures


[WindowsPath('reports/figures/avg_Age.png'),
 WindowsPath('reports/figures/avg_Annual_Income_dollars.png'),
 WindowsPath('reports/figures/avg_Spending_Score_(1-100).png'),
 WindowsPath('reports/figures/clusters_3D_age_income_spending.png'),
 WindowsPath('reports/figures/clusters_age_spending.png'),
 WindowsPath('reports/figures/clusters_income_spending.png'),
 WindowsPath('reports/figures/clusters_pca.png'),
 WindowsPath('reports/figures/cluster_counts.png')]