In [None]:
!pip install numpy pandas scikit-learn matplotlib seaborn

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Data Simulation/Loading ---
# In a real project, replace this with loading your own data.
# E.g., df = pd.read_csv('your_customer_data.csv')

def generate_synthetic_customer_data(num_customers=40000):
    """
    Generates a synthetic dataset for customer segmentation.
    Data is designed to have distinct clusters based on spending behavior.

    Args:
        num_customers: Total number of rows to generate. Rows are split as
            evenly as possible across the four segments; any remainder goes
            to the earliest segments. The default of 40000 gives 10000 rows
            per segment.

    Returns:
        DataFrame with columns 'customer_id', 'total_spent', 'transactions'
        and 'product_category_preference'. 'total_spent' is clipped to be
        >= 0 and 'transactions' is clipped to be >= 1.

    Raises:
        ValueError: If num_customers is negative.
    """
    if num_customers < 0:
        raise ValueError("num_customers must be non-negative")

    # Seeds NumPy's global generator so repeated calls return the same data.
    np.random.seed(42)

    # One entry per segment:
    # ((spend mean, spend std), (transactions mean, transactions std),
    #  category names, category probabilities)
    segment_specs = [
        # Segment 1: High Spenders, High Frequency
        ((5000, 1000), (50, 10),
         ['Electronics', 'Luxury', 'Fashion'], [0.5, 0.3, 0.2]),
        # Segment 2: Low Spenders, High Frequency
        ((500, 100), (60, 15),
         ['Groceries', 'Home Goods', 'Beauty'], [0.6, 0.2, 0.2]),
        # Segment 3: Low Spenders, Low Frequency
        ((300, 50), (5, 2),
         ['Books', 'Movies', 'Sports'], [0.4, 0.4, 0.2]),
        # Segment 4: Medium Spenders, Medium Frequency
        ((1500, 300), (25, 5),
         ['Fashion', 'Home Goods', 'Electronics'], [0.4, 0.4, 0.2]),
    ]

    base_size, remainder = divmod(num_customers, len(segment_specs))

    segments = []
    next_id = 0
    for index, (spend, txns, categories, weights) in enumerate(segment_specs):
        size = base_size + (1 if index < remainder else 0)
        # The draw order within a segment (spend, transactions, category) is
        # fixed; changing it would change the seeded output.
        segments.append(pd.DataFrame({
            'customer_id': range(next_id, next_id + size),
            'total_spent': np.random.normal(spend[0], spend[1], size),
            'transactions': np.random.normal(txns[0], txns[1], size),
            'product_category_preference': np.random.choice(categories, size, p=weights),
        }))
        next_id += size

    # Combine all segments
    df = pd.concat(segments, ignore_index=True)
    # Normal draws can go negative or below one transaction; clamp them.
    df['total_spent'] = df['total_spent'].clip(lower=0)
    df['transactions'] = df['transactions'].clip(lower=1)

    return df

# Load the data: build the synthetic customer table with the default size
# and print a preview (first rows and the frame's shape).
df = generate_synthetic_customer_data()
print("Synthetic Customer Data Head:")
print(df.head())
print("\nDataFrame Shape:", df.shape)

# --- 2. Feature Engineering ---

def feature_engineering(df):
    """
    Derives key behavioral features for clustering.

    The input frame is left untouched; the derived column is added to a new
    frame only.

    Args:
        df: DataFrame with numeric 'total_spent' and 'transactions' columns.

    Returns:
        Tuple (X, features): X is a DataFrame holding only the clustering
        columns, and features is the list of those column names in order.
    """
    # Select the features for clustering
    features = ['total_spent', 'transactions', 'avg_transaction_value']

    # assign() returns a new frame, so the caller's df is not mutated.
    # NOTE: a zero in 'transactions' would produce inf/NaN here.
    enriched = df.assign(
        avg_transaction_value=df['total_spent'] / df['transactions']
    )
    X = enriched[features]

    return X, features

# Build the clustering features from a copy of the customer table.
X, features = feature_engineering(df.copy())

# Carry any derived feature column back onto `df`. characterize_clusters()
# and generate_recommendations() group `df` by every name in `features`, so
# a column that exists only in X would make that groupby raise KeyError.
for column in features:
    if column not in df.columns:
        df[column] = X[column]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\nFeatures for Clustering (after scaling):")
print(pd.DataFrame(X_scaled, columns=features).head())

# --- 3. Customer Segmentation ---

# Determine the optimal number of clusters (e.g., using the Elbow Method)
def find_optimal_clusters(data, max_k=10):
    """
    Plots the K-Means inertia for K = 1..max_k (Elbow Method).

    Args:
        data: Feature matrix passed to KMeans.fit for each candidate K.
        max_k: Largest number of clusters to try (inclusive).

    Returns:
        None. The elbow curve is shown with matplotlib.
    """
    candidate_ks = range(1, max_k + 1)

    # Fit one model per candidate K and keep its within-cluster inertia.
    inertia = [
        KMeans(n_clusters=k, random_state=42, n_init=10).fit(data).inertia_
        for k in candidate_ks
    ]

    plt.figure(figsize=(10, 6))
    plt.plot(candidate_ks, inertia, marker='o')
    plt.title('Elbow Method for Optimal K')
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel('Inertia')
    plt.show()

# Show the elbow curve for K = 1..10 on the scaled features.
print("\nFinding Optimal K for K-Means (Elbow Method):")
find_optimal_clusters(X_scaled)
# Based on the plot, we'll choose K=4 as the clusters are clearly separated in the synthetic data.
# The value is hard-coded here; it is not read back from the plot.
optimal_k = 4

# a) K-Means Clustering
# Fit on the scaled features and store each customer's label on `df`.
print("\n--- Performing K-Means Clustering ---")
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['kmeans_cluster'] = kmeans.fit_predict(X_scaled)

# b) Gaussian Mixture Model (GMM)
# Same number of components as K-Means; labels are stored in a separate column.
# NOTE(review): the two models number their labels independently, so cluster 0
# in one column is presumably not the same group as cluster 0 in the other.
print("\n--- Performing Gaussian Mixture Model Clustering ---")
gmm = GaussianMixture(n_components=optimal_k, random_state=42)
df['gmm_cluster'] = gmm.fit_predict(X_scaled)

# Analyze and visualize the clusters
# Print how many customers landed in each label, ordered by label.
print("\nK-Means Cluster Sizes:")
print(df['kmeans_cluster'].value_counts().sort_index())
print("\nGMM Cluster Sizes:")
print(df['gmm_cluster'].value_counts().sort_index())

# Visualize the clusters
def visualize_clusters(df, x_feature, y_feature, cluster_col, title):
    """
    Draws a scatter plot of two columns, coloured by cluster label.

    Args:
        df: DataFrame containing x_feature, y_feature and cluster_col.
        x_feature: Column name plotted on the x axis.
        y_feature: Column name plotted on the y axis.
        cluster_col: Column name whose values pick the point colour.
        title: Figure title.

    Returns:
        None. The figure is shown with matplotlib.
    """
    def _axis_label(column_name):
        # 'total_spent' -> 'Total Spent'
        return column_name.replace('_', ' ').title()

    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        data=df,
        x=x_feature,
        y=y_feature,
        hue=cluster_col,
        palette='viridis',
        legend='full',
        s=50,
        alpha=0.7,
    )
    plt.title(title, fontsize=16)
    plt.xlabel(_axis_label(x_feature), fontsize=12)
    plt.ylabel(_axis_label(y_feature), fontsize=12)
    plt.show()

# Plot spend against transaction count, once per clustering method, so the
# two labelings can be compared on the same axes.
print("\nVisualizing K-Means Clusters:")
visualize_clusters(df, 'total_spent', 'transactions', 'kmeans_cluster', 'K-Means Customer Segments')

print("\nVisualizing GMM Clusters:")
visualize_clusters(df, 'total_spent', 'transactions', 'gmm_cluster', 'GMM Customer Segments')

# Characterize the clusters
def characterize_clusters(df, cluster_col):
    """
    Returns the per-cluster mean of every clustering feature.

    Reads the module-level `features` list to decide which columns to
    average, so `df` must contain each of those columns.

    Args:
        df: DataFrame holding the feature columns and cluster_col.
        cluster_col: Column name of the cluster labels to group by.

    Returns:
        DataFrame with one row per cluster: cluster_col followed by the mean
        of each feature.
    """
    feature_means = df.groupby(cluster_col)[features].mean()
    return feature_means.reset_index()

# Print the mean feature values per cluster for each clustering method.
# Both calls group `df` by the columns named in `features`.
print("\nK-Means Cluster Profiles (Mean of features):")
print(characterize_clusters(df, 'kmeans_cluster'))

print("\nGMM Cluster Profiles (Mean of features):")
print(characterize_clusters(df, 'gmm_cluster'))

# --- 4. Recommendation Layer & Actionable Insights ---

def generate_recommendations(df, cluster_col):
    """
    Builds one recommendation sentence per cluster.

    For each cluster the most frequent 'product_category_preference' is
    picked as the category to promote, and it is combined with the cluster's
    mean feature values from characterize_clusters().
    In a real-world scenario, this would use a product database.

    Args:
        df: DataFrame with cluster_col, 'product_category_preference' and
            the clustering feature columns.
        cluster_col: Column name of the cluster labels to group by.

    Returns:
        Dict mapping each cluster id to its insight text.
    """
    print(f"\n--- Generating Recommendations based on {cluster_col} ---")

    # Most popular product category within each cluster (first mode on ties).
    preferences_by_cluster = df.groupby(cluster_col)['product_category_preference']
    top_category = preferences_by_cluster.agg(lambda x: x.mode()[0]).to_dict()

    # Mean feature values, indexed by cluster id for direct lookup.
    profiles = characterize_clusters(df, cluster_col).set_index(cluster_col)

    def _describe(cluster_id, category):
        # Actionable insight text generation
        row = profiles.loc[cluster_id]
        spent = row['total_spent']
        txn_count = row['transactions']
        avg_value = row['avg_transaction_value']
        return (
            f"Cluster {cluster_id}: These customers spend an average of ${spent:.2f} across "
            f"{txn_count:.1f} transactions. Their average transaction value is ${avg_value:.2f}. "
            f"**Recommendation**: Target this group with promotions for '{category}'. This will boost cross-sell."
        )

    return {
        cluster_id: _describe(cluster_id, category)
        for cluster_id, category in top_category.items()
    }

# Generate recommendations for K-Means clusters
# Each dict value is a ready-to-print insight sentence; the key is the cluster id.
kmeans_insights = generate_recommendations(df, 'kmeans_cluster')
for cluster, insight in kmeans_insights.items():
    print(f"\n{insight}")

# Generate recommendations for GMM clusters
# Same report, grouped by the GMM labels instead.
gmm_insights = generate_recommendations(df, 'gmm_cluster')
for cluster, insight in gmm_insights.items():
    print(f"\n{insight}")

# Simulating the impact of the recommendation system
# This is a conceptual example of how "cross-sell by 20%" would be measured.
# In a real scenario, this would be an A/B test.
# Here, we'll just show the concept.
def calculate_cross_sell_boost(df):
    """
    Simulates cross-sell boost by assuming a certain percentage of customers
    in a cluster would buy a recommended product.

    The baseline and the boosted figures are both computed over the same
    targeted population, so their difference is the number of additional
    cross-sells attributable to the recommendations.

    Args:
        df: Customer table (any sized collection); only len(df) is used.

    Returns:
        None. The results are printed.
    """
    # Baseline: Assume 1% of customers buy a cross-sell product without recommendations
    baseline_rate = 0.01
    # Let's say we target 50% of our customers with the recommendation.
    target_share = 0.5
    # And the recommendation success rate is 20% higher than the baseline.
    boosted_rate = baseline_rate * 1.2  # 20% boost

    total_customers = len(df)
    customers_targeted = target_share * total_customers

    # What the targeted customers would have bought anyway vs. with recommendations.
    baseline_cross_sell = customers_targeted * baseline_rate
    boosted_cross_sell = customers_targeted * boosted_rate

    # Untargeted customers are unaffected, so this is the full increment.
    total_new_sales = boosted_cross_sell - baseline_cross_sell

    print("\n--- Simulated Impact of Recommendation System ---")
    print(f"Total customers: {total_customers:,}")
    print(f"Simulated new cross-sells due to recommendations: {total_new_sales:.0f} units.")

# Run the simulation for the full customer table and print the summary.
calculate_cross_sell_boost(df)