In [3]:
def prepare_clustering_features(customers_df=None, transactions_df=None, products_df=None):
    """
    Prepare features for customer segmentation clustering using the three provided datasets.
    
    Parameters:
    customers_df (pandas.DataFrame): Customer information
    transactions_df (pandas.DataFrame): Transaction data
    products_df (pandas.DataFrame): Product information
    
    Returns:
    pandas.DataFrame: Prepared features for clustering
    """
    try:
        if any(df is None for df in [customers_df, transactions_df, products_df]):
            # Load your data files
            customers_df = pd.read_csv('Customers.csv')
            transactions_df = pd.read_csv('Transactions.csv')
            products_df = pd.read_csv('Products.csv')
        
        # Initialize features DataFrame
        features = pd.DataFrame()
        
        # 1. Basic customer features
        customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
        features['account_age_days'] = (pd.Timestamp.now() - customers_df['SignupDate']).dt.days
        
        # 2. Transaction-based features (RFM)
        # Recency
        transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])
        latest_date = transactions_df['TransactionDate'].max()
        customer_last_purchase = transactions_df.groupby('CustomerID')['TransactionDate'].max()
        features['recency'] = (latest_date - customer_last_purchase).dt.days
        
        # Frequency
        features['frequency'] = transactions_df.groupby('CustomerID')['TransactionID'].count()
        
        # Monetary
        customer_totals = transactions_df.groupby('CustomerID').agg({
            'TotalValue': ['sum', 'mean', 'std'],
            'Quantity': ['sum', 'mean', 'std']
        })
        
        # Flatten column names
        customer_totals.columns = [
            'monetary_total', 'avg_transaction_value', 'std_transaction_value',
            'total_items', 'avg_items_per_transaction', 'std_items'
        ]
        
        # Add to features
        for col in customer_totals.columns:
            features[col] = customer_totals[col]
        
        # 3. Product category preferences
        # Merge transactions with products to get categories
        trans_with_categories = transactions_df.merge(
            products_df[['ProductID', 'Category']], 
            on='ProductID'
        )
        
        # Calculate category preferences
        category_pivot = pd.pivot_table(
            trans_with_categories,
            index='CustomerID',
            columns='Category',
            values='Quantity',
            aggfunc='sum',
            fill_value=0
        )
        
        # Normalize category preferences
        category_sums = category_pivot.sum(axis=1)
        category_preferences = category_pivot.div(category_sums, axis=0).fillna(0)
        
        # Add category preferences to features
        for col in category_preferences.columns:
            features[f'category_pref_{col}'] = category_preferences[col]
        
        # Handle missing values
        features = features.fillna(0)
        
        # Scale the features
        scaler = StandardScaler()
        features_scaled = pd.DataFrame(
            scaler.fit_transform(features),
            columns=features.columns,
            index=features.index
        )
        
        print(f"✓ Prepared features for {len(features)} customers")
        return features_scaled
        
    except Exception as e:
        print(f"× Error preparing clustering features: {str(e)}")
        raise

def perform_clustering_analysis(features, max_clusters=10):
    """
    Perform K-means clustering analysis with different numbers of clusters
    and evaluate using multiple metrics.
    
    Parameters:
    features (pandas.DataFrame): Prepared and scaled features for clustering
    max_clusters (int): Maximum number of clusters to try
    
    Returns:
    tuple: (clustered_data, metrics_df, X, best_kmeans)
    """
    try:
        # Convert features to numpy array for clustering
        X = features.values
        
        # Initialize metrics storage
        metrics_list = []
        
        # Try different numbers of clusters
        for n_clusters in range(2, max_clusters + 1):
            # Perform K-means clustering
            kmeans = KMeans(
                n_clusters=n_clusters,
                n_init=10,
                random_state=42
            )
            cluster_labels = kmeans.fit_predict(X)
            
            # Calculate metrics
            metrics_list.append({
                'n_clusters': n_clusters,
                'db_score': davies_bouldin_score(X, cluster_labels),
                'silhouette_score': silhouette_score(X, cluster_labels),
                'calinski_harabasz_score': calinski_harabasz_score(X, cluster_labels)
            })
            
            print(f"✓ Evaluated clustering with {n_clusters} clusters")
        
        # Convert metrics to DataFrame
        metrics_df = pd.DataFrame(metrics_list)
        
        # Find optimal number of clusters (using Davies-Bouldin Index)
        optimal_clusters = metrics_df.loc[metrics_df['db_score'].idxmin()]['n_clusters']
        
        # Perform final clustering with optimal number of clusters
        final_kmeans = KMeans(
            n_clusters=int(optimal_clusters),
            n_init=10,
            random_state=42
        )
        
        # Add cluster assignments to original features
        clustered_data = features.copy()
        clustered_data['Cluster'] = final_kmeans.fit_predict(X)
        
        return clustered_data, metrics_df, X, final_kmeans
        
    except Exception as e:
        print(f"× Error performing clustering analysis: {str(e)}\")")
        raise

# Modified execution code
# Load data
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')
products_df = pd.read_csv('Products.csv')

# Execute clustering analysis
print("1. Preparing features for clustering...")
clustering_features = prepare_clustering_features(customers_df, transactions_df, products_df)

print("\n2. Performing clustering analysis...")
clustered_data, metrics, X, final_kmeans = perform_clustering_analysis(clustering_features)

print("\n3. Clustering Results:")
print(f"✓ Optimal number of clusters: {len(clustered_data['Cluster'].unique())}")
print(f"✓ Davies-Bouldin Index: {metrics.iloc[metrics['db_score'].idxmin()]['db_score']:.4f}")
print(f"✓ Silhouette Score: {metrics.iloc[metrics['db_score'].idxmin()]['silhouette_score']:.4f}")
print(f"✓ Calinski-Harabasz Score: {metrics.iloc[metrics['db_score'].idxmin()]['calinski_harabasz_score']:.4f}")

# Rest of the visualization and saving code remains the same...

print("\n4. Cluster sizes:")
print(clustered_data['Cluster'].value_counts().sort_index())

print("\n5. Creating visualizations and saving results...")
visualize_clusters(clustered_data, X, final_kmeans)

# Save final results
try:
    clustered_data.to_csv('FirstName_LastName_Clustering_Results.csv', index=False)
    print("✓ Saved clustering results to FirstName_LastName_Clustering_Results.csv")
except Exception as e:
    print(f"× Error saving clustering results: {str(e)}")

try:
    metrics.to_csv('FirstName_LastName_Clustering_Metrics.csv', index=False)
    print("✓ Saved clustering metrics to FirstName_LastName_Clustering_Metrics.csv")
except Exception as e:
    print(f"× Error saving clustering metrics: {str(e)}")

print("\nClustering analysis completed!")

1. Preparing features for clustering...
✓ Prepared features for 200 customers

2. Performing clustering analysis...
✓ Evaluated clustering with 2 clusters
✓ Evaluated clustering with 3 clusters
✓ Evaluated clustering with 4 clusters
✓ Evaluated clustering with 5 clusters
✓ Evaluated clustering with 6 clusters
✓ Evaluated clustering with 7 clusters
✓ Evaluated clustering with 8 clusters
✓ Evaluated clustering with 9 clusters
✓ Evaluated clustering with 10 clusters

3. Clustering Results:
✓ Optimal number of clusters: 2
✓ Davies-Bouldin Index: 0.4487
✓ Silhouette Score: 0.6746
✓ Calinski-Harabasz Score: 759.2050

4. Cluster sizes:
Cluster
0     97
1    103
Name: count, dtype: int64

5. Creating visualizations and saving results...

Generating visualizations...
✓ Created PCA visualization (clustering_pca.png)
✓ Created cluster sizes visualization (clustering_sizes.png)
× Error creating characteristics heatmap: "Columns not found: 'monetary'"
× Error saving cluster insights: 'Column not fo