In [152]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import (
    Image,
    PageBreak,
    Paragraph,
    SimpleDocTemplate,
    Spacer,
    Table,
    TableStyle,
)
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.preprocessing import StandardScaler

# Load the raw customer and transaction tables
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

# Join every transaction to its customer record; the inner join keeps only
# rows whose CustomerID is present in both tables
customer_data = transactions.merge(customers, on="CustomerID", how="inner")

# Feature engineering: one row per customer with total spend, number of
# distinct transactions and mean spend per transaction row
customer_agg = (
    customer_data
    .groupby('CustomerID')
    .agg(
        total_spent=('TotalValue', 'sum'),
        num_transactions=('TransactionID', 'nunique'),
        avg_spent_per_transaction=('TotalValue', 'mean'),
    )
    .reset_index()
)

# Standardize the features before clustering
feature_columns = ['total_spent', 'num_transactions', 'avg_spent_per_transaction']
scaler = StandardScaler()
customer_data_scaled = scaler.fit_transform(customer_agg[feature_columns])

# Clustering: fit KMeans for k = 2..10 and keep the model with the lowest
# Davies-Bouldin (DB) index.
best_kmeans = None
best_n_clusters = 0
best_db_index = float('inf')
# Silhouette score of the DB-selected model (not the maximum silhouette overall)
best_silhouette_score = -1
# Per-k metric history, in search order; used later for the metric plots
db_indexes = []
silhouette_scores = []

for n_clusters in range(2, 11):
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases, so leaving it unset makes the selected model and the reported
    # metrics depend on the installed version.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(customer_data_scaled)
    db_index = davies_bouldin_score(customer_data_scaled, kmeans.labels_)
    silhouette_avg = silhouette_score(customer_data_scaled, kmeans.labels_)
    db_indexes.append(db_index)
    silhouette_scores.append(silhouette_avg)
    # Strict '<' means the smallest k wins when DB indexes tie
    if db_index < best_db_index:
        best_db_index = db_index
        best_silhouette_score = silhouette_avg
        best_kmeans = kmeans
        best_n_clusters = n_clusters

# Visualize clusters
def visualize_clusters(customer_data_scaled, kmeans, n_clusters):
    """Save a scatter plot of the clustered customers to ``clusters_plot.png``.

    The first two columns of ``customer_data_scaled`` are plotted as x and y,
    and each point is coloured by its entry in ``kmeans.labels_``.
    ``n_clusters`` is only used in the plot title.
    """
    x_values = customer_data_scaled[:, 0]
    y_values = customer_data_scaled[:, 1]
    marker_options = dict(c=kmeans.labels_, cmap='viridis', s=50, alpha=0.7)
    plot_title = f'Customer Segmentation (KMeans, {n_clusters} clusters)'

    plt.figure(figsize=(10, 6))
    plt.scatter(x_values, y_values, **marker_options)
    plt.title(plot_title, fontsize=16)
    plt.xlabel('Total Spent (scaled)')
    plt.ylabel('Number of Transactions (scaled)')
    plt.colorbar(label='Cluster')
    plt.savefig('clusters_plot.png', dpi=300)
    # Close the figure so repeated calls do not accumulate open figures
    plt.close()

# Render the scatter plot for the selected model (written to clusters_plot.png)
visualize_clusters(
    customer_data_scaled=customer_data_scaled,
    kmeans=best_kmeans,
    n_clusters=best_n_clusters,
)

# Cluster profile descriptions
def get_cluster_profiles(kmeans, customer_agg):
    """Label each customer with its cluster and return per-cluster feature means.

    Note: this writes a ``Cluster`` column (taken from ``kmeans.labels_``)
    onto ``customer_agg`` in place.

    Returns a DataFrame with one row per cluster and the columns ``Cluster``,
    ``avg_total_spent``, ``avg_num_transactions`` and
    ``avg_spent_per_transaction``.
    """
    customer_agg['Cluster'] = kmeans.labels_

    # Source feature -> name of its per-cluster mean in the result
    mean_column_names = {
        'total_spent': 'avg_total_spent',
        'num_transactions': 'avg_num_transactions',
        'avg_spent_per_transaction': 'avg_spent_per_transaction',
    }
    per_cluster_means = customer_agg.groupby('Cluster')[list(mean_column_names)].mean()
    return per_cluster_means.rename(columns=mean_column_names).reset_index()

# Label customers with the selected model's clusters and summarise each cluster
cluster_profiles = get_cluster_profiles(
    kmeans=best_kmeans,
    customer_agg=customer_agg,
)

# Create PDF report with cluster profiles
def _save_metric_plot(values, title, ylabel, color, image_path):
    """Plot one validation metric per candidate cluster count and save it as a PNG."""
    # The metric lists hold one value per candidate k, starting at k=2.
    # Deriving the x-axis from the list length avoids a shape mismatch if the
    # searched range ever changes.
    cluster_counts = range(2, 2 + len(values))
    plt.figure(figsize=(10, 6))
    plt.plot(cluster_counts, values, marker='o', linestyle='-', color=color)
    plt.title(title)
    plt.xlabel('Number of Clusters')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.savefig(image_path, dpi=300)
    plt.close()


def _build_cluster_profile_table(profiles):
    """Return a styled Table with a header row and one row per cluster profile."""
    table_rows = [
        ['Cluster', 'Average Total Spent', 'Average Number of Transactions', 'Average Spend per Transaction']
    ]
    # itertuples() keeps each column's dtype. iterrows() would upcast the
    # integer cluster id to float alongside the float averages and render
    # labels such as "Cluster 0.0".
    for profile in profiles.itertuples(index=False):
        table_rows.append([
            f"Cluster {int(profile.Cluster)}",
            f"${profile.avg_total_spent:.2f}",
            f"{profile.avg_num_transactions:.2f}",
            f"${profile.avg_spent_per_transaction:.2f}",
        ])

    table = Table(table_rows)
    table.setStyle(TableStyle([
        ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
        ('BACKGROUND', (0, 0), (-1, 0), colors.lightblue),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
        ('BACKGROUND', (0, 1), (-1, -1), colors.whitesmoke),
    ]))
    return table


def create_clustering_report_with_profiles(output_path):
    """Build the customer-segmentation PDF report and write it to ``output_path``.

    The report reads the module-level clustering results (``best_n_clusters``,
    ``best_db_index``, ``best_silhouette_score``, ``cluster_profiles``,
    ``db_indexes`` and ``silhouette_scores``). It writes
    ``db_index_plot.png`` and ``silhouette_score_plot.png`` to the working
    directory and embeds them together with ``clusters_plot.png``, which must
    already exist there.

    Args:
        output_path: Destination path of the generated PDF file.
    """
    doc = SimpleDocTemplate(output_path, pagesize=letter)
    elements = []
    styles = getSampleStyleSheet()

    # Report-specific paragraph styles (alignment: 0 = left, 1 = center, 4 = justified)
    styles.add(ParagraphStyle(name='CustomTitle', fontName='Helvetica-Bold', fontSize=24, alignment=1, textColor=colors.darkblue))
    styles.add(ParagraphStyle(name='CustomBodyText', fontSize=12, leading=14, spaceAfter=20, alignment=0))
    styles.add(ParagraphStyle(name='Subheading', fontName='Helvetica-Bold', fontSize=18, textColor=colors.darkblue, alignment=0))
    styles.add(ParagraphStyle(name='JustifiedBodyText', fontSize=12, leading=14, spaceAfter=20, alignment=4))

    # All plots are saved as 10x6 figures, so 5x3 inches keeps their aspect
    # ratio; a 5x4 box would stretch them vertically.
    plot_width = 5 * inch
    plot_height = 3 * inch

    def add_paragraph(text, style_name):
        elements.append(Paragraph(text, styles[style_name]))

    def add_gap(height_in_inches):
        elements.append(Spacer(1, height_in_inches * inch))

    def add_plot(image_path):
        elements.append(Image(image_path, width=plot_width, height=plot_height, hAlign='CENTER'))

    # Title page
    add_paragraph("Customer Segmentation Report:", 'CustomTitle')
    add_gap(0.2)
    add_paragraph("Insights from Clustering Analysis", 'CustomTitle')
    add_gap(0.8)
    add_paragraph("Project Overview: ", 'Subheading')
    add_gap(0.4)
    add_paragraph(
        "A Comprehensive Analysis of Customer Behavior Using KMeans Clustering. "
        "This report provides an in-depth examination of customer segmentation patterns based on their transaction history, "
        "spending behavior, and frequency of interactions. By leveraging the power of KMeans clustering, we group customers "
        "into distinct segments, each with unique characteristics. These insights allow businesses to personalize their offerings, "
        "target specific customer needs, and ultimately improve engagement, retention, and profitability. The analysis uncovers key trends "
        "in customer behavior, offering valuable recommendations for data-driven decision-making.",
        'JustifiedBodyText',
    )
    add_gap(0.4)
    add_paragraph("Prepared By: Kavitha L", 'CustomBodyText')
    add_gap(0.2)
    add_paragraph("Email: kavithaofficial0301@gmail.com", 'CustomBodyText')
    add_gap(0.2)
    add_paragraph("Phone: +91 8531973226", 'CustomBodyText')
    add_gap(0.4)
    add_paragraph("Purpose:", 'Subheading')
    add_gap(0.4)
    add_paragraph(
        "This report aims to provide actionable insights by segmenting customers based on their purchasing patterns, "
        "transaction frequency, and spending behavior. The clustering analysis helps businesses understand their customers "
        "better and tailor strategies for improved engagement and profitability.",
        'JustifiedBodyText',
    )
    add_gap(0.4)
    elements.append(PageBreak())

    # Key highlights
    add_paragraph("Key Highlights:", 'Subheading')
    add_gap(0.4)
    add_paragraph(f"- Number of Clusters Formed: {best_n_clusters}", 'CustomBodyText')
    add_paragraph(f"- DB Index Value: {best_db_index:.4f}", 'CustomBodyText')
    add_paragraph(f"- Silhouette Score: {best_silhouette_score:.4f}", 'CustomBodyText')
    add_paragraph(
        "- Visualization Included: Comprehensive plots for cluster evaluation and segmentation insights.",
        'CustomBodyText',
    )
    add_gap(0.4)

    # Overview section
    add_paragraph("Clustering Overview", 'Subheading')
    add_gap(0.4)
    add_paragraph(f"Number of Clusters: {best_n_clusters}", 'CustomBodyText')
    add_paragraph(f"DB Index Value: {best_db_index:.4f}", 'CustomBodyText')
    add_paragraph(f"Silhouette Score: {best_silhouette_score:.4f}", 'CustomBodyText')

    # Cluster profiles table
    add_gap(0.5)
    add_paragraph("Customer Cluster Profiles", 'Subheading')
    add_gap(0.4)
    elements.append(_build_cluster_profile_table(cluster_profiles))
    elements.append(PageBreak())

    # Metric plots: DB index, then silhouette score
    _save_metric_plot(db_indexes, 'DB Index vs Number of Clusters', 'DB Index', 'b', 'db_index_plot.png')
    add_paragraph("DB Index Plot", 'Subheading')
    add_gap(0.2)
    add_plot('db_index_plot.png')

    _save_metric_plot(
        silhouette_scores,
        'Silhouette Score vs Number of Clusters',
        'Silhouette Score',
        'r',
        'silhouette_score_plot.png',
    )
    add_paragraph("Silhouette Score Plot", 'Subheading')
    add_gap(0.2)
    add_plot('silhouette_score_plot.png')
    elements.append(PageBreak())

    # Clustering result plot
    add_paragraph("Clustering Result Plot", 'Subheading')
    add_gap(0.2)
    add_plot('clusters_plot.png')

    # Conclusion
    add_paragraph("Conclusion", 'Subheading')
    add_gap(0.4)
    add_paragraph(
        "This report has provided an insightful analysis of customer behavior through segmentation using KMeans clustering. "
        "By grouping customers based on their spending patterns and transaction frequency, we were able to uncover distinct customer profiles. "
        "The clustering analysis, accompanied by metrics such as the DB index and Silhouette score, ensures that the model chosen is appropriate. "
        "These insights can be used to develop more personalized marketing strategies, optimize resource allocation, and improve customer engagement strategies. "
        "Moving forward, the clusters identified can also serve as the foundation for further analysis and prediction tasks, "
        "leading to more data-driven business decisions.",
        'JustifiedBodyText',
    )

    # Build PDF
    doc.build(elements)

# Save the report with cluster profiles.
# A bare ".pdf" target has an extension but no base name, which produces a
# hidden dotfile on Unix-like systems; use a descriptive file name instead.
create_clustering_report_with_profiles("Customer_Segmentation_Report.pdf")


