In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Explicit namespace for Spark column functions. Kept after the wildcard import
# so the name F cannot be overwritten by it.
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd

# Create the Spark session for this notebook; getOrCreate() returns the
# already-running session when one exists instead of starting a second one.
spark = SparkSession.builder.appName("CustomerAnalytics").getOrCreate()
print("✅ Spark session initialized")

In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd

# Create the Spark session for this notebook; getOrCreate() returns the
# already-running session when one exists instead of starting a second one.
spark = SparkSession.builder.appName("CustomerAnalytics").getOrCreate()
print("✅ Spark session initialized")

## 1️⃣ Load Data from Lakehouse

In [None]:
# Load the three source Delta tables by relative path.
# Sales transactions
df_sales = spark.read.format("delta").load("Tables/sales_transactions")

# Customer data
df_customers = spark.read.format("delta").load("Tables/customers")

# Product catalog
df_products = spark.read.format("delta").load("Tables/products")

# Row counts as a quick sanity check. Each count() is a Spark action (it runs
# a job); the ":," format spec adds thousands separators.
print(f"📈 Sales records: {df_sales.count():,}")
print(f"👥 Customer records: {df_customers.count():,}")
print(f"📦 Product records: {df_products.count():,}")

## 2️⃣ Customer Segmentation Analysis

In [None]:
# Per-customer purchase metrics for RFM (recency / frequency / monetary)
# segmentation. Spark functions are referenced through the F namespace so the
# aggregates do not depend on the wildcard import shadowing Python's built-in
# sum / max / min.
customer_metrics = (
    df_sales.groupBy("customer_id")
    .agg(
        F.count("transaction_id").alias("frequency"),
        F.sum("total_amount").alias("monetary_value"),
        F.max("transaction_date").alias("last_purchase_date"),
        F.min("transaction_date").alias("first_purchase_date"),
    )
    # Recency = days between today and the most recent purchase.
    # current_date() is evaluated when the query runs, so this column changes
    # from one day's run to the next.
    .withColumn(
        "recency_days",
        F.datediff(F.current_date(), F.col("last_purchase_date")),
    )
)

# Attach customer attributes. The left join keeps every customer that has
# sales, even when there is no matching row in the customers table (the
# attribute columns are then null).
# NOTE(review): assumes customer_id is unique in df_customers; duplicates
# would multiply rows here — confirm.
customer_360 = customer_metrics.join(
    df_customers.select("customer_id", "customer_segment", "region", "lifetime_value"),
    on="customer_id",
    how="left",
)

# Preview the first 10 rows.
customer_360.show(10)

## 3️⃣ Revenue Analysis by Category

In [None]:
# Revenue breakdown by product category, highest revenue first.
# Spark functions are referenced through the F namespace so the aggregates do
# not depend on the wildcard import shadowing Python's built-in sum.
# NOTE(review): the join key is product_name, so this assumes names are unique
# in df_products; duplicate names would double-count sales — confirm, or join
# on a product id if the tables have one.
revenue_by_category = (
    df_sales.join(
        df_products.select("product_name", "category"),
        on="product_name",
        how="left",  # sales with no catalog match are kept under a null category
    )
    .groupBy("category")
    .agg(
        F.sum("total_amount").alias("total_revenue"),
        F.count("transaction_id").alias("transaction_count"),
        F.avg("total_amount").alias("avg_transaction_value"),
    )
    .orderBy(F.desc("total_revenue"))
)

print("💰 Revenue by Category:")
revenue_by_category.show()

## 4️⃣ Geographic Distribution

In [None]:
# Sales by region, highest revenue first.
# Spark functions are referenced through the F namespace so the expressions do
# not depend on the wildcard import shadowing Python's built-in sum / round.
# NOTE(review): groups on a "region" column of df_sales — presumably the sales
# table carries its own region column; confirm against the table schema.
regional_sales = (
    df_sales.groupBy("region")
    .agg(
        F.sum("total_amount").alias("total_revenue"),
        F.countDistinct("customer_id").alias("unique_customers"),
        F.count("transaction_id").alias("total_transactions"),
    )
    .withColumn(
        "revenue_per_customer",
        F.round(F.col("total_revenue") / F.col("unique_customers"), 2),
    )
    .orderBy(F.desc("total_revenue"))
)

# Collect the aggregated result (one row per region) to the driver as a
# pandas DataFrame for plotting.
regional_pd = regional_sales.toPandas()

# Two side-by-side panels: revenue bars and customer-share pie.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: total revenue per region.
axes[0].barh(regional_pd['region'], regional_pd['total_revenue'])
axes[0].set_xlabel('Total Revenue ($)')
axes[0].set_title('Revenue by Region')

# Right panel: share of distinct customers per region, labelled to one decimal.
axes[1].pie(regional_pd['unique_customers'], labels=regional_pd['region'], autopct='%1.1f%%')
axes[1].set_title('Customer Distribution')

plt.tight_layout()
plt.show()

## 5️⃣ Save Results to Lakehouse

In [None]:
# Persist the analysis outputs as Delta tables. mode("overwrite") replaces any
# data already stored at the target path, so re-running this cell is safe.

# Customer 360 view
customer_360.write.format("delta").mode("overwrite").save("Tables/customer_360_view")

# Regional analysis
regional_sales.write.format("delta").mode("overwrite").save("Tables/regional_analysis")

print("✅ Analysis results saved to lakehouse!")
print("📊 Tables created:")
print("   - customer_360_view")
print("   - regional_analysis")

## 1️⃣ Load Data from Lakehouse

In [None]:
# Load the three source Delta tables by relative path.
# Sales transactions
df_sales = spark.read.format("delta").load("Tables/sales_transactions")

# Customer data
df_customers = spark.read.format("delta").load("Tables/customers")

# Product catalog
df_products = spark.read.format("delta").load("Tables/products")

# Row counts as a quick sanity check. Each count() is a Spark action (it runs
# a job); the ":," format spec adds thousands separators.
print(f"📈 Sales records: {df_sales.count():,}")
print(f"👥 Customer records: {df_customers.count():,}")
print(f"📦 Product records: {df_products.count():,}")

## 2️⃣ Customer Segmentation Analysis

In [None]:
# Per-customer purchase metrics for RFM (recency / frequency / monetary)
# segmentation. Spark functions are referenced through the F namespace so the
# aggregates do not depend on the wildcard import shadowing Python's built-in
# sum / max / min.
customer_metrics = (
    df_sales.groupBy("customer_id")
    .agg(
        F.count("transaction_id").alias("frequency"),
        F.sum("total_amount").alias("monetary_value"),
        F.max("transaction_date").alias("last_purchase_date"),
        F.min("transaction_date").alias("first_purchase_date"),
    )
    # Recency = days between today and the most recent purchase.
    # current_date() is evaluated when the query runs, so this column changes
    # from one day's run to the next.
    .withColumn(
        "recency_days",
        F.datediff(F.current_date(), F.col("last_purchase_date")),
    )
)

# Attach customer attributes. The left join keeps every customer that has
# sales, even when there is no matching row in the customers table (the
# attribute columns are then null).
# NOTE(review): assumes customer_id is unique in df_customers; duplicates
# would multiply rows here — confirm.
customer_360 = customer_metrics.join(
    df_customers.select("customer_id", "customer_segment", "region", "lifetime_value"),
    on="customer_id",
    how="left",
)

# Preview the first 10 rows.
customer_360.show(10)

## 3️⃣ Revenue Analysis by Category

In [None]:
# Revenue breakdown by product category, highest revenue first.
# Spark functions are referenced through the F namespace so the aggregates do
# not depend on the wildcard import shadowing Python's built-in sum.
# NOTE(review): the join key is product_name, so this assumes names are unique
# in df_products; duplicate names would double-count sales — confirm, or join
# on a product id if the tables have one.
revenue_by_category = (
    df_sales.join(
        df_products.select("product_name", "category"),
        on="product_name",
        how="left",  # sales with no catalog match are kept under a null category
    )
    .groupBy("category")
    .agg(
        F.sum("total_amount").alias("total_revenue"),
        F.count("transaction_id").alias("transaction_count"),
        F.avg("total_amount").alias("avg_transaction_value"),
    )
    .orderBy(F.desc("total_revenue"))
)

print("💰 Revenue by Category:")
revenue_by_category.show()

## 4️⃣ Geographic Distribution

In [None]:
# Sales by region, highest revenue first.
# Spark functions are referenced through the F namespace so the expressions do
# not depend on the wildcard import shadowing Python's built-in sum / round.
# NOTE(review): groups on a "region" column of df_sales — presumably the sales
# table carries its own region column; confirm against the table schema.
regional_sales = (
    df_sales.groupBy("region")
    .agg(
        F.sum("total_amount").alias("total_revenue"),
        F.countDistinct("customer_id").alias("unique_customers"),
        F.count("transaction_id").alias("total_transactions"),
    )
    .withColumn(
        "revenue_per_customer",
        F.round(F.col("total_revenue") / F.col("unique_customers"), 2),
    )
    .orderBy(F.desc("total_revenue"))
)

# Collect the aggregated result (one row per region) to the driver as a
# pandas DataFrame for plotting.
regional_pd = regional_sales.toPandas()

# Two side-by-side panels: revenue bars and customer-share pie.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: total revenue per region.
axes[0].barh(regional_pd['region'], regional_pd['total_revenue'])
axes[0].set_xlabel('Total Revenue ($)')
axes[0].set_title('Revenue by Region')

# Right panel: share of distinct customers per region, labelled to one decimal.
axes[1].pie(regional_pd['unique_customers'], labels=regional_pd['region'], autopct='%1.1f%%')
axes[1].set_title('Customer Distribution')

plt.tight_layout()
plt.show()

## 5️⃣ Save Results to Lakehouse

In [None]:
# Persist the analysis outputs as Delta tables. mode("overwrite") replaces any
# data already stored at the target path, so re-running this cell is safe.

# Customer 360 view
customer_360.write.format("delta").mode("overwrite").save("Tables/customer_360_view")

# Regional analysis
regional_sales.write.format("delta").mode("overwrite").save("Tables/regional_analysis")

print("✅ Analysis results saved to lakehouse!")
print("📊 Tables created:")
print("   - customer_360_view")
print("   - regional_analysis")