# PySpark setup

In [None]:
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, year
from pyspark.sql.types import IntegerType

import os

# Build the session configuration first. The S3 credentials are attached
# separately below so that no secret is ever written into the notebook.
session_builder = (
    SparkSession.builder.appName("CustomerPurchaseAnalysis")
    # hadoop-aws plus the AWS SDK bundle supply the s3a:// filesystem.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.901")
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.metastore.metrics.enabled", "false")
    .config("spark.hadoop.io.native.lib.available", "false")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    # NOTE(review): the documented S3A option for the bucket region appears to
    # be "fs.s3a.endpoint.region" - confirm that this key is actually honoured.
    .config("spark.hadoop.fs.s3a.region", "ap-south-1")
)

# Read the S3 credentials from the standard AWS environment variables instead
# of hard-coding them. Explicit keys are only set when both are present;
# otherwise S3A is left to resolve credentials through its own provider chain.
aws_access_key = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
if aws_access_key and aws_secret_key:
    session_builder = (
        session_builder
        .config("spark.hadoop.fs.s3a.access.key", aws_access_key)
        .config("spark.hadoop.fs.s3a.secret.key", aws_secret_key)
    )

spark = session_builder.getOrCreate()

# Customer Purchase Analysis

In [53]:
from pyspark.sql.types import TimestampType

In [54]:

# Read the sales CSV from S3: the first row supplies the column names and the
# column types are inferred from the data.
sales_input = "s3a://this-is-my-bucket007/sales_data.csv"
sales_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(sales_input)
    # Cast purchase_date to a timestamp column for the interval analysis.
    .withColumn("purchase_date", col("purchase_date").cast(TimestampType()))
)

In [55]:
# Load customer data from DynamoDB.
# NOTE(review): `table` is not defined in the visible cells; it is presumably a
# boto3 DynamoDB Table resource created earlier - confirm.
#
# A single scan() call returns only one page of results, so keep requesting
# pages until the response no longer carries a LastEvaluatedKey. Without this
# loop, customers beyond the first page would be silently dropped.
response = table.scan()
cust_data = list(response['Items'])
while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    cust_data.extend(response['Items'])

In [56]:
# Turn the list of DynamoDB items into a Spark DataFrame. No schema is passed,
# so the columns are derived from the items themselves.
cust_df = spark.createDataFrame(data=cust_data)

In [61]:
# Inner-join sales to customers, keeping only the sales whose customer_id
# matches an 'id' in cust_df (assumes 'id' is the customer ID column there).
joined_df = sales_df.join(
    other=cust_df,
    on=sales_df["customer_id"] == cust_df["id"],
    how="inner",
)

In [63]:
from pyspark.sql.window import Window
# One window per customer, with that customer's rows sorted by purchase_date.
window_spec = (
    Window
    .partitionBy(col("customer_id"))
    .orderBy(col("purchase_date"))
)

In [65]:
from pyspark.sql.functions import col, lag, avg, unix_timestamp

In [66]:

# Step 1: for every purchase, fetch the same customer's preceding purchase
# (via the window) and measure the gap between the two in seconds.
seconds_since_previous = unix_timestamp(col("purchase_date")) - unix_timestamp(
    col("previous_purchase_date")
)
interval_df = (
    joined_df
    .withColumn("previous_purchase_date", lag(col("purchase_date")).over(window_spec))
    .withColumn("purchase_interval", seconds_since_previous)
)

# Step 2: mean gap per customer, expressed both in seconds and in days.
avg_interval_df = (
    interval_df
    .groupBy("customer_id")
    .agg(avg(col("purchase_interval")).alias("avg_purchase_interval_seconds"))
    .withColumn(
        "avg_purchase_interval_days",
        col("avg_purchase_interval_seconds") / 86400,  # 86400 seconds in a day
    )
)

# Step 3: keep the high-engagement customers, i.e. those whose mean gap
# between purchases is under 30 days, and display them.
high_engagement_df = avg_interval_df.where(col("avg_purchase_interval_days") < 30)
high_engagement_df.show()

print("Customer purchase interval analysis completed.")

+-----------+-----------------------------+--------------------------+
|customer_id|avg_purchase_interval_seconds|avg_purchase_interval_days|
+-----------+-----------------------------+--------------------------+
+-----------+-----------------------------+--------------------------+

Customer purchase interval analysis completed.


In [None]:
# Stop the Spark session created in the setup cell now that the analysis is done.
spark.stop()