In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, month, year, sum as spark_sum, avg, stddev, when

# Flags (user, month) pairs whose total spend is unusually high compared with
# that user's own monthly history, and prints them.

# Tunable parameters of the analysis, kept in one place instead of inline.
TRANSACTIONS_CSV = "/content/drive/My Drive/transactions.csv"
SIGMA_MULTIPLIER = 2        # a month is unusual above AvgSpend + k * StdDevSpend
FALLBACK_THRESHOLD = 5000   # absolute limit used when StdDevSpend is NULL

# Step 1: Initialize Spark Session
spark = SparkSession.builder.appName("TransactionVolumeAnalysis").getOrCreate()

try:
    # Step 2: Load Data
    df = spark.read.csv(TRANSACTIONS_CSV, header=True, inferSchema=True)

    # Step 3: Preprocess
    # Rows whose date is NULL after parsing cannot be attributed to a month;
    # keeping them would pool them into a (Year=NULL, Month=NULL) bucket that
    # skews every affected user's average and standard deviation.
    df = (
        df.withColumn("TransactionDate", to_date(col("TransactionDate"), "yyyy-MM-dd"))
        .filter(col("TransactionDate").isNotNull())
        .withColumn("Year", year("TransactionDate"))
        .withColumn("Month", month("TransactionDate"))
    )

    # Step 4: Monthly spend (one row per user per calendar month)
    monthly_spend = df.groupBy("UserID", "Year", "Month").agg(
        spark_sum("Amount").alias("TotalMonthlySpend")
    )

    # Step 5: Per-user stats over that user's monthly totals
    user_stats = monthly_spend.groupBy("UserID").agg(
        avg("TotalMonthlySpend").alias("AvgSpend"),
        stddev("TotalMonthlySpend").alias("StdDevSpend"),
    )

    # Step 6: Join monthly spend with user stats
    joined_df = monthly_spend.join(user_stats, on="UserID")

    # Step 7: Handle null stddev and apply fallback logic
    # When StdDevSpend is NULL (e.g. the user has a single month of data) the
    # statistical rule cannot be evaluated, so an absolute threshold is used.
    # NOTE(review): StdDevSpend is a sample standard deviation that includes
    # the month being tested, so the largest possible z-score for n months is
    # (n - 1) / sqrt(n). That only exceeds 2 once n >= 6, which means users
    # with 2-5 months of data can never be flagged by the statistical rule.
    # Confirm whether that is intended.
    statistical_rule = col("TotalMonthlySpend") > (
        col("AvgSpend") + SIGMA_MULTIPLIER * col("StdDevSpend")
    )
    fallback_rule = col("TotalMonthlySpend") > FALLBACK_THRESHOLD  # fallback threshold

    anomalies = joined_df.withColumn(
        "UnusualSpending",
        when(col("StdDevSpend").isNotNull(), statistical_rule).otherwise(fallback_rule),
    ).filter(col("UnusualSpending"))

    # Step 8: Show anomalies
    # Sorted so the printed rows are deterministic between runs.
    print("=== Users with Unusual Spending ===")
    anomalies.select(
        "UserID", "Year", "Month", "TotalMonthlySpend", "AvgSpend", "StdDevSpend"
    ).orderBy("UserID", "Year", "Month").show()
finally:
    # Step 9: Stop session (runs even if a step above fails)
    spark.stop()


=== Users with Unusual Spending ===
+------+----+-----+-----------------+--------+-----------+
|UserID|Year|Month|TotalMonthlySpend|AvgSpend|StdDevSpend|
+------+----+-----+-----------------+--------+-----------+
|  U003|2025|    2|           9000.0|  9000.0|       NULL|
+------+----+-----+-----------------+--------+-----------+

