In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, to_date, month, year, sum as _sum, avg, stddev, collect_list
)

# 1. Start (or reuse) the Spark session used by the whole analysis
spark = (
    SparkSession.builder
    .appName("ExpenseAnalysis")
    .getOrCreate()
)

# 2. Read the expenses CSV: the first row is the header and column types
#    are inferred from the data
df = spark.read.csv(
    "/content/cleaned_expenses (1).csv",
    header=True,
    inferSchema=True,
)
print("=== Original Data ===")
df.show()

# 3. Parse ExpenseDate into a proper date column.
#    The file stores dates day-first (e.g. "10-07-2025" is 10 July 2025), so
#    the pattern has to be dd-MM-yyyy; a yyyy-MM-dd pattern does not match
#    these strings and leaves every month/year-based step without usable dates.
df = df.withColumn("date", to_date(col("ExpenseDate"), "dd-MM-yyyy"))

# 4. Derive the calendar month and year from the parsed date.
#    NOTE(review): the CSV already carries a `Month` column; under Spark's
#    default case-insensitive column resolution this withColumn presumably
#    replaces it with the integer month number - confirm that is intended.
df = df.withColumn("month", month("date")).withColumn("year", year("date"))

print("=== Data with Month & Year ===")
df.show()

# 5. Total amount each user spent in every (year, month)
monthly_spend = (
    df.groupBy("UserID", "year", "month")
    .agg(_sum(col("Amount")).alias("monthly_total"))
)
print("=== Monthly Spend by User ===")
monthly_spend.sort("UserID", "year", "month").show()

# 6. Per-user mean and standard deviation of individual expense amounts
stats = df.groupBy("UserID").agg(
    avg(col("Amount")).alias("avg_spend"),
    stddev(col("Amount")).alias("std_spend"),
)
print("=== Average & Std Dev Spend per User ===")
stats.show()

# 7. Attach each user's statistics to every one of that user's expense rows
df_with_stats = df.join(stats, "UserID")

# 8. Flag expenses more than two standard deviations above the user's mean
anomalies = df_with_stats.where(
    col("Amount") > col("avg_spend") + col("std_spend") * 2
)
print("=== Unusual High Expenses Detected ===")
anomalies.select(
    "UserID", "date", "Amount", "Category", "avg_spend", "std_spend"
).show()

# 9. Overall spend per user across the whole dataset, biggest spenders first
user_total_spend = df.groupBy("UserID").agg(
    _sum(col("Amount")).alias("total_spend")
)
print("=== Total Spend by User ===")
user_total_spend.orderBy(col("total_spend").desc()).show()

# 10. Pivot: one row per user, one column per category, each cell holding
#     the summed amount for that user/category pair
pivot_df = (
    df.groupBy("UserID")
    .pivot("Category")
    .agg(_sum(col("Amount")))
)
print("=== Spend by Category per User ===")
pivot_df.show()

# 11. Mean expense amount for each category within each (year, month)
avg_monthly_category = (
    df.groupBy("year", "month", "Category")
    .agg(avg(col("Amount")).alias("avg_spend"))
)
print("=== Average Monthly Spend per Category ===")
avg_monthly_category.sort("year", "month", "Category").show()

# 12. Detect missing months for each user
#     A month counts as present when the user has at least one expense in it.
#     NOTE(review): months are pooled across years here (year is not part of
#     the grouping) - confirm that this is the intended definition.
user_months = (
    df.select("UserID", "month")
    .distinct()
    .groupBy("UserID")
    .agg(collect_list("month").alias("months_present"))
)

# Compute the complement of months_present within 1..12 inside Spark.
# Doing this as a column expression (rather than collecting rows to the driver
# and rebuilding a DataFrame from Python tuples) keeps the work distributed and
# avoids schema inference, which fails when there are no users at all or when
# every user's missing-month list is empty.
missing_months_df = user_months.selectExpr(
    "UserID",
    "array_sort(array_except(sequence(1, 12), months_present))"
    " AS missing_months",
)

print("=== Missing Months per User ===")
missing_months_df.orderBy("UserID").show(truncate=False)


=== Original Data ===
+----------+------+----------+------+-----------+--------------------+-------------+-------------------+
|expense_id|UserID|CategoryID|Amount|ExpenseDate|         Description|     Category|              Month|
+----------+------+----------+------+-----------+--------------------+-------------+-------------------+
|         1|     1|         1|  1200| 01-07-2025|Monthly grocery s...|    Groceries|2025-07-01 00:00:00|
|         2|     1|         2|   300| 02-07-2025|            Bus pass|    Transport|2025-07-01 00:00:00|
|         3|     1|         3|   500| 10-07-2025|         Movie night|Entertainment|2025-07-01 00:00:00|
|         4|     2|         1|   900| 03-07-2025|             Grocery|    Groceries|2025-07-01 00:00:00|
|         5|     2|         4|  1100| 05-07-2025|    Electricity bill|    Utilities|2025-07-01 00:00:00|
|         7|     3|         2|  4490| 12-07-2025|    Clothes purchase|     Shopping|2025-07-01 00:00:00|
|         8|     4|        10|  4