In [1]:
!pip install -q pyspark

In [2]:
import os

from pyspark.sql import SparkSession
# NOTE: importing `sum` from pyspark.sql.functions shadows Python's built-in
# sum() for the rest of the notebook; every later `sum(...)` is the Spark
# column aggregate, not the builtin.
from pyspark.sql.functions import col, date_format, month, sum, year

In [4]:
# Create the Spark session used by every later cell, or reuse one that is
# already active in this kernel (getOrCreate), under the app name "ETL_Project".
spark = (
    SparkSession.builder
    .appName("ETL_Project")
    .getOrCreate()
)

In [7]:
# Input CSV locations.
# NOTE(review): these paths look like a mounted Google Drive; the mount step is
# not shown in this notebook, so confirm it runs before this cell.
user_path = '/content/drive/MyDrive/user.csv'
expense_path = '/content/drive/MyDrive/expense.csv'

# Explicit schemas (DDL strings) instead of inferSchema=True. Inference costs
# an extra pass over each file and lets the column types change with the data
# (one malformed value would turn a numeric column into a string column).
# The types below are the ones previously reported by printSchema().
# Columns are matched to the file by position, so the order must follow the
# CSV header order.
user_schema = "user_id INT, name STRING, income INT"
expense_schema = "expense_id INT, user_id INT, category STRING, amount INT, date DATE"

# enforceSchema=False makes Spark validate the CSV header names against the
# schema rather than silently ignoring the header row.
df_user = spark.read.csv(
    user_path, header=True, schema=user_schema, enforceSchema=False
)
df_expense = spark.read.csv(
    expense_path, header=True, schema=expense_schema, enforceSchema=False
)

# Print both schemas so the loaded column types are visible in the notebook.
df_user.printSchema()
df_expense.printSchema()



root
 |-- user_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- income: integer (nullable = true)

root
 |-- expense_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- date: date (nullable = true)



In [8]:
# Attach each user's name and income to their expense rows.
# Inner join on user_id: expense rows without a matching user, and users
# without any expense row, do not appear in the result.
df_joined = df_expense.join(df_user, "user_id", "inner")

print("Joined Data:")
df_joined.printSchema()
df_joined.show()

Joined Data:
root
 |-- user_id: integer (nullable = true)
 |-- expense_id: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- name: string (nullable = true)
 |-- income: integer (nullable = true)

+-------+----------+-------------+------+----------+-------+------+
|user_id|expense_id|     category|amount|      date|   name|income|
+-------+----------+-------------+------+----------+-------+------+
|      1|       101|         Food| 12000|2024-07-05|  Alice| 50000|
|      1|       102|       Travel|  5000|2024-07-20|  Alice| 50000|
|      2|       103|         Food| 10000|2024-07-10|    Bob| 40000|
|      2|       104|     Shopping|  8000|2024-07-21|    Bob| 40000|
|      3|       105|       Travel| 15000|2024-07-01|Charlie| 30000|
|      3|       106|Entertainment|  5000|2024-07-15|Charlie| 30000|
+-------+----------+-------------+------+----------+-------+------+



In [11]:

# Tag every joined expense row with its calendar month, formatted as a
# "yyyy-MM" string (e.g. "2024-07"), so spend can later be grouped per user
# per month. `date_format` is imported in the notebook's top-level import
# cell rather than here, so all dependencies are declared in one place.
df_with_month = df_joined.withColumn("month", date_format("date", "yyyy-MM"))

print("Data with Month Column:")
df_with_month.show()


Data with Month Column:
+-------+----------+-------------+------+----------+-------+------+-------+
|user_id|expense_id|     category|amount|      date|   name|income|  month|
+-------+----------+-------------+------+----------+-------+------+-------+
|      1|       101|         Food| 12000|2024-07-05|  Alice| 50000|2024-07|
|      1|       102|       Travel|  5000|2024-07-20|  Alice| 50000|2024-07|
|      2|       103|         Food| 10000|2024-07-10|    Bob| 40000|2024-07|
|      2|       104|     Shopping|  8000|2024-07-21|    Bob| 40000|2024-07|
|      3|       105|       Travel| 15000|2024-07-01|Charlie| 30000|2024-07|
|      3|       106|Entertainment|  5000|2024-07-15|Charlie| 30000|2024-07|
+-------+----------+-------------+------+----------+-------+------+-------+



In [12]:
# Collapse the expense rows to one row per (user, month) and total the amounts.
# name and income are part of the grouping key only so they are carried
# through to the summary alongside user_id.
# `sum` here is the Spark aggregate imported from pyspark.sql.functions,
# not Python's built-in sum().
df_summary = (
    df_with_month
    .groupBy("user_id", "name", "income", "month")
    .agg(sum("amount").alias("total_spend"))
)

print("Monthly Summary (total spend):")
df_summary.show()

Monthly Summary (total spend):
+-------+-------+------+-------+-----------+
|user_id|   name|income|  month|total_spend|
+-------+-------+------+-------+-----------+
|      3|Charlie| 30000|2024-07|      20000|
|      2|    Bob| 40000|2024-07|      18000|
|      1|  Alice| 50000|2024-07|      17000|
+-------+-------+------+-------+-----------+



In [14]:
# Derive two columns from the monthly summary:
#   savings - income minus that month's total spend (negative when overspent)
#   alert   - True when the month's total spend is greater than income
df_final = (
    df_summary
    .withColumn("savings", col("income") - col("total_spend"))
    .withColumn("alert", col("total_spend") > col("income"))
)

print("Final Data with Savings and Alerts:")
df_final.show()

Final Data with Savings and Alerts:
+-------+-------+------+-------+-----------+-------+-----+
|user_id|   name|income|  month|total_spend|savings|alert|
+-------+-------+------+-------+-----------+-------+-----+
|      3|Charlie| 30000|2024-07|      20000|  10000|false|
|      2|    Bob| 40000|2024-07|      18000|  22000|false|
|      1|  Alice| 50000|2024-07|      17000|  33000|false|
+-------+-------+------+-------+-----------+-------+-----+



In [15]:
# Destination for the final summary. Spark writes a directory at this path
# containing the CSV part file(s), not a single named file.
output_path = "/content/drive/MyDrive/monthly_summary"

# coalesce(1) reduces the result to a single partition so only one CSV part
# file is produced; any existing output at the path is overwritten and a
# header row is written.
df_final.coalesce(1).write.csv(output_path, mode="overwrite", header=True)

print(f" File successfully written to: {output_path}")

 File successfully written to: /content/drive/MyDrive/monthly_summary
