### 🛠Ô∏è Day 3 Tasks:

1. Load full e-commerce dataset
2. Perform complex joins
3. Calculate running totals with window functions
4. Create derived features

## Task 1 :  Load full e-commerce dataset

In [0]:
Oct_events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",
    header=True,
    inferSchema=True
)

Nov_events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",
    header=True,
    inferSchema=True
)


Combining (Union) both months

In [0]:
full_events = Oct_events.unionByName(Nov_events)


Validating

In [0]:
full_events.count()



109950743

In [0]:
full_events.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



## Task 2: Perform Complex Joins

In [0]:
from pyspark.sql.functions import sum, count

# Create user-level summary
user_summary = full_events.groupBy("user_id").agg(
    count("*").alias("total_events"),
    sum("price").alias("total_spent")
)

# Join with main dataset
joined_df = full_events.join(
    user_summary,
    on="user_id",
    how="left"
)


Validation

In [0]:
joined_df.select("user_id", "price", "total_spent").show(5)


+---------+------+------------------+
|  user_id| price|       total_spent|
+---------+------+------------------+
|512647474|226.26|149813.59000000003|
|515547719|205.38|10898.180000000002|
|554748717|  33.2|            277.56|
|512515208|142.29|157677.28000000003|
|514109745| 24.04|         121634.49|
+---------+------+------------------+
only showing top 5 rows


## Task 3: Running Totals with Window Functions

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum

# Define window specification
window_spec = Window.partitionBy("user_id") \
    .orderBy("event_time") \
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Calculate running total
running_total_df = joined_df.withColumn(
    "running_total_spent",
    sum("price").over(window_spec)
)


Validation

In [0]:
running_total_df.select(
    "user_id", "event_time", "price", "running_total_spent"
).show(5)


+---------+-------------------+------+-------------------+
|  user_id|         event_time| price|running_total_spent|
+---------+-------------------+------+-------------------+
| 65800726|2019-11-27 04:33:16|  81.8|               81.8|
| 65800726|2019-11-27 04:35:24|  81.8|              163.6|
| 81255481|2019-11-08 07:44:45| 66.35|              66.35|
| 81255481|2019-11-21 14:11:26| 66.14|             132.49|
|106416780|2019-11-28 05:43:46|256.53|             256.53|
+---------+-------------------+------+-------------------+
only showing top 5 rows


## Task 4: Create Derived Features

High-value Transaction Flag

In [0]:
from pyspark.sql.functions import when

feature_df = running_total_df.withColumn(
    "high_value_transaction",
    when(running_total_df.price > 100, 1).otherwise(0)
)


User Spending Category

In [0]:
feature_df = feature_df.withColumn(
    "user_spending_category",
    when(feature_df.total_spent > 1000, "High")
    .when(feature_df.total_spent > 500, "Medium")
    .otherwise("Low")
)


Event Date Extraction

In [0]:
from pyspark.sql.functions import to_date

feature_df = feature_df.withColumn(
    "event_date",
    to_date("event_time")
)


Final Validation

In [0]:
feature_df.select(
    "user_id",
    "price",
    "running_total_spent",
    "high_value_transaction",
    "user_spending_category",
    "event_date"
).show(5)


+---------+------+-------------------+----------------------+----------------------+----------+
|  user_id| price|running_total_spent|high_value_transaction|user_spending_category|event_date|
+---------+------+-------------------+----------------------+----------------------+----------+
| 65800726|  81.8|               81.8|                     0|                   Low|2019-11-27|
| 65800726|  81.8|              163.6|                     0|                   Low|2019-11-27|
| 81255481| 66.35|              66.35|                     0|                   Low|2019-11-08|
| 81255481| 66.14|             132.49|                     0|                   Low|2019-11-21|
|106416780|256.53|             256.53|                     1|                  High|2019-11-28|
+---------+------+-------------------+----------------------+----------------------+----------+
only showing top 5 rows
