In [0]:
dbutils.fs.mount(
  source = "wasbs://image@june161.blob.core.windows.net/",
  mount_point = "/mnt/june161",
  extra_configs = {
    "fs.azure.account.key.june161.blob.core.windows.net": "wrRw321e+rDzAkRZAAEw3BhbPH1wyhLSjx/w0MAAiCxic0ff3l0dO4xYLHwDHvH78RVOyjxMs0Z8+AStqpBEIA=="
  }
)


True

In [0]:
subs_df = spark.read.option("header", True).option("inferSchema", True).csv("/mnt/june161/subscriptions.csv")
activity_df = spark.read.option("header", True).option("inferSchema", True).csv("/mnt/june161/user_activity.csv")


In [0]:
from pyspark.sql.functions import to_date, to_timestamp, count, datediff, col

# Convert date columns
subs_df = subs_df.withColumn("StartDate", to_date("StartDate")) \
                 .withColumn("EndDate", to_date("EndDate"))

activity_df = activity_df.withColumn("EventTime", to_timestamp("EventTime"))

# Compute active_days
subs_df = subs_df.withColumn("active_days", datediff("EndDate", "StartDate"))

# Events per user
event_counts = activity_df.groupBy("UserID").agg(count("*").alias("events_per_user"))

# Join + calculate score
engagement = subs_df.join(event_counts, on="UserID", how="left").fillna(0)
engagement = engagement.withColumn("engagement_score", 
                                   (col("events_per_user") / col("active_days")) * col("PriceUSD"))

# Show results
engagement.select("SubscriptionID", "UserID", "PlanType", "engagement_score").show()


+--------------+------+--------+------------------+
|SubscriptionID|UserID|PlanType|  engagement_score|
+--------------+------+--------+------------------+
|        SUB001|  U001|   Basic|0.6593406593406594|
|        SUB002|  U002|     Pro|               1.0|
|        SUB003|  U003|     Pro|0.9782608695652174|
|        SUB004|  U001| Premium|2.6373626373626378|
|        SUB005|  U004|   Basic|0.3296703296703297|
+--------------+------+--------+------------------+



In [0]:
subs_df.createOrReplaceTempView("subscriptions")
activity_df.createOrReplaceTempView("user_activity")


In [0]:
spark.sql("""
    SELECT DISTINCT s.UserID
    FROM subscriptions s
    JOIN user_activity a ON s.UserID = a.UserID
    WHERE s.IsActive = true
""").show()



+------+
|UserID|
+------+
|  U002|
|  U001|
+------+



In [0]:
spark.sql("""
    SELECT s.UserID, s.AutoRenew
    FROM subscriptions s
    LEFT JOIN (
        SELECT UserID, MAX(EventTime) as last_activity
        FROM user_activity
        GROUP BY UserID
    ) a ON s.UserID = a.UserID
    WHERE s.AutoRenew = true AND datediff(current_date(), last_activity) > 30
""").show()


+------+---------+
|UserID|AutoRenew|
+------+---------+
|  U001|     true|
|  U001|     true|
+------+---------+



In [0]:
subs_df.write.mode("overwrite").format("delta").save("/mnt/june161/delta_subs")


In [0]:
from delta.tables import DeltaTable
delta_subs = DeltaTable.forPath(spark, "/mnt/june161/delta_subs")


In [0]:
from pyspark.sql.functions import col, month, year
from delta.tables import DeltaTable

# Path to your Delta table
delta_path = "/mnt/june161/delta_subs"

# Step 1: Load Delta table as DataFrame
subs_df = spark.read.format("delta").load(delta_path)

# Step 2: Create update DataFrame for Pro plans started in March 2024
updates = subs_df.filter(
    (col("PlanType") == "Pro") & 
    (month("StartDate") == 3) & 
    (year("StartDate") == 2024)
).withColumn("PriceUSD", col("PriceUSD") + 5)

# Step 3: Load DeltaTable object for merge
delta_subs = DeltaTable.forPath(spark, delta_path)

# Step 4: Perform the merge
delta_subs.alias("target").merge(
    updates.alias("updates"),
    "target.SubscriptionID = updates.SubscriptionID"
).whenMatchedUpdate(set={
    "PriceUSD": "updates.PriceUSD"
}).execute()

# Step 5: Show updated Pro plans from March 2024
print("Updated Pro plan subscriptions for March 2024:")
spark.read.format("delta").load(delta_path) \
    .filter((col("PlanType") == "Pro") & 
            (month("StartDate") == 3) & 
            (year("StartDate") == 2024)) \
    .select("SubscriptionID", "UserID", "StartDate", "PlanType", "PriceUSD") \
    .show()



Updated Pro plan subscriptions for March 2024:
+--------------+------+----------+--------+--------+
|SubscriptionID|UserID| StartDate|PlanType|PriceUSD|
+--------------+------+----------+--------+--------+
|        SUB003|  U003|2024-03-10|     Pro|   100.0|
+--------------+------+----------+--------+--------+



In [0]:
spark.sql("DESCRIBE HISTORY delta.`/mnt/june161/delta_subs`").show(truncate=False)


+-------+-------------------+----------------+----------------------------------+---------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
old_df = spark.read.format("delta").option("versionAsOf", 0).load("/mnt/june161/delta_subs")
old_df.filter("PlanType = 'Pro' AND month(StartDate) = 3").select("SubscriptionID", "PriceUSD").show()


+--------------+--------+
|SubscriptionID|PriceUSD|
+--------------+--------+
|        SUB003|    90.0|
+--------------+--------+



In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag

window_spec = Window.partitionBy("UserID").orderBy("StartDate")

migration_df = subs_df.withColumn("prev_plan", lag("PlanType").over(window_spec))
migration_df.filter((col("prev_plan") == "Basic") & (col("PlanType").isin("Pro", "Premium"))).show()


+--------------+------+--------+----------+----------+--------+--------+---------+-----------+---------+
|SubscriptionID|UserID|PlanType| StartDate|   EndDate|PriceUSD|IsActive|AutoRenew|active_days|prev_plan|
+--------------+------+--------+----------+----------+--------+--------+---------+-----------+---------+
|        SUB004|  U001| Premium|2024-04-05|2024-07-05|   120.0|    true|     true|         91|    Basic|
+--------------+------+--------+----------+----------+--------+--------+---------+-----------+---------+



In [0]:
from pyspark.sql.functions import countDistinct, sum, when

power_df = activity_df.withColumn("is_login", when(col("EventType") == "login", 1).otherwise(0)) \
    .groupBy("UserID") \
    .agg(
        countDistinct("FeatureUsed").alias("feature_count"),
        sum("is_login").alias("login_count")
    ).filter("feature_count >= 2 AND login_count >= 3")

# Save to Delta
power_df.write.mode("overwrite").format("delta").save("/mnt/june161/power_users")


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lead, when, unix_timestamp

# Filter only login/logout events
session_df = activity_df.filter(col("EventType").isin("login", "logout"))

# Define a window per user ordered by EventTime
user_window = Window.partitionBy("UserID").orderBy("EventTime")

# Use lead to get next event type and time
session_df = session_df.withColumn("next_event", lead("EventType").over(user_window)) \
                       .withColumn("next_time", lead("EventTime").over(user_window))

# Keep only login events where the next event is logout
login_sessions = session_df.filter((col("EventType") == "login") & (col("next_event") == "logout"))

# Compute session duration in minutes
login_sessions = login_sessions.withColumn(
    "session_duration_minutes",
    (unix_timestamp("next_time") - unix_timestamp("EventTime")) / 60
)

# Select meaningful columns
login_sessions.select(
    "UserID",
    col("EventTime").alias("login_time"),
    col("next_time").alias("logout_time"),
    "session_duration_minutes"
).show(truncate=False)



+------+-------------------+-------------------+------------------------+
|UserID|login_time         |logout_time        |session_duration_minutes|
+------+-------------------+-------------------+------------------------+
|U001  |2024-04-07 10:22:00|2024-04-10 16:00:00|4658.0                  |
+------+-------------------+-------------------+------------------------+

