In [0]:
jdbc_url = "jdbc:mysql://<your_server_name>.mysql.database.azure.com:3306/<your_database_name>"
connection_properties = {
  "user": "your_username",
  "password": "your_password",
  "driver": "com.mysql.cj.jdbc.Driver"
}

df_users = spark.read.jdbc(url=jdbc_url, table="<your_table_name>", properties=connection_properties)
df_users.show()

+---+-------+------------------+--------------------+
| id|   name|           address|               email|
+---+-------+------------------+--------------------+
|  1|  Alice|   123 Main St, NY|alice.johnson@exa...|
|  2|    Bob|  456 Park Ave, CA|bob.smith@example...|
|  3|Charlie|    789 Oak St, TX|charlie.brown@exa...|
|  4|  David|    321 Elm St, FL|david.williams@ex...|
|  5|   Emma|   654 Pine St, WA|emma.davis@exampl...|
|  6|  Frank|  987 Maple St, CO|frank.miller@exam...|
|  7|  Grace|  246 Birch St, IL|grace.wilson@exam...|
|  8| Hannah|  135 Cedar St, NV|hannah.moore@exam...|
|  9|    Ian|  579 Aspen St, AZ|ian.taylor@exampl...|
| 10|   Jack|864 Redwood St, NJ|jack.anderson@exa...|
+---+-------+------------------+--------------------+



In [0]:
df_products = spark.read.jdbc(url=jdbc_url, table="products", properties=connection_properties)
df_orders = spark.read.jdbc(url=jdbc_url, table="orders", properties=connection_properties)
df_orderDetails = spark.read.jdbc(url=jdbc_url, table="orderDetails", properties=connection_properties)
df_userActivity = spark.read.jdbc(url=jdbc_url, table="userActivity", properties=connection_properties)
df_products.show()
df_orders.show()
df_orderDetails.show()
df_userActivity.show()

+---+------------+--------------------+-------+
| id|        name|         description|  price|
+---+------------+--------------------+-------+
|101|      Laptop|15-inch gaming la...|1200.00|
|102|  Smartphone|Flagship 5G smart...| 999.00|
|103|  Headphones|Noise-canceling w...| 250.00|
|104|  Smartwatch|Waterproof fitnes...| 180.00|
|105|      Tablet|10-inch Android t...| 350.00|
|106|     Monitor|27-inch 4K UHD di...| 450.00|
|107|    Keyboard|Mechanical RGB ga...| 120.00|
|108|       Mouse|Wireless ergonomi...|  80.00|
|109|     Printer|All-in-one color ...| 200.00|
|110|External HDD|2TB portable hard...| 100.00|
+---+------------+--------------------+-------+

+----+-------+---------+----------+
|  id|user_id|total_amt|order_date|
+----+-------+---------+----------+
|1001|      1|  1250.00|2024-01-01|
|1002|      2|  1999.00|2024-01-02|
|1003|      3|   250.00|2024-01-03|
|1004|      4|   450.00|2024-01-04|
|1005|      5|   180.00|2024-01-05|
|1006|      6|   999.00|2024-01-06|
|10

In [0]:
df_users.createOrReplaceTempView("users")
df_products.createOrReplaceTempView("products")
df_orders.createOrReplaceTempView("orders")
df_orderDetails.createOrReplaceTempView("orderDetails")
df_userActivity.createOrReplaceTempView("userActivity")

In [0]:
# 1. Identify Users Who Added to Cart but Didn't Purchase
# Business Need: Find users who have added a product to the cart but never completed a purchase.
# Insight: Helps in retargeting users who abandoned their carts.

# The tables that can answer the above question are : users and userActivity
# 1st Method

output1 = spark.sql("""
select u.id, u.name, u.email
from users u
join
    (select distinct ua1.user_id
from userActivity ua1
join userActivity ua2
where
ua1.user_id = ua2.user_id
and ua1.product_id = ua2.product_id
and ua1.activity_type = 'add_to_cart'
and ua2.activity_type != 'purchase'
    )temp
on u.id = temp.user_id""")

output1.show(truncate=False)

+---+-------+-------------------------+
|id |name   |email                    |
+---+-------+-------------------------+
|1  |Alice  |alice.johnson@example.com|
|3  |Charlie|charlie.brown@example.com|
|5  |Emma   |emma.davis@example.com   |
+---+-------+-------------------------+



In [0]:
# 2nd Method
output2 = spark.sql("""select u.id, u.name, u.email
from users u
join(
SELECT DISTINCT ua.user_id
FROM userActivity ua
WHERE ua.activity_type = 'add_to_cart'
AND NOT EXISTS (
    SELECT 1 FROM userActivity ua2
    WHERE ua2.user_id = ua.user_id
    AND ua2.product_id = ua.product_id
    AND ua2.activity_type = 'purchase'
))temp
on u.id = temp.user_id""")

output2.show(truncate=False)

+---+-------+-------------------------+
|id |name   |email                    |
+---+-------+-------------------------+
|1  |Alice  |alice.johnson@example.com|
|3  |Charlie|charlie.brown@example.com|
|5  |Emma   |emma.davis@example.com   |
+---+-------+-------------------------+



In [0]:
# 2. Find the Most Popular Products Based on User Engagement
# Business Need: Determine which products are the most viewed or added to carts before purchase.
# Insight: Helps in identifying high-engagement products for promotions.

# The tables that can answer the above question are : products and userActivity
# 1st Method (Using Subquery and count window function)

output3 = spark.sql("""
select p.id, p.name, p.description, temp.cnt as counts
from products p
join
( select distinct ua.product_id, count(ua.product_id) over (partition by ua.product_id) as cnt
from userActivity ua)temp
on p.id = temp.product_id
order by counts desc
limit 5""")

output3.show(truncate=False)

+---+----------+-----------------------------------+------+
|id |name      |description                        |counts|
+---+----------+-----------------------------------+------+
|102|Smartphone|Flagship 5G smartphone             |4     |
|101|Laptop    |15-inch gaming laptop              |2     |
|103|Headphones|Noise-canceling wireless headphones|2     |
|106|Monitor   |27-inch 4K UHD display             |1     |
|104|Smartwatch|Waterproof fitness tracker         |1     |
+---+----------+-----------------------------------+------+



In [0]:
# 2nd Method (Using count along with Case statement to bifurcate the user_activity)
output4 = spark.sql("""SELECT p.id, p.name,
       COUNT(CASE WHEN ua.activity_type = 'view' THEN 1 END) AS total_views,
       COUNT(CASE WHEN ua.activity_type = 'add_to_cart' THEN 1 END) AS total_adds,
       COUNT(CASE WHEN ua.activity_type = 'purchase' THEN 1 END) AS total_purchases
FROM products p
JOIN userActivity ua ON p.id = ua.product_id
GROUP BY p.id, p.name
ORDER BY total_purchases DESC, total_adds DESC, total_views DESC
LIMIT 5""")

output4.show(truncate=False)

+---+----------+-----------+----------+---------------+
|id |name      |total_views|total_adds|total_purchases|
+---+----------+-----------+----------+---------------+
|102|Smartphone|2          |0         |2              |
|101|Laptop    |1          |1         |0              |
|103|Headphones|1          |1         |0              |
|104|Smartwatch|0          |1         |0              |
|106|Monitor   |1          |0         |0              |
+---+----------+-----------+----------+---------------+



In [0]:
# 3rd Method (Using Sum-IF statement to bifurcate the user_activity)
output5 = spark.sql("""SELECT p.id, p.name,
       SUM(IF(ua.activity_type = 'view', 1, 0)) AS total_views,
       SUM(IF(ua.activity_type = 'add_to_cart', 1, 0)) AS total_adds,
       SUM(IF(ua.activity_type = 'purchase', 1, 0)) AS total_purchases
FROM products p
JOIN userActivity ua ON p.id = ua.product_id
GROUP BY p.id, p.name
ORDER BY total_purchases DESC, total_adds DESC, total_views DESC
LIMIT 5""")

output5.show(truncate=False)

+---+----------+-----------+----------+---------------+
|id |name      |total_views|total_adds|total_purchases|
+---+----------+-----------+----------+---------------+
|102|Smartphone|2          |0         |2              |
|101|Laptop    |1          |1         |0              |
|103|Headphones|1          |1         |0              |
|104|Smartwatch|0          |1         |0              |
|106|Monitor   |1          |0         |0              |
+---+----------+-----------+----------+---------------+



In [0]:
# 3. Calculate Customer Lifetime Value (CLV)
# Business Need: Find the total amount each user has spent.
# Insight: Helps in segmenting high-value customers for exclusive deals.

# The tables that can answer the above question are : users and orders

output6 = spark.sql("""select u.id, u.name, u.email, temp.lifetime_val
from users u
join
    (select o.user_id, sum(total_amt) as lifetime_val
from orders o
group by o.user_id)temp
on u.id = temp.user_id
order by temp.lifetime_val desc""")

output6.show(truncate=False)

+---+-------+--------------------------+------------+
|id |name   |email                     |lifetime_val|
+---+-------+--------------------------+------------+
|2  |Bob    |bob.smith@example.com     |1999.00     |
|1  |Alice  |alice.johnson@example.com |1250.00     |
|6  |Frank  |frank.miller@example.com  |999.00      |
|4  |David  |david.williams@example.com|450.00      |
|7  |Grace  |grace.wilson@example.com  |350.00      |
|3  |Charlie|charlie.brown@example.com |250.00      |
|5  |Emma   |emma.davis@example.com    |180.00      |
|8  |Hannah |hannah.moore@example.com  |120.00      |
|10 |Jack   |jack.anderson@example.com |100.00      |
|9  |Ian    |ian.taylor@example.com    |80.00       |
+---+-------+--------------------------+------------+

