In [1]:
import pyspark
from pyspark.sql import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import seaborn as sns
spark=SparkSession.builder.appName('Apple DataSet').getOrCreate()
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Products 

In [2]:
products_schema=StructType([StructField('Product_ID',StringType(),True),
                           StructField('Product_Name',StringType(),True),
                            StructField('Category_ID',StringType(),True),
                             StructField('Launch_Date',StringType(),True),
                             StructField('Price',IntegerType(),True)
                            
                           ])
products=spark.read.csv('products.csv',schema=products_schema,header=True)
products = products.withColumn("launch_date", to_date("launch_date","dd-MM-yyyy"))

products.show()

+----------+--------------------+-----------+-----------+-----+
|Product_ID|        Product_Name|Category_ID|launch_date|Price|
+----------+--------------------+-----------+-----------+-----+
|       P-1|             MacBook|      CAT-1| 2023-09-17| 1149|
|       P-2|    MacBook Air (M1)|      CAT-1| 2023-11-11| 1783|
|       P-3|    MacBook Air (M2)|      CAT-1| 2020-05-24| 1588|
|       P-4| MacBook Pro 13-inch|      CAT-1| 2021-01-17| 1351|
|       P-5| MacBook Pro 14-inch|      CAT-1| 2024-05-12|  768|
|       P-6| MacBook Pro 16-inch|      CAT-1| 2021-12-28| 1179|
|       P-7|    MacBook (Retina)|      CAT-1| 2023-10-17|  231|
|       P-8|MacBook Air (Retina)|      CAT-1| 2020-01-16| 1847|
|       P-9|MacBook Pro (Touc...|      CAT-1| 2024-06-17| 1304|
|      P-10|MacBook (Early 2015)|      CAT-1| 2022-11-19|  741|
|      P-11|AirPods (2nd Gene...|      CAT-2| 2022-06-13|  504|
|      P-12|AirPods (3rd Gene...|      CAT-2| 2021-05-30| 1842|
|      P-13|         AirPods Pro|      C

In [3]:
# checking for missing values
products.select([sum(col(c).isNull().cast('int')).alias(c) for c in products.columns]).show()

+----------+------------+-----------+-----------+-----+
|Product_ID|Product_Name|Category_ID|launch_date|Price|
+----------+------------+-----------+-----------+-----+
|         0|           0|          0|          0|    0|
+----------+------------+-----------+-----------+-----+



In [4]:
# checking for duplicates
products.groupby(['Product_ID','Product_Name','Category_ID']).agg(count('Product_ID').alias('count_of_Product_ID')).filter('count_of_Product_ID>1').show()

+----------+------------+-----------+-------------------+
|Product_ID|Product_Name|Category_ID|count_of_Product_ID|
+----------+------------+-----------+-------------------+
+----------+------------+-----------+-------------------+



#  category

In [5]:
category_schema=StructType([StructField('category_id',StringType(),True),
                           StructField('category_name',StringType(),True)])
category=spark.read.csv('category.csv',schema=category_schema,header=True)
category.show()

+-----------+--------------------+
|category_id|       category_name|
+-----------+--------------------+
|      CAT-1|              Laptop|
|      CAT-2|               Audio|
|      CAT-3|              Tablet|
|      CAT-4|          Smartphone|
|      CAT-5|            Wearable|
|      CAT-6|    Streaming Device|
|      CAT-7|             Desktop|
|      CAT-8|Subscription Service|
|      CAT-9|       Smart Speaker|
|     CAT-10|         Accessories|
+-----------+--------------------+



# sales

In [6]:
sales=spark.read.csv('sales.csv',inferSchema=True,header=True)
sales.printSchema()

root
 |-- sale_id: string (nullable = true)
 |-- sale_date: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- quantity: integer (nullable = true)



In [7]:
sales_schema = StructType([
    StructField('sale_id', StringType(), True),
    StructField('sale_date', StringType(), True), 
    StructField('store_id', StringType(), True),
    StructField('product_id', StringType(), True),
    StructField('quantity', IntegerType(), True)
])
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
sales = spark.read.csv('sales.csv', schema=sales_schema, header=True)
sales = sales.withColumn("sale_date", to_date("sale_date","dd-MM-yyyy"))
sales.show()

+---------+----------+--------+----------+--------+
|  sale_id| sale_date|store_id|product_id|quantity|
+---------+----------+--------+----------+--------+
|  YG-8782|2023-06-16|   ST-10|      P-38|      10|
|QX-999001|2022-04-13|   ST-63|      P-48|      10|
| JG-46890|2021-07-05|   ST-26|      P-79|       5|
|  XJ-1731|2022-07-20|   ST-15|      P-24|       9|
| FG-95080|2022-03-18|   ST-35|      P-69|       7|
|  II-3954|2024-04-19|   ST-28|      P-45|      10|
|  ES-7238|2020-05-11|    ST-3|      P-24|       6|
|YO-701453|2023-03-27|   ST-73|       P-9|       7|
| JB-46303|2024-08-31|   ST-14|      P-29|       5|
|  QY-9761|2021-07-14|   ST-44|      P-24|       1|
|  RR-9679|2021-11-15|   ST-58|      P-54|       9|
|PT-825391|2021-08-10|    ST-7|      P-62|       8|
|  JY-9420|2020-05-14|   ST-62|      P-47|       9|
|  WB-9150|2022-11-04|   ST-18|      P-57|       7|
|UN-941274|2023-10-27|   ST-10|      P-87|       4|
|KL-629620|2021-05-29|   ST-70|      P-52|       7|
|  LH-1755|2

In [8]:
# Checking Missing Data in sales data
sales.select([sum(col(c).isNull().cast('int')).alias(c) for c in sales.columns]).show()

+-------+---------+--------+----------+--------+
|sale_id|sale_date|store_id|product_id|quantity|
+-------+---------+--------+----------+--------+
|      0|        0|       0|         0|       0|
+-------+---------+--------+----------+--------+



In [9]:
# checking for duplicates
sales.groupby('sale_id').agg(count('sale_id').alias('count_of_salesid')).filter('count_of_salesid>1').show()

+-------+----------------+
|sale_id|count_of_salesid|
+-------+----------------+
+-------+----------------+



# stores

In [10]:
store_schema = StructType([
    StructField('Store_ID',   StringType(), True),
    StructField('Store_Name', StringType(), True),
    StructField('City',       StringType(), True),
    StructField('Country',    StringType(), True)
])
stores = spark.read.csv('stores.csv', schema=store_schema, header=True)
stores.show()

+--------+--------------------+-------------+--------------+
|Store_ID|          Store_Name|         City|       Country|
+--------+--------------------+-------------+--------------+
|    ST-1|  Apple Fifth Avenue|     New York| United States|
|    ST-2|  Apple Union Square|San Francisco| United States|
|    ST-3|Apple Michigan Av...|      Chicago| United States|
|    ST-4|     Apple The Grove|  Los Angeles| United States|
|    ST-5|          Apple SoHo|     New York| United States|
|    ST-6| Apple Grand Central|     New York| United States|
|    ST-7|Apple Beverly Center|  Los Angeles| United States|
|    ST-8| Apple Pioneer Place|     Portland| United States|
|    ST-9|Apple Park Visito...|    Cupertino| United States|
|   ST-10|Apple South Coast...|   Costa Mesa| United States|
|   ST-11|     Apple Ala Moana|     Honolulu| United States|
|   ST-12|Apple North Michi...|      Chicago| United States|
|   ST-13| Apple Walnut Street| Philadelphia| United States|
|   ST-14|Apple The Amer

In [11]:
#cheking missing values in stores dataset
stores.select([sum(col(c).isNull().cast('int')).alias(c) for c in stores.columns]).show()

+--------+----------+----+-------+
|Store_ID|Store_Name|City|Country|
+--------+----------+----+-------+
|       0|         0|   0|      0|
+--------+----------+----+-------+



In [12]:
# checking for duplicates
stores.groupby(['Store_ID','Store_Name','City','Country']).agg(count('Store_id').alias('total_countof_storeid')).filter('total_countof_storeid>1').show()

+--------+----------+----+-------+---------------------+
|Store_ID|Store_Name|City|Country|total_countof_storeid|
+--------+----------+----+-------+---------------------+
+--------+----------+----+-------+---------------------+



# warranty

In [13]:
store_schema = StructType([
    StructField("claim_id", StringType(), True),
    StructField("claim_date", StringType(), True),
    StructField("sale_id", StringType(), True),
    StructField("repair_status", StringType(), True)
])
warranty=spark.read.csv('warranty.csv',schema=store_schema,header=True)
warranty = warranty.withColumn("claim_date", to_date("claim_date","dd-MM-yyyy"))
warranty.show()


+--------+----------+---------+-------------+
|claim_id|claim_date|  sale_id|repair_status|
+--------+----------+---------+-------------+
|CL-58750|2024-01-30|  YG-8782|    Completed|
| CL-8874|2024-06-25|QX-999001|      Pending|
|CL-14486|2024-08-13| JG-46890|      Pending|
|CL-42187|2024-09-19|  XJ-1731|      Pending|
|CL-37590|2024-09-16| FG-95080|    Completed|
|CL-30514|2024-09-05|  II-3954|      Pending|
|CL-47122|2024-07-07|  ES-7238|    Completed|
| CL-5722|2024-03-10|YO-701453|  In Progress|
|CL-75561|2024-01-24| JB-46303|    Completed|
| CL-3634|2024-10-26|  QY-9761|  In Progress|
| CL-8719|2024-07-14|  RR-9679|     Rejected|
|CL-49704|2024-04-03|PT-825391|     Rejected|
|CL-35213|2024-01-23|  JY-9420|    Completed|
|CL-27667|2024-10-06|  WB-9150|      Pending|
| CL-7124|2024-01-04|UN-941274|    Completed|
| CL-3763|2024-02-19|KL-629620|    Completed|
| CL-6383|2024-02-05|  LH-1755|      Pending|
| CL-6345|2024-04-16|LK-385613|  In Progress|
|CL-78615|2024-11-09| JM-75607|   

In [14]:
# checking Missing Value in warranty dataset
warranty.select([sum(col(c).isNull().cast('int')).alias(c) for c in warranty.columns]).show()

+--------+----------+-------+-------------+
|claim_id|claim_date|sale_id|repair_status|
+--------+----------+-------+-------------+
|       0|         0|      0|            0|
+--------+----------+-------+-------------+



In [15]:
# checking for duplicate in warranty
warranty.groupby(['claim_id','claim_date','sale_id','repair_status']).agg(count('claim_date').alias('no_of_claimid')).filter('no_of_claimid>1').show()

+--------+----------+-------+-------------+-------------+
|claim_id|claim_date|sale_id|repair_status|no_of_claimid|
+--------+----------+-------+-------------+-------------+
+--------+----------+-------+-------------+-------------+



In [16]:
# total no of stores
stores.count()

75

In [17]:
# total no of products
products.count()

89

In [18]:
# total no of products category
category.count()

10

In [19]:
# Ensure sale_date >= launch_date

prosal_join=sales.join(products,'Product_ID')
prosal_join.filter(col('Sale_date') < col('Launch_Date')) \
           .select('Product_ID') \
           .distinct() \
           .show()

+----------+
|Product_ID|
+----------+
|      P-53|
|      P-75|
|      P-56|
|      P-88|
|      P-23|
|       P-7|
|      P-55|
|      P-12|
|      P-78|
|       P-1|
|      P-35|
|      P-57|
|      P-18|
|      P-69|
|      P-28|
|      P-19|
|      P-64|
|      P-74|
|      P-45|
|      P-22|
+----------+
only showing top 20 rows



In [20]:
# Total Revenue
product_sales_join=products.join(sales,'product_id')
revenue=product_sales_join.groupby(['product_name','product_id']).agg(sum(col('Price')*col('quantity')).alias('Revenue'))
revenue.show()

+--------------------+----------+---------+
|        product_name|product_id|  Revenue|
+--------------------+----------+---------+
|            Mac Mini|      P-65| 35880750|
|MacBook Pro (Touc...|       P-9| 83053064|
|   Beats Studio Buds|      P-16| 48555402|
|iPad mini (5th Ge...|      P-25|123951136|
|      iPhone 14 Plus|      P-33| 81068269|
|    iPad Pro 11-inch|      P-28|114056448|
|          Mac Studio|      P-64| 39241808|
|   iPhone 12 Pro Max|      P-44| 35010752|
|           Apple TV+|      P-69| 25897612|
|iPad (10th Genera...|      P-22|111919645|
|Silicone Case for...|      P-85| 78559250|
|       iPhone 12 Pro|      P-43| 84240024|
|Apple Pencil (1st...|      P-80| 58706975|
|Apple Watch Series 5|      P-53| 83695680|
|       iPhone 13 Pro|      P-38| 19622988|
| MacBook Pro 13-inch|       P-4| 86110038|
|Smart Keyboard Folio|      P-82| 36523916|
|MacBook (Early 2015)|      P-10| 47412885|
|Apple Watch Series 7|      P-51| 94072509|
| MacBook Pro 16-inch|       P-6

In [21]:
#Total Units Sold
sales.groupby('product_id').agg(sum('quantity').alias('total_units_sold')).show()

+----------+----------------+
|product_id|total_units_sold|
+----------+----------------+
|      P-53|           64580|
|      P-75|           64807|
|      P-56|           64524|
|      P-88|           64736|
|      P-23|           63597|
|      P-55|           64035|
|       P-7|           64422|
|      P-12|           64102|
|      P-78|           64417|
|       P-1|           63980|
|      P-35|           64227|
|      P-57|           64115|
|      P-18|           63487|
|      P-69|           64103|
|      P-28|           64768|
|      P-19|           63396|
|      P-64|           64016|
|      P-74|           65623|
|      P-45|           65481|
|      P-22|           64507|
+----------+----------------+
only showing top 20 rows



In [22]:
#Average Selling Price (ASP) per product and category.
prosal_join=sales.join(products,'Product_ID')
prosal_join.groupby(['product_id','category_id']).agg(avg('price').alias('average_selling_price')).show()

+----------+-----------+---------------------+
|product_id|category_id|average_selling_price|
+----------+-----------+---------------------+
|      P-89|     CAT-10|               1434.0|
|      P-50|      CAT-5|               1631.0|
|      P-66|      CAT-7|                874.0|
|      P-69|      CAT-8|                404.0|
|      P-47|      CAT-5|               1546.0|
|      P-25|      CAT-3|               1912.0|
|      P-56|      CAT-6|                477.0|
|      P-38|      CAT-4|                308.0|
|      P-39|      CAT-4|                820.0|
|      P-64|      CAT-7|                613.0|
|      P-17|      CAT-2|               1839.0|
|      P-75|      CAT-9|                266.0|
|      P-88|     CAT-10|                677.0|
|      P-78|     CAT-10|                313.0|
|      P-23|      CAT-3|               1949.0|
|      P-86|     CAT-10|               1389.0|
|      P-35|      CAT-4|                678.0|
|      P-81|     CAT-10|               1506.0|
|      P-84| 

In [23]:
# Top 10 Products by revenue 
top_10_products=revenue.select('product_id','revenue').orderBy(col('Revenue').desc())
top_10_products.limit(10).show()



+----------+---------+
|product_id|  revenue|
+----------+---------+
|      P-68|125453460|
|      P-58|124006372|
|      P-25|123951136|
|      P-23|123950553|
|      P-17|118449990|
|       P-8|118110109|
|      P-12|118075884|
|      P-31|116029518|
|      P-28|114056448|
|       P-2|113204453|
+----------+---------+



In [24]:
#Top 5 Categories  by revenue 
product_sales_join=products.join(sales,'product_id')
revenue_by_category=product_sales_join.groupby('category_id').agg(sum(col('Price')*col('quantity')).alias('Revenue')).orderBy(col('Revenue').desc())
revenue_by_category.limit(5).show()

+-----------+---------+
|category_id|  Revenue|
+-----------+---------+
|      CAT-3|953443623|
|     CAT-10|927115953|
|      CAT-4|865147932|
|      CAT-2|794980579|
|      CAT-1|763382551|
+-----------+---------+



In [25]:
#Top 5 Stores  by revenu
product_sales_join.join(stores,'store_id').groupby('store_id')\
.agg(sum(col('price')*col('quantity')).alias('revenue'))\
.orderBy(col('revenue').desc()).limit(5).show()

+--------+--------+
|store_id| revenue|
+--------+--------+
|   ST-30|84478026|
|   ST-56|83830993|
|   ST-44|83706593|
|   ST-73|83702690|
|   ST-34|83576408|
+--------+--------+



In [26]:
# top 10  city by revenue
top_10_city_by_revenue=product_sales_join.join(stores,'store_id').groupby('city').agg(sum(col('price')*col('quantity'))\
.alias('revenue')).orderBy(col('revenue').desc())
top_10_city_by_revenue.limit(10).show()

+-----------+---------+
|       city|  revenue|
+-----------+---------+
|      Dubai|329415320|
|     London|328969720|
|      Paris|325659768|
|   New York|249137133|
|  Melbourne|247980340|
|      Tokyo|246179087|
|Mexico City|245375347|
|  Singapore|244923010|
|    Bangkok|244917957|
|      Seoul|166244072|
+-----------+---------+



In [27]:
# top 10  country by revenue
top_10_country_by_revenue=product_sales_join.join(stores,'store_id').groupby('country').agg(sum(col('price')*col('quantity'))\
.alias('revenue')).orderBy(col('revenue').desc())
top_10_country_by_revenue.limit(10).show()

+--------------+----------+
|       country|   revenue|
+--------------+----------+
| United States|1234842804|
|     Australia| 577191700|
|         China| 575151496|
|         Japan| 495948710|
|           UAE| 412424721|
|        Canada| 410740833|
|United Kingdom| 328969720|
|        France| 325659768|
|       Germany| 248821585|
|        Mexico| 245375347|
+--------------+----------+



In [28]:
# Monthly Revenue Trend.
Monthly_Revenue_Trend=product_sales_join.groupby(month('sale_date').alias('month')).agg(sum(col('price')*col('quantity'))\
.alias('revenue')).orderBy(col('revenue').desc())
Monthly_Revenue_Trend.show()

+-----+---------+
|month|  revenue|
+-----+---------+
|    3|542790865|
|    1|539339571|
|    8|536711821|
|    5|536469161|
|   10|536190097|
|    7|534938767|
|    4|522063998|
|    6|521085473|
|    9|518731293|
|    2|491218827|
|   11|461412527|
|   12|425340630|
+-----+---------+



In [29]:
products.printSchema()

root
 |-- Product_ID: string (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Category_ID: string (nullable = true)
 |-- launch_date: date (nullable = true)
 |-- Price: integer (nullable = true)



# phase 3

In [30]:
product_sales_join.join(category,'category_id').groupby(['category_name','category_id'])\
.agg(sum(col('price')*col('quantity')).alias('revenue')).show()

+--------------------+-----------+---------+
|       category_name|category_id|  revenue|
+--------------------+-----------+---------+
|          Smartphone|      CAT-4|865147932|
|       Smart Speaker|      CAT-9| 96117508|
|Subscription Service|      CAT-8|368463489|
|              Tablet|      CAT-3|953443623|
|            Wearable|      CAT-5|667537447|
|              Laptop|      CAT-1|763382551|
|    Streaming Device|      CAT-6|191622603|
|         Accessories|     CAT-10|927115953|
|             Desktop|      CAT-7|538481345|
|               Audio|      CAT-2|794980579|
+--------------------+-----------+---------+



In [31]:
# Warranty Claim Rate per Product
sale_warranty_join=sales.join(warranty,'sale_id')
sale_warranty_join.join(products,'product_id').groupby(['product_id','product_name'])\
.agg(count('claim_id').alias('total_no_of_claims')).orderBy(col('total_no_of_claims').desc()).show()

+----------+--------------------+------------------+
|product_id|        product_name|total_no_of_claims|
+----------+--------------------+------------------+
|       P-9|MacBook Pro (Touc...|               381|
|      P-39|   iPhone 13 Pro Max|               372|
|      P-17|       Beats Fit Pro|               370|
|      P-83|Smart Cover for iPad|               370|
|      P-69|           Apple TV+|               368|
|      P-38|       iPhone 13 Pro|               367|
|      P-45|Apple Watch Series 9|               367|
|      P-87|     MagSafe Charger|               366|
|      P-74|             HomePod|               365|
|      P-24|iPad mini (6th Ge...|               362|
|      P-30|       iPad Pro (M1)|               362|
|      P-16|   Beats Studio Buds|               361|
|      P-27|iPad Air (4th Gen...|               361|
|      P-32|           iPhone 14|               359|
|      P-65|            Mac Mini|               359|
|      P-20|HomePod (2nd Gene...|             

In [32]:
# Repair Status Breakdown (% repaired, open, replaced)
total_no_of_claims=warranty.count()
warranty.groupby('repair_status').agg((count('claim_id')/total_no_of_claims)*100).show()

+-------------+---------------------------------+
|repair_status|((count(claim_id) / 30000) * 100)|
+-------------+---------------------------------+
|    Completed|               24.886666666666667|
|     Rejected|               24.523333333333333|
|  In Progress|               25.369999999999997|
|      Pending|                            25.22|
+-------------+---------------------------------+



In [33]:
# Time-to-Claim Analysis
sale_warranty_join.withColumn('time_took_to_claim',months_between('sale_date','claim_date')).show()


+---------+----------+--------+----------+--------+--------+----------+-------------+------------------+
|  sale_id| sale_date|store_id|product_id|quantity|claim_id|claim_date|repair_status|time_took_to_claim|
+---------+----------+--------+----------+--------+--------+----------+-------------+------------------+
|  YG-8782|2023-06-16|   ST-10|      P-38|      10|CL-58750|2024-01-30|    Completed|        -7.4516129|
|QX-999001|2022-04-13|   ST-63|      P-48|      10| CL-8874|2024-06-25|      Pending|      -26.38709677|
| JG-46890|2021-07-05|   ST-26|      P-79|       5|CL-14486|2024-08-13|      Pending|      -37.25806452|
|  XJ-1731|2022-07-20|   ST-15|      P-24|       9|CL-42187|2024-09-19|      Pending|      -25.96774194|
| FG-95080|2022-03-18|   ST-35|      P-69|       7|CL-37590|2024-09-16|    Completed|      -29.93548387|
|  II-3954|2024-04-19|   ST-28|      P-45|      10|CL-30514|2024-09-05|      Pending|        -4.5483871|
|  ES-7238|2020-05-11|    ST-3|      P-24|       6|CL-4

In [34]:
# Product Launch Impact → revenue in first 30,

product_sales_join.filter(datediff('Launch_Date','sale_date')<=30).groupby('product_name').agg(sum(col('price')*col('quantity')).alias('revenue')).show()

+--------------------+--------+
|        product_name| revenue|
+--------------------+--------+
|              iCloud|60817957|
|        HomePod mini|20674680|
|AirPods (3rd Gene...|85765362|
|MacBook (Early 2015)|20101848|
|            iMac Pro|46125189|
|   Apple Watch Ultra| 1340404|
|           Apple TV+|  604788|
|Beats Powerbeats Pro|55104100|
|HomePod (2nd Gene...|38402104|
|         AirPods Max|49002107|
|      Magic Keyboard|21906304|
|    MacBook Air (M1)|25131385|
|   iPhone 14 Pro Max| 1681440|
|Apple Watch Nike ...|52409708|
|iMac with Retina ...|20192410|
|Leather Case for ...|28944318|
|        Apple Arcade| 1817190|
|           Apple One|29110176|
|       iPhone 12 Pro| 6259088|
|             MacBook|18978033|
+--------------------+--------+
only showing top 20 rows



In [35]:
# Product Launch Impact → revenue in first 60
product_sales_join.filter(datediff('Launch_Date','sale_date')<=60).groupby('product_name').agg(sum(col('price')*col('quantity')).alias('revenue')).show()

+--------------------+--------+
|        product_name| revenue|
+--------------------+--------+
|              iCloud|62125324|
|        HomePod mini|21482402|
|AirPods (3rd Gene...|87695778|
|MacBook (Early 2015)|20808021|
|            iMac Pro|47322937|
|   Apple Watch Ultra| 1678446|
|           Apple TV+| 1049996|
|Beats Powerbeats Pro|56153500|
|HomePod (2nd Gene...|39057343|
|         AirPods Max|50712508|
|      Magic Keyboard|22347168|
|    MacBook Air (M1)|27062374|
|   iPhone 14 Pro Max| 2445546|
|Apple Watch Nike ...|53352052|
|iMac with Retina ...|20529236|
|Leather Case for ...|30030282|
|        Apple Arcade| 2367837|
|           Apple One|29666496|
|       iPhone 12 Pro| 7402472|
|             MacBook|20230443|
+--------------------+--------+
only showing top 20 rows



In [36]:
# Product Launch Impact → revenue in first 90
product_sales_join.filter(datediff('Launch_Date','sale_date')<=90).groupby('product_name').agg(sum(col('price')*col('quantity')).alias('revenue')).show()

+--------------------+--------+
|        product_name| revenue|
+--------------------+--------+
|              iCloud|63500008|
|        HomePod mini|22189694|
|AirPods (3rd Gene...|89663034|
|MacBook (Early 2015)|21581625|
|            iMac Pro|48315295|
|   Apple Watch Ultra| 2069080|
|           Apple TV+| 1500052|
|Beats Powerbeats Pro|57262260|
|HomePod (2nd Gene...|39866229|
|         AirPods Max|52413291|
|      Magic Keyboard|22722160|
|    MacBook Air (M1)|28989797|
|   iPhone 14 Pro Max| 3147276|
|Apple Watch Nike ...|54291744|
|iMac with Retina ...|20928145|
|Leather Case for ...|31117112|
|        Apple Arcade| 2970639|
|           Apple One|30181336|
|       iPhone 12 Pro| 8964560|
|             MacBook|21464469|
+--------------------+--------+
only showing top 20 rows

