In [1]:
# Install Java and Spark
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark



tar: spark-3.4.1-bin-hadoop3.tgz: Cannot open: No such file or directory
tar: Error is not recoverable: exiting now


In [2]:
# Step 1: Install Java, Spark, and findspark in Google Colab

!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar -xzf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark


In [3]:
import os
import findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ProductSalesAnalysis") \
    .getOrCreate()


In [4]:
# Write the CSV content to a file
csv_content = """OrderID,Product,Category,Quantity,UnitPrice,Region
1001,Mobile,Electronics,2,15000,North
1002,Laptop,Electronics,1,55000,South
1003,T-Shirt,Apparel,3,500,East
1004,Jeans,Apparel,2,1200,North
1005,TV,Electronics,1,40000,West
1006,Shoes,Footwear,4,2000,South
1007,Watch,Accessories,2,3000,East
1008,Headphones,Electronics,3,2500,North
"""

with open("sales.csv", "w") as f:
    f.write(csv_content)


In [5]:
# Read the CSV file into DataFrame
df = spark.read.csv("sales.csv", header=True, inferSchema=True)

# Print schema
df.printSchema()

# Show top 5 rows
df.show(5)


root
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Region: string (nullable = true)

+-------+-------+-----------+--------+---------+------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|
+-------+-------+-----------+--------+---------+------+
|   1001| Mobile|Electronics|       2|    15000| North|
|   1002| Laptop|Electronics|       1|    55000| South|
|   1003|T-Shirt|    Apparel|       3|      500|  East|
|   1004|  Jeans|    Apparel|       2|     1200| North|
|   1005|     TV|Electronics|       1|    40000|  West|
+-------+-------+-----------+--------+---------+------+
only showing top 5 rows



In [14]:
from pyspark.sql.functions import col, sum

# Step 1: Add TotalPrice column
df = df.withColumn("TotalPrice", col("Quantity") * col("UnitPrice"))

# Step 2: Calculate Total Revenue
total_revenue_df = df.agg(sum("TotalPrice").alias("Total_Revenue"))
total_revenue_df.show()


+-------------+
|Total_Revenue|
+-------------+
|       150400|
+-------------+



In [13]:
total_revenue_df.coalesce(1).write.mode("overwrite").option("header", True).csv("total_revenue")


In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, desc

# Start Spark session
spark = SparkSession.builder.appName("Sales Analysis").getOrCreate()

# Load the CSV file
df = spark.read.csv("sales.csv", header=True, inferSchema=True)

# Show schema and data
df.printSchema()
df.show(5)


root
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Region: string (nullable = true)

+-------+-------+-----------+--------+---------+------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|
+-------+-------+-----------+--------+---------+------+
|   1001| Mobile|Electronics|       2|    15000| North|
|   1002| Laptop|Electronics|       1|    55000| South|
|   1003|T-Shirt|    Apparel|       3|      500|  East|
|   1004|  Jeans|    Apparel|       2|     1200| North|
|   1005|     TV|Electronics|       1|    40000|  West|
+-------+-------+-----------+--------+---------+------+
only showing top 5 rows



In [16]:
df = df.withColumn("TotalPrice", col("Quantity") * col("UnitPrice"))
df.select("OrderID", "Product", "Quantity", "UnitPrice", "TotalPrice").show()


+-------+----------+--------+---------+----------+
|OrderID|   Product|Quantity|UnitPrice|TotalPrice|
+-------+----------+--------+---------+----------+
|   1001|    Mobile|       2|    15000|     30000|
|   1002|    Laptop|       1|    55000|     55000|
|   1003|   T-Shirt|       3|      500|      1500|
|   1004|     Jeans|       2|     1200|      2400|
|   1005|        TV|       1|    40000|     40000|
|   1006|     Shoes|       4|     2000|      8000|
|   1007|     Watch|       2|     3000|      6000|
|   1008|Headphones|       3|     2500|      7500|
+-------+----------+--------+---------+----------+



In [17]:
df.agg(sum("TotalPrice").alias("Total_Revenue")).show()


+-------------+
|Total_Revenue|
+-------------+
|       150400|
+-------------+



In [18]:
df.groupBy("Category") \
  .agg(sum("TotalPrice").alias("Category_Revenue")) \
  .orderBy(desc("Category_Revenue")) \
  .show()


+-----------+----------------+
|   Category|Category_Revenue|
+-----------+----------------+
|Electronics|          132500|
|   Footwear|            8000|
|Accessories|            6000|
|    Apparel|            3900|
+-----------+----------------+



In [19]:
from pyspark.sql.functions import count

df.groupBy("Region") \
  .agg(count("OrderID").alias("Num_Orders")) \
  .orderBy(desc("Num_Orders")) \
  .show(1)


+------+----------+
|Region|Num_Orders|
+------+----------+
| North|         3|
+------+----------+
only showing top 1 row



In [20]:
df.groupBy("Category") \
  .agg(avg("UnitPrice").alias("Avg_UnitPrice")) \
  .orderBy("Category") \
  .show()


+-----------+-------------+
|   Category|Avg_UnitPrice|
+-----------+-------------+
|Accessories|       3000.0|
|    Apparel|        850.0|
|Electronics|      28125.0|
|   Footwear|       2000.0|
+-----------+-------------+



In [21]:
df.filter(col("TotalPrice") > 30000).select("OrderID", "Product", "TotalPrice").show()


+-------+-------+----------+
|OrderID|Product|TotalPrice|
+-------+-------+----------+
|   1002| Laptop|     55000|
|   1005|     TV|     40000|
+-------+-------+----------+



In [24]:
from pyspark.sql.functions import sum, desc

category_revenue_df = df.groupBy("Category") \
    .agg(sum("TotalPrice").alias("Category_Revenue")) \
    .orderBy(desc("Category_Revenue"))

# View the result
category_revenue_df.show()

# Save the result
category_revenue_df.coalesce(1).write.mode("overwrite").option("header", True).csv("output/category_revenue")


+-----------+----------------+
|   Category|Category_Revenue|
+-----------+----------------+
|Electronics|          132500|
|   Footwear|            8000|
|Accessories|            6000|
|    Apparel|            3900|
+-----------+----------------+



In [25]:
category_revenue_df.coalesce(1).write.mode("overwrite").option("header", True).csv("output/category_revenue")


In [26]:
import os
from google.colab import files

# Locate the actual CSV file inside the folder
csv_file = [f for f in os.listdir("output/category_revenue") if f.endswith(".csv")][0]

# Download it
files.download(f"output/category_revenue/{csv_file}")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, desc, avg

# Step 1: Create Spark session
spark = SparkSession.builder.appName("SalesAnalysis").getOrCreate()

# Step 2: Load CSV into DataFrame
df = spark.read.csv("sales.csv", header=True, inferSchema=True)

# Step 3: Add TotalPrice column
df = df.withColumn("TotalPrice", col("Quantity") * col("UnitPrice"))

# 1️ Total revenue generated across all regions
total_revenue = df.agg(sum("TotalPrice").alias("Total_Revenue"))
total_revenue.show()

# 2️ Category-wise revenue sorted in descending order
category_revenue = df.groupBy("Category") \
                     .agg(sum("TotalPrice").alias("Category_Revenue")) \
                     .orderBy(desc("Category_Revenue"))
category_revenue.show()

# 3️ Region with the highest number of orders
from pyspark.sql.functions import count
region_orders = df.groupBy("Region").agg(count("OrderID").alias("Order_Count")) \
                  .orderBy(desc("Order_Count"))
region_orders.show(1)


# 4️ Average Unit Price per Category
avg_price = df.groupBy("Category").agg(avg("UnitPrice").alias("Avg_UnitPrice"))
avg_price.show()



# 5️ All orders where TotalPrice is more than 30,000
high_value_orders = df.filter(col("TotalPrice") > 30000)
high_value_orders.show()



+-------------+
|Total_Revenue|
+-------------+
|       150400|
+-------------+

+-----------+----------------+
|   Category|Category_Revenue|
+-----------+----------------+
|Electronics|          132500|
|   Footwear|            8000|
|Accessories|            6000|
|    Apparel|            3900|
+-----------+----------------+

+------+-----------+
|Region|Order_Count|
+------+-----------+
| North|          3|
+------+-----------+
only showing top 1 row

+-----------+-------------+
|   Category|Avg_UnitPrice|
+-----------+-------------+
|    Apparel|        850.0|
|Electronics|      28125.0|
|   Footwear|       2000.0|
|Accessories|       3000.0|
+-----------+-------------+

+-------+-------+-----------+--------+---------+------+----------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+-------+-----------+--------+---------+------+----------+
|   1002| Laptop|Electronics|       1|    55000| South|     55000|
|   1005|     TV|Electronics|       1|    40000|

In [28]:
category_revenue.coalesce(1).write.mode("overwrite").option("header", True).csv("output/category_revenue")


In [29]:
import os
from google.colab import files

csv_file = [f for f in os.listdir("output/category_revenue") if f.endswith(".csv")][0]
files.download(f"output/category_revenue/{csv_file}")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
from pyspark.sql.functions import when, count

# 1️ Create new column: HighValueOrder ("Yes" if TotalPrice > 20000, else "No")
df = df.withColumn("HighValueOrder", when(col("TotalPrice") > 20000, "Yes").otherwise("No"))
df.select("OrderID", "Product", "TotalPrice", "HighValueOrder").show()

# 2️ Filter and display all high-value orders in the North region
north_high_value = df.filter((col("HighValueOrder") == "Yes") & (col("Region") == "North"))
north_high_value.show()

# 3️ Count how many high-value orders exist per region
high_value_count = df.filter(col("HighValueOrder") == "Yes") \
                     .groupBy("Region") \
                     .agg(count("*").alias("HighValueOrder_Count"))
high_value_count.show()


+-------+----------+----------+--------------+
|OrderID|   Product|TotalPrice|HighValueOrder|
+-------+----------+----------+--------------+
|   1001|    Mobile|     30000|           Yes|
|   1002|    Laptop|     55000|           Yes|
|   1003|   T-Shirt|      1500|            No|
|   1004|     Jeans|      2400|            No|
|   1005|        TV|     40000|           Yes|
|   1006|     Shoes|      8000|            No|
|   1007|     Watch|      6000|            No|
|   1008|Headphones|      7500|            No|
+-------+----------+----------+--------------+

+-------+-------+-----------+--------+---------+------+----------+--------------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+-------+-----------+--------+---------+------+----------+--------------+
|   1001| Mobile|Electronics|       2|    15000| North|     30000|           Yes|
+-------+-------+-----------+--------+---------+------+----------+--------------+

+------+----------------

In [31]:
# Save High-Value Orders in North Region to CSV
north_high_value.coalesce(1).write.mode("overwrite").option("header", True).csv("north_high_value_orders")

# Save High-Value Order Count per Region to CSV
high_value_count.coalesce(1).write.mode("overwrite").option("header", True).csv("high_value_count_by_region")


In [32]:
# Zip the output folders
!zip -r north_high_value_orders.zip north_high_value_orders
!zip -r high_value_count_by_region.zip high_value_count_by_region

# Download
from google.colab import files
files.download("north_high_value_orders.zip")
files.download("high_value_count_by_region.zip")


  adding: north_high_value_orders/ (stored 0%)
  adding: north_high_value_orders/._SUCCESS.crc (stored 0%)
  adding: north_high_value_orders/.part-00000-ee62b1ed-2908-46d8-b17d-add2ced9f1e1-c000.csv.crc (stored 0%)
  adding: north_high_value_orders/part-00000-ee62b1ed-2908-46d8-b17d-add2ced9f1e1-c000.csv (deflated 10%)
  adding: north_high_value_orders/_SUCCESS (stored 0%)
  adding: high_value_count_by_region/ (stored 0%)
  adding: high_value_count_by_region/._SUCCESS.crc (stored 0%)
  adding: high_value_count_by_region/.part-00000-2767a342-4e03-4512-9d6d-fc65870005cb-c000.csv.crc (stored 0%)
  adding: high_value_count_by_region/part-00000-2767a342-4e03-4512-9d6d-fc65870005cb-c000.csv (deflated 6%)
  adding: high_value_count_by_region/_SUCCESS (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# Save the transformed DataFrame with 'HighValueOrder' column
df.coalesce(1).write.mode("overwrite").option("header", True).csv("high_value_orders")

# Zip the folder to download from Colab
!zip -r high_value_orders.zip high_value_orders

# Download the zipped CSV
from google.colab import files
files.download("high_value_orders.zip")


  adding: high_value_orders/ (stored 0%)
  adding: high_value_orders/._SUCCESS.crc (stored 0%)
  adding: high_value_orders/part-00000-8492e30a-3511-4900-9e54-9a1a78a8942b-c000.csv (deflated 43%)
  adding: high_value_orders/_SUCCESS (stored 0%)
  adding: high_value_orders/.part-00000-8492e30a-3511-4900-9e54-9a1a78a8942b-c000.csv.crc (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>