1. Data Ingestion & Exploration

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Load both CSV files with schema inference.

In [7]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DataIngestionExploration").getOrCreate()

customers_path = "/content/drive/MyDrive/Customers.csv"
orders_path = "/content/drive/MyDrive/Orders.csv"

customers_df = spark.read.option("header", True).option("inferSchema", True).csv(customers_path)
orders_df = spark.read.option("header", True).option("inferSchema", True).csv(orders_path)


List all columns and data types.

In [8]:
print("Customers Schema:")
customers_df.printSchema()

print("Orders Schema:")
orders_df.printSchema()


Customers Schema:
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

Orders Schema:
root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)



Count the total number of customers and orders.

In [9]:
print("Total Customers:", customers_df.count())
print("Total Orders:", orders_df.count())

Total Customers: 5
Total Orders: 7


Show distinct cities.

In [10]:
customers_df.select("City").distinct().show()

+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



2. DataFrame Transformations

Add a column TotalAmount = Price * Quantity .

In [11]:
from pyspark.sql.functions import col

orders_df = orders_df.withColumn("TotalAmount", col("Price") * col("Quantity"))
orders_df.select("OrderID", "Product", "Quantity", "Price", "TotalAmount").show()

+-------+---------+--------+-------+-----------+
|OrderID|  Product|Quantity|  Price|TotalAmount|
+-------+---------+--------+-------+-----------+
|      1|   Laptop|       2|50000.0|   100000.0|
|      2|    Mouse|       1| 1200.0|     1200.0|
|      3|   Tablet|       1|20000.0|    20000.0|
|      4|Bookshelf|       1| 3500.0|     3500.0|
|      5|    Mixer|       1| 5000.0|     5000.0|
|      6| Notebook|       5|  500.0|     2500.0|
|      7|    Phone|       1|30000.0|    30000.0|
+-------+---------+--------+-------+-----------+



Create a new column OrderYear from OrderDate .

In [12]:
from pyspark.sql.functions import year, to_date

orders_df = orders_df.withColumn("OrderDate", to_date(col("OrderDate")))

orders_df = orders_df.withColumn("OrderYear", year(col("OrderDate")))
orders_df.select("OrderID", "OrderDate", "OrderYear").show()

+-------+----------+---------+
|OrderID| OrderDate|OrderYear|
+-------+----------+---------+
|      1|2024-01-10|     2024|
|      2|2024-01-15|     2024|
|      3|2024-02-01|     2024|
|      4|2024-02-10|     2024|
|      5|2024-02-15|     2024|
|      6|2024-03-01|     2024|
|      7|2024-03-02|     2024|
+-------+----------+---------+



Filter orders with TotalAmount > 10,000 .

In [13]:
high_value_orders = orders_df.filter(col("TotalAmount") > 10000)
high_value_orders.select("OrderID", "Product", "TotalAmount").show()

+-------+-------+-----------+
|OrderID|Product|TotalAmount|
+-------+-------+-----------+
|      1| Laptop|   100000.0|
|      3| Tablet|    20000.0|
|      7|  Phone|    30000.0|
+-------+-------+-----------+



Drop the Email column from customers.

In [14]:
customers_df = customers_df.drop("Email")
customers_df.show()

+----------+-----+---------+----------+
|CustomerID| Name|     City|SignupDate|
+----------+-----+---------+----------+
|       101|  Ali|   Mumbai|2022-05-10|
|       102| Neha|    Delhi|2023-01-15|
|       103| Ravi|Bangalore|2021-11-01|
|       104|Sneha|Hyderabad|2020-07-22|
|       105| Amit|  Chennai|2023-03-10|
+----------+-----+---------+----------+



3. Handling Nulls & Conditionals

Simulate a null in City and fill it with “Unknown”.

In [15]:
from pyspark.sql.functions import when

customers_df = customers_df.withColumn(
    "City", when(col("CustomerID") == 105, None).otherwise(col("City"))
)

customers_df = customers_df.fillna({"City": "Unknown"})
customers_df.select("CustomerID", "Name", "City").show()

+----------+-----+---------+
|CustomerID| Name|     City|
+----------+-----+---------+
|       101|  Ali|   Mumbai|
|       102| Neha|    Delhi|
|       103| Ravi|Bangalore|
|       104|Sneha|Hyderabad|
|       105| Amit|  Unknown|
+----------+-----+---------+



Label customers as “Loyal” if SignupDate is before 2022, else “New”.

In [16]:
from pyspark.sql.functions import to_date

customers_df = customers_df.withColumn("SignupDate", to_date("SignupDate"))

customers_df = customers_df.withColumn(
    "CustomerType",
    when(col("SignupDate") < "2022-01-01", "Loyal").otherwise("New")
)

customers_df.select("CustomerID", "SignupDate", "CustomerType").show()

+----------+----------+------------+
|CustomerID|SignupDate|CustomerType|
+----------+----------+------------+
|       101|2022-05-10|         New|
|       102|2023-01-15|         New|
|       103|2021-11-01|       Loyal|
|       104|2020-07-22|       Loyal|
|       105|2023-03-10|         New|
+----------+----------+------------+



Create OrderType column: "Low" if <5,000, "High" if ≥5,000.

In [17]:
orders_df = orders_df.withColumn(
    "OrderType",
    when(col("TotalAmount") < 5000, "Low").otherwise("High")
)

orders_df.select("OrderID", "Product", "TotalAmount", "OrderType").show()

+-------+---------+-----------+---------+
|OrderID|  Product|TotalAmount|OrderType|
+-------+---------+-----------+---------+
|      1|   Laptop|   100000.0|     High|
|      2|    Mouse|     1200.0|      Low|
|      3|   Tablet|    20000.0|     High|
|      4|Bookshelf|     3500.0|      Low|
|      5|    Mixer|     5000.0|     High|
|      6| Notebook|     2500.0|      Low|
|      7|    Phone|    30000.0|     High|
+-------+---------+-----------+---------+



Joins & Aggregations

Join customers and orders on CustomerID .

In [18]:
joined_df = orders_df.join(customers_df, on="CustomerID", how="inner")
joined_df.select("OrderID", "Name", "City", "Product", "TotalAmount").show()

+-------+-----+---------+---------+-----------+
|OrderID| Name|     City|  Product|TotalAmount|
+-------+-----+---------+---------+-----------+
|      1|  Ali|   Mumbai|   Laptop|   100000.0|
|      2|  Ali|   Mumbai|    Mouse|     1200.0|
|      3| Neha|    Delhi|   Tablet|    20000.0|
|      4| Ravi|Bangalore|Bookshelf|     3500.0|
|      5|Sneha|Hyderabad|    Mixer|     5000.0|
|      6| Amit|  Unknown| Notebook|     2500.0|
|      7| Neha|    Delhi|    Phone|    30000.0|
+-------+-----+---------+---------+-----------+



Get total orders and revenue per city.

In [19]:
from pyspark.sql.functions import count, sum

city_agg_df = joined_df.groupBy("City") \
    .agg(
        count("OrderID").alias("TotalOrders"),
        sum("TotalAmount").alias("TotalRevenue")
    )
city_agg_df.show()

+---------+-----------+------------+
|     City|TotalOrders|TotalRevenue|
+---------+-----------+------------+
|Bangalore|          1|      3500.0|
|   Mumbai|          2|    101200.0|
|  Unknown|          1|      2500.0|
|    Delhi|          2|     50000.0|
|Hyderabad|          1|      5000.0|
+---------+-----------+------------+



Show top 3 customers by total spend.

In [20]:
customer_spend_df = joined_df.groupBy("CustomerID", "Name") \
    .agg(sum("TotalAmount").alias("TotalSpend")) \
    .orderBy(col("TotalSpend").desc()) \
    .limit(3)

customer_spend_df.show()

+----------+-----+----------+
|CustomerID| Name|TotalSpend|
+----------+-----+----------+
|       101|  Ali|  101200.0|
|       102| Neha|   50000.0|
|       104|Sneha|    5000.0|
+----------+-----+----------+



Count how many products each category has sold.

In [21]:
category_sales_df = orders_df.groupBy("Category") \
    .agg(sum("Quantity").alias("TotalQuantitySold"))

category_sales_df.show()

+-----------+-----------------+
|   Category|TotalQuantitySold|
+-----------+-----------------+
| Stationery|                5|
|Electronics|                5|
|  Furniture|                1|
| Appliances|                1|
+-----------+-----------------+



5. Spark SQL Tasks

Create database sales and switch to it.

In [22]:
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.sql("USE sales")

DataFrame[]

Save both datasets as tables in the sales database.

In [23]:
customers_df.createOrReplaceTempView("customers_view")
orders_df.createOrReplaceTempView("orders_view")

customers_df.write.mode("overwrite").saveAsTable("sales.customers")
orders_df.write.mode("overwrite").saveAsTable("sales.orders")

SQL: List all orders by customers from “Delhi”

In [24]:
spark.sql("""
    SELECT o.*
    FROM sales.orders o
    JOIN sales.customers c ON o.CustomerID = c.CustomerID
    WHERE c.City = 'Delhi'
""").show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|     High|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+



SQL: Find average order value in each category

In [25]:
spark.sql("""
    SELECT Category, AVG(Price * Quantity) AS AvgOrderValue
    FROM sales.orders
    GROUP BY Category
""").show()

+-----------+-------------+
|   Category|AvgOrderValue|
+-----------+-------------+
| Stationery|       2500.0|
|Electronics|      37800.0|
|  Furniture|       3500.0|
| Appliances|       5000.0|
+-----------+-------------+



 Create a view monthly_orders with month-wise total amount

In [26]:
spark.sql("DROP VIEW IF EXISTS monthly_orders")

spark.sql("""
    CREATE OR REPLACE TEMP VIEW monthly_orders AS
    SELECT
        DATE_FORMAT(OrderDate, 'yyyy-MM') AS OrderMonth,
        SUM(Price * Quantity) AS TotalMonthlyAmount
    FROM sales.orders
    GROUP BY DATE_FORMAT(OrderDate, 'yyyy-MM')
""")

spark.sql("SELECT * FROM monthly_orders ORDER BY OrderMonth").show()

+----------+------------------+
|OrderMonth|TotalMonthlyAmount|
+----------+------------------+
|   2024-01|          101200.0|
|   2024-02|           28500.0|
|   2024-03|           32500.0|
+----------+------------------+



6. String & Date Functions

Mask emails using regex

In [27]:
from pyspark.sql.functions import regexp_replace

from pyspark.sql.functions import expr

customers_with_email = customers_df.withColumn("Email", expr("""
  CASE CustomerID
    WHEN 101 THEN 'ali@gmail.com'
    WHEN 102 THEN 'neha@yahoo.com'
    WHEN 103 THEN 'ravi@hotmail.com'
    WHEN 104 THEN 'sneha@outlook.com'
    WHEN 105 THEN 'amit@gmail.com'
  END
"""))
masked_email_df = customers_with_email.withColumn(
    "MaskedEmail",
    regexp_replace("Email", r"(^.).*?(@.*$)", r"\1***\2")
)

masked_email_df.select("CustomerID", "Email", "MaskedEmail").show()

+----------+-----------------+-----------+
|CustomerID|            Email|MaskedEmail|
+----------+-----------------+-----------+
|       101|    ali@gmail.com|      1***2|
|       102|   neha@yahoo.com|      1***2|
|       103| ravi@hotmail.com|      1***2|
|       104|sneha@outlook.com|      1***2|
|       105|   amit@gmail.com|      1***2|
+----------+-----------------+-----------+



Concatenate Name and City as “Name from City”.

In [28]:
from pyspark.sql.functions import concat_ws

customers_labeled_df = customers_df.withColumn(
    "NameCity",
    concat_ws(" from ", col("Name"), col("City"))
)
customers_labeled_df.select("CustomerID", "NameCity").show()

+----------+--------------------+
|CustomerID|            NameCity|
+----------+--------------------+
|       101|     Ali from Mumbai|
|       102|     Neha from Delhi|
|       103| Ravi from Bangalore|
|       104|Sneha from Hyderabad|
|       105|   Amit from Unknown|
+----------+--------------------+



Use datediff() to calculate customer age in days.

In [29]:
from pyspark.sql.functions import current_date, datediff

customers_df = customers_df.withColumn("SignupDate", to_date("SignupDate"))
customers_df = customers_df.withColumn(
    "CustomerAgeInDays",
    datediff(current_date(), col("SignupDate"))
)
customers_df.select("CustomerID", "SignupDate", "CustomerAgeInDays").show()

+----------+----------+-----------------+
|CustomerID|SignupDate|CustomerAgeInDays|
+----------+----------+-----------------+
|       101|2022-05-10|             1126|
|       102|2023-01-15|              876|
|       103|2021-11-01|             1316|
|       104|2020-07-22|             1783|
|       105|2023-03-10|              822|
+----------+----------+-----------------+



Extract month name from OrderDate .

In [30]:
from pyspark.sql.functions import date_format

orders_df = orders_df.withColumn("OrderMonthName", date_format("OrderDate", "MMMM"))
orders_df.select("OrderID", "OrderDate", "OrderMonthName").show()

+-------+----------+--------------+
|OrderID| OrderDate|OrderMonthName|
+-------+----------+--------------+
|      1|2024-01-10|       January|
|      2|2024-01-15|       January|
|      3|2024-02-01|      February|
|      4|2024-02-10|      February|
|      5|2024-02-15|      February|
|      6|2024-03-01|         March|
|      7|2024-03-02|         March|
+-------+----------+--------------+



7. UDFs and Complex Logic

Write a UDF to tag customers.

In [31]:
from pyspark.sql.functions import sum

customer_spend = orders_df.groupBy("CustomerID") \
    .agg(sum("TotalAmount").alias("TotalSpend"))

In [32]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def tag_customer(spend):
    if spend > 50000:
        return "Gold"
    elif spend >= 10000:
        return "Silver"
    else:
        return "Bronze"

tag_udf = udf(tag_customer, StringType())

In [33]:
customer_spend = customer_spend.withColumn("CustomerTag", tag_udf(col("TotalSpend")))
customer_spend.show()

+----------+----------+-----------+
|CustomerID|TotalSpend|CustomerTag|
+----------+----------+-----------+
|       101|  101200.0|       Gold|
|       103|    3500.0|     Bronze|
|       102|   50000.0|     Silver|
|       105|    2500.0|     Bronze|
|       104|    5000.0|     Bronze|
+----------+----------+-----------+



Write a UDF to shorten product names

In [34]:
def shorten_product(name):
    if name and len(name) > 3:
        return name[:3] + "..."
    return name

shorten_udf = udf(shorten_product, StringType())

orders_short_df = orders_df.withColumn("ShortProduct", shorten_udf(col("Product")))
orders_short_df.select("Product", "ShortProduct").show()

+---------+------------+
|  Product|ShortProduct|
+---------+------------+
|   Laptop|      Lap...|
|    Mouse|      Mou...|
|   Tablet|      Tab...|
|Bookshelf|      Boo...|
|    Mixer|      Mix...|
| Notebook|      Not...|
|    Phone|      Pho...|
+---------+------------+



8. Parquet & Views

Save the joined result as a Parquet file.

In [35]:
parquet_path = "/tmp/joined_customers_orders.parquet"

joined_df.write.mode("overwrite").parquet(parquet_path)

Read it back and verify schema.

In [36]:
parquet_df = spark.read.parquet(parquet_path)
parquet_df.printSchema()
parquet_df.show(5)

root
 |-- CustomerID: integer (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- CustomerType: string (nullable = true)

+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+---------+----------+------------+
|CustomerID|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType| Name|     City|SignupDate|CustomerType|
+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+---------+----------+------------+
|   

Create and query a global temp view.

In [37]:
parquet_df.createGlobalTempView("global_joined_view")

spark.sql("SELECT * FROM global_temp.global_joined_view LIMIT 5").show()

+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+---------+----------+------------+
|CustomerID|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType| Name|     City|SignupDate|CustomerType|
+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+---------+----------+------------+
|       101|      1|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|     High|  Ali|   Mumbai|2022-05-10|         New|
|       101|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|      Low|  Ali|   Mumbai|2022-05-10|         New|
|       102|      3|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High| Neha|    Delhi|2023-01-15|         New|
|       103|      4|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|      Low| Ravi|Bangalore|2021-11-01|       Loyal|

Compare performance between CSV read and Parquet read.

In [38]:
import time

start_csv = time.time()
csv_df = spark.read.option("header", True).option("inferSchema", True).csv(customers_path)
csv_df.count()
end_csv = time.time()
print(f"CSV read time: {end_csv - start_csv:.3f} seconds")

start_parquet = time.time()
parquet_df = spark.read.parquet(parquet_path)
parquet_df.count()
end_parquet = time.time()
print(f"Parquet read time: {end_parquet - start_parquet:.3f} seconds")

CSV read time: 1.504 seconds
Parquet read time: 0.593 seconds
