In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName("PysparkAssessment1")\
.getOrCreate()

spark

In [36]:
# Data Ingestion & Exploration
# Load both CSV files with schema inference.
from google.colab import drive
drive.mount('/content/drive')

customer_df = spark.read.csv('/content/drive/MyDrive/customers.csv',header= True,inferSchema=True)
orders_df = spark.read.csv('/content/drive/MyDrive/orders.csv',header= True,inferSchema=True)
customer_df.show(5)
orders_df.show(5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+

+-------+----------+---------+-----------+--------+-------+----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|
+-------+----------+---------+-----------+--------+-------+----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-

In [4]:
# List all columns and data types.

print("Customers Schema:")
customer_df.printSchema()

print("Orders Schema:")
orders_df.printSchema()

Customers Schema:
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

Orders Schema:
root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)



In [5]:
# Count the total number of customers and orders.

total_customers = customer_df.count()
total_orders = orders_df.count()

print("Total number of customers:",total_customers)
print("Total number of orders:",total_orders)

Total number of customers: 5
Total number of orders: 7


In [6]:
# Show distinct cities.

dis_city = customer_df.select("City").distinct()
dis_city.show()

+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [7]:
# DataFrame Transformations
# Add a column TotalAmount = Price * Quantity

orders_df = orders_df.withColumn("TotalAmount",orders_df["Price"] * orders_df["Quantity"])
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+



In [9]:
# Create a new column OrderYear from OrderDate .

from pyspark.sql.functions import year

orders_df = orders_df.withColumn("OrderYear",year(orders_df['OrderDate']))
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+---------+-----------+--------+-------+----------+

In [10]:
# Filter orders with TotalAmount > 10,000 .

gratervalue = orders_df.filter(orders_df['TotalAmount'] > 10000)
gratervalue.show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+



In [11]:
# Drop the Email column from customers .

customer_df = customer_df.drop('Email')
customer_df.show()

+----------+-----+---------+----------+
|CustomerID| Name|     City|SignupDate|
+----------+-----+---------+----------+
|       101|  Ali|   Mumbai|2022-05-10|
|       102| Neha|    Delhi|2023-01-15|
|       103| Ravi|Bangalore|2021-11-01|
|       104|Sneha|Hyderabad|2020-07-22|
|       105| Amit|  Chennai|2023-03-10|
+----------+-----+---------+----------+



In [12]:
# Handling Nulls & Conditionals
# Simulate a null in City and fill it with “Unknown”.

from pyspark.sql.functions import when

customer_df = customer_df.withColumn("City",when(customer_df['CustomerID'] == 103,None).otherwise(customer_df["City"]))

customer_df = customer_df.fillna({"City":"Unknown"})
customer_df.show()

+----------+-----+---------+----------+
|CustomerID| Name|     City|SignupDate|
+----------+-----+---------+----------+
|       101|  Ali|   Mumbai|2022-05-10|
|       102| Neha|    Delhi|2023-01-15|
|       103| Ravi|  Unknown|2021-11-01|
|       104|Sneha|Hyderabad|2020-07-22|
|       105| Amit|  Chennai|2023-03-10|
+----------+-----+---------+----------+



In [13]:
# Label customers as “Loyal” if SignupDate is before 2022, else “New”.

from pyspark.sql.functions import when

customer_df = customer_df.withColumn("CustomerLable",when(customer_df["SignupDate"] < "2022-01-01","Loyal").otherwise("New"))
customer_df.show()

+----------+-----+---------+----------+-------------+
|CustomerID| Name|     City|SignupDate|CustomerLable|
+----------+-----+---------+----------+-------------+
|       101|  Ali|   Mumbai|2022-05-10|          New|
|       102| Neha|    Delhi|2023-01-15|          New|
|       103| Ravi|  Unknown|2021-11-01|        Loyal|
|       104|Sneha|Hyderabad|2020-07-22|        Loyal|
|       105| Amit|  Chennai|2023-03-10|          New|
+----------+-----+---------+----------+-------------+



In [14]:
# Create OrderType column: "Low" if < 5,000, "High" if >5,000.

from pyspark.sql.functions import when

orders_df = orders_df.withColumn("OrderType",when(orders_df['Price'] < 5000,"Low").otherwise("High"))
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|     High|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|      Low|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|      Low|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|     High|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|      Low|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|   

In [15]:
# Joins & Aggregations
# Join customers and orders on CustomerID .

joined_df = orders_df.join(customer_df,on = "CustomerID",how = "inner")
joined_df.show()

+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+---------+----------+-------------+
|CustomerID|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType| Name|     City|SignupDate|CustomerLable|
+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+---------+----------+-------------+
|       101|      1|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|     High|  Ali|   Mumbai|2022-05-10|          New|
|       101|      2|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|      Low|  Ali|   Mumbai|2022-05-10|          New|
|       102|      3|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High| Neha|    Delhi|2023-01-15|          New|
|       103|      4|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|      Low| Ravi|  Unknown|2021-11-01|       

In [16]:
# Get total orders and revenue per city.
from pyspark.sql.functions import count, sum

city_total = joined_df.groupBy("City").agg(count("OrderID"),sum("TotalAmount"))
city_total.show()

+---------+--------------+----------------+
|     City|count(OrderID)|sum(TotalAmount)|
+---------+--------------+----------------+
|  Chennai|             1|          2500.0|
|   Mumbai|             2|        101200.0|
|  Unknown|             1|          3500.0|
|    Delhi|             2|         50000.0|
|Hyderabad|             1|          5000.0|
+---------+--------------+----------------+



In [19]:
# Show top 3 customers by total spend.

from pyspark.sql.functions import col

top_3_customer = joined_df.groupBy("CustomerID","Name").agg(sum("TotalAmount").alias("TotalSpent")).orderBy(col("TotalSpent").desc()).limit(3)
top_3_customer.show()

+----------+-----+----------+
|CustomerID| Name|TotalSpent|
+----------+-----+----------+
|       101|  Ali|  101200.0|
|       102| Neha|   50000.0|
|       104|Sneha|    5000.0|
+----------+-----+----------+



In [22]:
# Count how many products each category has sold.

from pyspark.sql.functions import sum

category_sold = joined_df.groupBy("Category").agg(sum("Quantity").alias("TotalProductSold"))
category_sold.show()

+-----------+----------------+
|   Category|TotalProductSold|
+-----------+----------------+
| Stationery|               5|
|Electronics|               5|
|  Furniture|               1|
| Appliances|               1|
+-----------+----------------+



In [23]:
# Spark SQL Tasks
# Create database sales and switch to it.

spark.sql("create database sales")

DataFrame[]

In [25]:
# Save both datasets as tables in the sales database.

customer_df.write.mode("overwrite").saveAsTable("sales.customers")
orders_df.write.mode("overwrite").saveAsTable("sales.orders")

In [34]:
# Write SQL to:
# List all orders by customers from “Delhi”.
spark.sql("""select o.*
from sales.orders o
join sales.customers c
on o.CustomerID = c.CustomerID
where c.city = 'Delhi'""").show()
# Find average order value in each category.
spark.sql("""select category,avg(totalamount) from sales.orders group by category""").show()
# Create a view monthly_orders with month-wise total amount.
spark.sql("""create or replace temp view monthly_orders as
select date_format(OrderDate,'yyyy-mm') as month,
sum(price*quantity) as TotalAmount from sales.orders
group by date_format(OrderDate,'yyyy-mm')""")

spark.sql("SELECT * FROM monthly_orders").show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|     High|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|     High|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+---------+

+-----------+----------------+
|   category|avg(totalamount)|
+-----------+----------------+
| Stationery|          2500.0|
|Electronics|         37800.0|
|  Furniture|          3500.0|
| Appliances|          5000.0|
+-----------+----------------+

+-------+-----------+
|  month|TotalAmount|
+-------+-----------+
|2024-00|   162200.0|
+-------+-----------+



In [37]:
from os import truncate
# String & Date Functions
# Mask emails using regex (e.g., a***@gmail.com ).

from pyspark.sql.functions import regexp_replace

masked_email = customer_df.withColumn("MaskedEmail",regexp_replace("Email",r"(^.)([^@]+)(@.*)", r"\1***\3"))
masked_email.show(truncate = False)

+----------+-----+-----------------+---------+----------+-----------+
|CustomerID|Name |Email            |City     |SignupDate|MaskedEmail|
+----------+-----+-----------------+---------+----------+-----------+
|101       |Ali  |ali@gmail.com    |Mumbai   |2022-05-10|1***3      |
|102       |Neha |neha@yahoo.com   |Delhi    |2023-01-15|1***3      |
|103       |Ravi |ravi@hotmail.com |Bangalore|2021-11-01|1***3      |
|104       |Sneha|sneha@outlook.com|Hyderabad|2020-07-22|1***3      |
|105       |Amit |amit@gmail.com   |Chennai  |2023-03-10|1***3      |
+----------+-----+-----------------+---------+----------+-----------+



In [39]:
# Concatenate Name and City as “Name from City”.

from pyspark.sql.functions import concat_ws

name_city = customer_df.withColumn("NameWithColumn",concat_ws("from",customer_df["Name"],customer_df["City"]))
name_city.show()

+----------+-----+-----------------+---------+----------+------------------+
|CustomerID| Name|            Email|     City|SignupDate|    NameWithColumn|
+----------+-----+-----------------+---------+----------+------------------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|     AlifromMumbai|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|     NehafromDelhi|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01| RavifromBangalore|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|SnehafromHyderabad|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|   AmitfromChennai|
+----------+-----+-----------------+---------+----------+------------------+



In [42]:
# Use datediff() to calculate customer age in days.

from pyspark.sql.functions import current_date,date_diff

customer_age_diff = customer_df.withColumn("AgeInDays",date_diff(current_date(),customer_df["SignupDate"]))
customer_age_diff.show()

+----------+-----+-----------------+---------+----------+---------+
|CustomerID| Name|            Email|     City|SignupDate|AgeInDays|
+----------+-----+-----------------+---------+----------+---------+
|       101|  Ali|    ali@gmail.com|   Mumbai|2022-05-10|     1126|
|       102| Neha|   neha@yahoo.com|    Delhi|2023-01-15|      876|
|       103| Ravi| ravi@hotmail.com|Bangalore|2021-11-01|     1316|
|       104|Sneha|sneha@outlook.com|Hyderabad|2020-07-22|     1783|
|       105| Amit|   amit@gmail.com|  Chennai|2023-03-10|      822|
+----------+-----+-----------------+---------+----------+---------+



In [43]:
# Extract month name from OrderDate .

from pyspark.sql.functions import date_format

orders_with_month = orders_df.withColumn("OrderMonth",date_format("OrderDate","MMMM"))
orders_with_month.show()

+-------+----------+---------+-----------+--------+-------+----------+----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|OrderMonth|
+-------+----------+---------+-----------+--------+-------+----------+----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   January|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|   January|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|  February|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|  February|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|  February|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     March|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|     March|
+-------+----------+---------+-----------+--------+-------+----------+----------+



In [47]:
# UDFs and Complex Logic
# Write a UDF to tag customers:
# “Gold” if spend >50K, “Silver” if 10K–50K, “Bronze” if <10K.

from pyspark.sql.functions import udf, sum, col
from pyspark.sql.types import StringType

def tag_customer(spend):
  if spend > 50000:
    return "Gold"
  elif spend >= 10000:
    return "Silver"
  else:
    return "Bronze"

tag_udf = udf(tag_customer,StringType())

# Write a UDF to shorten product names (first 3 letters + ...).

customer_spend_df = orders_df.withColumn("Spend", col("Price") * col("Quantity")) \
    .groupBy("CustomerID").agg(sum("Spend").alias("TotalSpend"))

customer_tagged_df = customer_spend_df.withColumn("Tier", tag_udf("TotalSpend"))
customer_tagged_df.show()

+----------+----------+------+
|CustomerID|TotalSpend|  Tier|
+----------+----------+------+
|       101|  101200.0|  Gold|
|       103|    3500.0|Bronze|
|       102|   50000.0|Silver|
|       105|    2500.0|Bronze|
|       104|    5000.0|Bronze|
+----------+----------+------+



In [48]:
# Parquet & Views
# Save the joined result as a Parquet file.
joined_df.write.mode("overwrite").parquet("output/joined_data.parquet")

In [49]:
# Read it back and verify schema.
parquet_df = spark.read.parquet("output/joined_data.parquet")

parquet_df.printSchema()
parquet_df.show(truncate = False)

root
 |-- CustomerID: integer (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- CustomerLable: string (nullable = true)

+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+---------+----------+-------------+
|CustomerID|OrderID|Product  |Category   |Quantity|Price  |OrderDate |TotalAmount|OrderYear|OrderType|Name |City     |SignupDate|CustomerLable|
+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+---------+----------+-------------+


In [50]:
# Create and query a global temp view.
parquet_df.createOrReplaceGlobalTempView("join_order_customer")

spark.sql("""SELECT Name, Product, TotalAmount, City
FROM global_temp.join_order_customer
WHERE TotalAmount > 10000""").show()

+----+-------+-----------+------+
|Name|Product|TotalAmount|  City|
+----+-------+-----------+------+
| Ali| Laptop|   100000.0|Mumbai|
|Neha| Tablet|    20000.0| Delhi|
|Neha|  Phone|    30000.0| Delhi|
+----+-------+-----------+------+



In [53]:
# Compare performance between CSV read and Parquet read.

import time

# Time CSV read
start_csv = time.time()
csv_df = spark.read.option("header", True).option("inferSchema", True).csv("/content/drive/MyDrive/orders.csv")
csv_df.count()
end_csv = time.time()

# Time Parquet read
start_parquet = time.time()
parquet_df = spark.read.parquet("output/joined_data.parquet")
parquet_df.count()
end_parquet = time.time()

print(f"CSV Read Time: {end_csv - start_csv:.4f} seconds")
print(f"Parquet Read Time: {end_parquet - start_parquet:.4f} seconds")

CSV Read Time: 1.7767 seconds
Parquet Read Time: 0.8025 seconds
