In [1]:
# Install PySpark
!pip install pyspark

# Import libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create Spark session
spark = SparkSession.builder.appName("PySparkMasterTaskSet").getOrCreate()




In [2]:
# Set your paths
customer_path = "/content/drive/MyDrive/customers.csv"
order_path = "/content/drive/MyDrive/orders.csv"

# Load CSVs with schema inference
customers_df = spark.read.csv(customer_path, header=True, inferSchema=True)
orders_df = spark.read.csv(order_path, header=True, inferSchema=True)


In [3]:
# Schema & Counts
customers_df.printSchema()
orders_df.printSchema()

print("Total Customers:", customers_df.count())
print("Total Orders:", orders_df.count())

customers_df.select("City").distinct().show()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)

Total Customers: 5
Total Orders: 7
+---------+
|     City|
+---------+
|Bangalore|
|  Chennai|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [4]:
orders_df = orders_df.withColumn("TotalAmount", col("Price") * col("Quantity")) \
                     .withColumn("OrderYear", year("OrderDate"))

orders_df.filter(col("TotalAmount") > 10000).show()

customers_df = customers_df.drop("Email")


+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+



In [7]:
from pyspark.sql.functions import when, col

# Simulate null and fill with 'Unknown'
customers_df = customers_df.withColumn("City", when(col("CustomerID") == 102, None).otherwise(col("City")))
customers_df = customers_df.fillna({"City": "Unknown"})

# Label as Loyal or New
customers_df = customers_df.withColumn("CustomerType", when(col("SignupDate") < "2022-01-01", "Loyal").otherwise("New"))

# Show result
customers_df.select("CustomerID", "Name", "City", "SignupDate", "CustomerType").show()
# OrderType column in orders
orders_df = orders_df.withColumn("OrderType", when(col("TotalAmount") < 5000, "Low").otherwise("High"))

# Show result
orders_df.select("OrderID", "Product", "TotalAmount", "OrderType").show()


+----------+-----+---------+----------+------------+
|CustomerID| Name|     City|SignupDate|CustomerType|
+----------+-----+---------+----------+------------+
|       101|  Ali|   Mumbai|2022-05-10|         New|
|       102| Neha|  Unknown|2023-01-15|         New|
|       103| Ravi|Bangalore|2021-11-01|       Loyal|
|       104|Sneha|Hyderabad|2020-07-22|       Loyal|
|       105| Amit|  Chennai|2023-03-10|         New|
+----------+-----+---------+----------+------------+

+-------+---------+-----------+---------+
|OrderID|  Product|TotalAmount|OrderType|
+-------+---------+-----------+---------+
|      1|   Laptop|   100000.0|     High|
|      2|    Mouse|     1200.0|      Low|
|      3|   Tablet|    20000.0|     High|
|      4|Bookshelf|     3500.0|      Low|
|      5|    Mixer|     5000.0|     High|
|      6| Notebook|     2500.0|      Low|
|      7|    Phone|    30000.0|     High|
+-------+---------+-----------+---------+



In [8]:
joined_df = orders_df.join(customers_df, on="CustomerID", how="inner")

# Orders & revenue per city
joined_df.groupBy("City").agg(count("*").alias("TotalOrders"), sum("TotalAmount").alias("Revenue")).show()

# Top 3 spenders
joined_df.groupBy("Name").agg(sum("TotalAmount").alias("TotalSpend")).orderBy(desc("TotalSpend")).show(3)

# Products per category
orders_df.groupBy("Category").agg(sum("Quantity").alias("TotalProductsSold")).show()


+---------+-----------+--------+
|     City|TotalOrders| Revenue|
+---------+-----------+--------+
|Bangalore|          1|  3500.0|
|  Chennai|          1|  2500.0|
|   Mumbai|          2|101200.0|
|  Unknown|          2| 50000.0|
|Hyderabad|          1|  5000.0|
+---------+-----------+--------+

+-----+----------+
| Name|TotalSpend|
+-----+----------+
|  Ali|  101200.0|
| Neha|   50000.0|
|Sneha|    5000.0|
+-----+----------+
only showing top 3 rows

+-----------+-----------------+
|   Category|TotalProductsSold|
+-----------+-----------------+
| Stationery|                5|
|Electronics|                5|
|  Furniture|                1|
| Appliances|                1|
+-----------+-----------------+



In [9]:
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.sql("USE sales")

customers_df.write.mode("overwrite").saveAsTable("sales.customers")
orders_df.write.mode("overwrite").saveAsTable("sales.orders")

# Orders by customers from Delhi
spark.sql("""
SELECT o.* FROM sales.orders o
JOIN sales.customers c ON o.CustomerID = c.CustomerID
WHERE c.City = 'Delhi'
""").show()

# Average value per category
spark.sql("""
SELECT Category, AVG(TotalAmount) as AvgValue FROM sales.orders GROUP BY Category
""").show()

# Monthly order view
spark.sql("""
CREATE OR REPLACE VIEW monthly_orders AS
SELECT MONTH(OrderDate) as Month, SUM(TotalAmount) as MonthlyTotal
FROM sales.orders GROUP BY MONTH(OrderDate)
""")

spark.sql("SELECT * FROM monthly_orders").show()


+-------+----------+-------+--------+--------+-----+---------+-----------+---------+---------+
|OrderID|CustomerID|Product|Category|Quantity|Price|OrderDate|TotalAmount|OrderYear|OrderType|
+-------+----------+-------+--------+--------+-----+---------+-----------+---------+---------+
+-------+----------+-------+--------+--------+-----+---------+-----------+---------+---------+

+-----------+--------+
|   Category|AvgValue|
+-----------+--------+
| Stationery|  2500.0|
|Electronics| 37800.0|
|  Furniture|  3500.0|
| Appliances|  5000.0|
+-----------+--------+

+-----+------------+
|Month|MonthlyTotal|
+-----+------------+
|    1|    101200.0|
|    3|     32500.0|
|    2|     28500.0|
+-----+------------+



In [10]:
# Mask email
masked_df = spark.read.csv(customer_path, header=True, inferSchema=True)
masked_df = masked_df.withColumn("MaskedEmail", regexp_replace("Email", r"(^\w)[^@]*", r"$1***"))
masked_df.select("Email", "MaskedEmail").show()

# Concatenate name and city
customers_df = customers_df.withColumn("NameCity", concat_ws(" from ", "Name", "City"))
customers_df.select("NameCity").show()

# Age in days
customers_df = customers_df.withColumn("CustomerAge", datediff(current_date(), "SignupDate"))
customers_df.select("Name", "CustomerAge").show()

# Month name
orders_df = orders_df.withColumn("MonthName", date_format("OrderDate", "MMMM"))
orders_df.select("OrderDate", "MonthName").show()


+-----------------+----------------+
|            Email|     MaskedEmail|
+-----------------+----------------+
|    ali@gmail.com|  a***@gmail.com|
|   neha@yahoo.com|  n***@yahoo.com|
| ravi@hotmail.com|r***@hotmail.com|
|sneha@outlook.com|s***@outlook.com|
|   amit@gmail.com|  a***@gmail.com|
+-----------------+----------------+

+--------------------+
|            NameCity|
+--------------------+
|     Ali from Mumbai|
|   Neha from Unknown|
| Ravi from Bangalore|
|Sneha from Hyderabad|
|   Amit from Chennai|
+--------------------+

+-----+-----------+
| Name|CustomerAge|
+-----+-----------+
|  Ali|       1126|
| Neha|        876|
| Ravi|       1316|
|Sneha|       1783|
| Amit|        822|
+-----+-----------+

+----------+---------+
| OrderDate|MonthName|
+----------+---------+
|2024-01-10|  January|
|2024-01-15|  January|
|2024-02-01| February|
|2024-02-10| February|
|2024-02-15| February|
|2024-03-01|    March|
|2024-03-02|    March|
+----------+---------+



In [11]:
# UDF for spend category
def tag_customer(spend):
    if spend > 50000:
        return "Gold"
    elif spend >= 10000:
        return "Silver"
    else:
        return "Bronze"

spend_udf = udf(tag_customer, StringType())

customer_spend = joined_df.groupBy("CustomerID", "Name").agg(sum("TotalAmount").alias("TotalSpend"))
customer_spend = customer_spend.withColumn("Tier", spend_udf("TotalSpend"))
customer_spend.show()

# UDF to shorten product names
shorten_udf = udf(lambda p: p[:3] + "..." if p else "", StringType())
orders_df = orders_df.withColumn("ShortProduct", shorten_udf("Product"))
orders_df.select("Product", "ShortProduct").show()


+----------+-----+----------+------+
|CustomerID| Name|TotalSpend|  Tier|
+----------+-----+----------+------+
|       105| Amit|    2500.0|Bronze|
|       104|Sneha|    5000.0|Bronze|
|       101|  Ali|  101200.0|  Gold|
|       102| Neha|   50000.0|Silver|
|       103| Ravi|    3500.0|Bronze|
+----------+-----+----------+------+

+---------+------------+
|  Product|ShortProduct|
+---------+------------+
|   Laptop|      Lap...|
|    Mouse|      Mou...|
|   Tablet|      Tab...|
|Bookshelf|      Boo...|
|    Mixer|      Mix...|
| Notebook|      Not...|
|    Phone|      Pho...|
+---------+------------+



In [12]:
# Save Parquet
joined_df.write.mode("overwrite").parquet("/content/drive/MyDrive/pyspark_data/joined_data.parquet")

# Read & verify
parquet_df = spark.read.parquet("/content/drive/MyDrive/pyspark_data/joined_data.parquet")
parquet_df.printSchema()

# Temp view
parquet_df.createOrReplaceGlobalTempView("joined_view")
spark.sql("SELECT * FROM global_temp.joined_view LIMIT 5").show()

# Compare performance
import time

start = time.time()
csv_df = spark.read.csv(order_path, header=True, inferSchema=True)
csv_df.count()
print("CSV read time:", time.time() - start)

start = time.time()
parquet_df.count()
print("Parquet read time:", time.time() - start)


root
 |-- CustomerID: integer (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderType: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- CustomerType: string (nullable = true)

+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+---------+----------+------------+
|CustomerID|OrderID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderType| Name|     City|SignupDate|CustomerType|
+----------+-------+---------+-----------+--------+-------+----------+-----------+---------+---------+-----+---------+----------+------------+
|   