In [1]:
# Install PySpark
!pip install pyspark

# Import libraries and initialize Spark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd

spark = SparkSession.builder \
    .appName("SalesDataAnalysis") \
    .getOrCreate()

# Upload your file
from google.colab import files
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

# Load the dataset
df = spark.read.csv(file_name, header=True, inferSchema=True)



Saving Sales_Dataset__500_Records_.csv to Sales_Dataset__500_Records_.csv


In [2]:
print("=== Exercise 1 ===")
df.show(5)
df.orderBy(F.col("OrderID").desc()).show(5)
df.printSchema()

=== Exercise 1 ===
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|

In [3]:
print("=== Exercise 2 ===")
df_selected = df.select("OrderID", "CustomerName", "Amount").withColumnRenamed("Amount", "OrderAmount")
df_selected.show(5)

df_filtered = df_selected.filter(F.col("OrderAmount") > 500)
df_filtered.show(5)

df.filter(F.col("City") == "New Jamesside").show(5)

=== Exercise 2 ===
+-------+--------------+-----------+
|OrderID|  CustomerName|OrderAmount|
+-------+--------------+-----------+
|   2824| Donald Walker|     783.04|
|   7912|  Brandon Hall|      905.0|
|   4611|  Donald Booth|     657.96|
|   3547|Phillip Garcia|     606.89|
|   8527|  Valerie Gray|      77.87|
+-------+--------------+-----------+
only showing top 5 rows

+-------+------------------+-----------+
|OrderID|      CustomerName|OrderAmount|
+-------+------------------+-----------+
|   2824|     Donald Walker|     783.04|
|   7912|      Brandon Hall|      905.0|
|   4611|      Donald Booth|     657.96|
|   3547|    Phillip Garcia|     606.89|
|   6155|Jonathan Wilkerson|     882.68|
+-------+------------------+-----------+
only showing top 5 rows

+-------+------------+---------------+------+----------+--------------+--------+-------------+-----------+-------------+
|OrderID|CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|         City|PaymentMode|Cu

In [5]:
print("=== Exercise 3 ===")
df_manipulated = df.drop("CustomerSince") \
    .withColumn("FinalAmount", F.col("Amount") - (F.col("Amount") * F.col("Discount"))) \
    .orderBy(F.col("FinalAmount").desc()) \
    .withColumn("DeliveryStatus",
               F.when(F.col("DeliveryStatus") == "Cancelled", "Order Cancelled")
                .otherwise(F.col("DeliveryStatus")))
df_manipulated.show(5)

=== Exercise 3 ===
+-------+--------------+---------------+------+----------+---------------+--------+------------+-----------+-----------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate| DeliveryStatus|Discount|        City|PaymentMode|      FinalAmount|
+-------+--------------+---------------+------+----------+---------------+--------+------------+-----------+-----------------+
|   5573|Jordan Frazier|          Books|981.05|2025-03-19|Order Cancelled|    0.02| Sheilaville|       Cash|          961.429|
|   8474|   Heidi Brown|    Electronics|968.91|2023-11-23|Order Cancelled|    0.02|  Riverafort|       Cash|         949.5318|
|   8889|   Karen Garza|          Books| 998.3|2024-10-17|Order Cancelled|    0.06|  Johnsonton|Credit Card|938.4019999999999|
|   2127|  Jaclyn Moore|      Groceries|933.32|2025-03-11|       Returned|    0.01| Cherylhaven|       Cash|         923.9868|
|   9806| Samantha Gill|      Groceries|993.17|2024-11-12|Order Cancelled|    0.07|New Seans

In [6]:
print("=== Exercise 4 ===")
df.groupBy("DeliveryStatus").count().show()
df.groupBy("ProductCategory").agg(F.avg("Amount").alias("AvgAmount")).show()
df.groupBy("City").agg(F.sum("Amount").alias("TotalSales")).orderBy(F.col("TotalSales").desc()).show(10)

=== Exercise 4 ===
+--------------+-----+
|DeliveryStatus|count|
+--------------+-----+
|      Returned|  117|
|     Cancelled|  149|
|     Delivered|  119|
|       Pending|  115|
+--------------+-----+

+---------------+------------------+
|ProductCategory|         AvgAmount|
+---------------+------------------+
|        Fashion| 500.6308235294116|
|      Groceries|459.51786407766957|
|    Electronics|           551.745|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+

+----------------+------------------+
|            City|        TotalSales|
+----------------+------------------+
|       Smithside|           1842.13|
|       Meganside|1607.8899999999999|
|    West Jessica|           1249.33|
|   Armstrongfort|           1226.18|
|     East Robert|            1194.5|
|West Christopher|           1144.37|
|       Lake John|           1141.28|
|    South Andrea|           1048.53|
|    Smithborough|           1026.98|
|    G

In [9]:
print("=== Exercise 5 ===")
# Inject nulls if none exist
if df.filter(F.col("City").isNull()).count() == 0:
    df = df.withColumn("City", F.when(F.rand() > 0.9, F.lit(None)).otherwise(F.col("City")))

# Show handling options
df.fillna("Unknown", subset=["City"]).show(5)
df.dropna(subset=["City"]).show(5)
df.withColumn("HighValue", F.when(F.col("Amount") > 800, "Yes").otherwise("No")).show(5)
df.fillna("Unknown", subset=["City"]).show(5)
df.dropna(subset=["City"]).show(5)
df.withColumn("HighValue", F.when(F.col("Amount") > 800, "Yes").otherwise("No")).show(5)

=== Exercise 5 ===
+-------+--------------+---------------+------+----------+--------------+--------+-------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|         City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+-------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15| Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01| Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|      Unknown|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|    Mariastad|       Cash|   2022-11-1

In [10]:
print("=== Exercise 6 ===")
df_dates = df.withColumn("OrderDate", F.to_date(F.col("OrderDate"))) \
    .withColumn("OrderYear", F.year("OrderDate")) \
    .withColumn("OrderMonth", F.month("OrderDate")) \
    .withColumn("LoyaltyYears", F.datediff(F.current_date(), F.to_date(F.col("CustomerSince"))) / 365)
df_dates.select("OrderID", "OrderYear", "OrderMonth", "LoyaltyYears").show(5)

=== Exercise 6 ===
+-------+---------+----------+------------------+
|OrderID|OrderYear|OrderMonth|      LoyaltyYears|
+-------+---------+----------+------------------+
|   2824|     2024|        12| 4.635616438356164|
|   7912|     2024|         9| 3.221917808219178|
|   4611|     2025|         1| 3.824657534246575|
|   3547|     2024|         3| 4.821917808219178|
|   8527|     2024|         8|2.5506849315068494|
+-------+---------+----------+------------------+
only showing top 5 rows



In [11]:
print("=== Exercise 7 ===")
# Create region mapping
regions = spark.createDataFrame(
    [(city, ["North","South","East","West"][i%4])
     for i, city in enumerate(df.select("City").distinct().rdd.flatMap(lambda x: x).collect())],
    ["City", "Region"])

# Join examples
df.join(regions, "City", "inner").show(5)
df.join(regions, "City", "left").show(5)

# Union example
df.filter(F.year(F.to_date(F.col("OrderDate"))) == 2023).union(
   df.filter(F.year(F.to_date(F.col("OrderDate"))) == 2024)).count()

=== Exercise 7 ===
+----------------+-------+------------------+---------------+------+----------+--------------+--------+-----------+-------------+------+
|            City|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|Region|
+----------------+-------+------------------+---------------+------+----------+--------------+--------+-----------+-------------+------+
|     Ramseymouth|   4613|     Don Tucker MD|           Toys|761.06|2024-11-03|      Returned|    0.03|Credit Card|   2020-12-17| North|
|East Edwardshire|   8827|    Austin Vasquez|          Books|291.26|2024-08-24|      Returned|    0.21|     Wallet|   2021-09-16| South|
|      Thomasberg|   6155|Jonathan Wilkerson|        Fashion|882.68|2024-10-14|     Cancelled|    0.27|       Cash|   2021-06-20|  East|
|     Laurenville|   1046|    William Huerta|      Groceries|383.26|2024-07-26|     Delivered|    0.16|     Wallet|   2023-03-25|  West|
| South Colinstad|   3

401

In [13]:
print("=== Exercise 8 ===")
df_json = df.withColumn("OrderJSON", F.to_json(F.struct([F.col(c) for c in df.columns])))
df_json.select("OrderJSON").show(2, truncate=False)

# Access JSON fields
# Read the JSON strings into a new DataFrame
df_parsed_json = spark.read.json(df_json.select("OrderJSON").rdd.map(lambda x: x[0]))

# Select the desired field (CustomerName) directly from the parsed DataFrame
df_parsed_json.select("CustomerName").alias("Name").show(5)

=== Exercise 8 ===
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|OrderJSON                                                                                                                                                                                                                                    |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"OrderID":2824,"CustomerName":"Donald Walker","ProductCategory":"Books","Amount":783.04,"OrderDate":"2024-12-26","DeliveryStatus":"Returned","Discount":0.15,"City":"Lake Joyside","PaymentMode":"Credit Card","CustomerSince":"2020-10-15"}|
|{"OrderID":7912,"Cus

In [14]:
print("=== Exercise 9 ===")
categorize_udf = F.udf(lambda amt: "Big" if amt>800 else "Medium" if amt>500 else "Small")
df.withColumn("OrderSize", categorize_udf(F.col("Amount"))).select("OrderID", "Amount", "OrderSize").show(10)

=== Exercise 9 ===
+-------+------+---------+
|OrderID|Amount|OrderSize|
+-------+------+---------+
|   2824|783.04|   Medium|
|   7912| 905.0|      Big|
|   4611|657.96|   Medium|
|   3547|606.89|   Medium|
|   8527| 77.87|    Small|
|   4150|352.37|    Small|
|   5554|148.33|    Small|
|   2169| 14.09|    Small|
|   6313| 79.83|    Small|
|   6155|882.68|      Big|
+-------+------+---------+
only showing top 10 rows



In [15]:
spark.stop()