In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    current_date,
    datediff,
    explode,
    expr,
    get_json_object,
    month,
    months_between,
    udf,
    when,
    year,
)
from pyspark.sql.types import StringType

# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("SalesAnalysis")
    .getOrCreate()
)

# Location of the sales CSV -- point this at your own copy:
file_path = "/content/drive/MyDrive/Sales_Dataset__500_Records_.csv"  # For Google Colab
# file_path = "C:/Users/YourName/Documents/Sales_Dataset__500_Records_.csv"  # For local PySpark

# Read the CSV: the first row supplies column names and Spark infers the types
df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(file_path)
)


In [3]:
# Preview the first five rows as a formatted table
df.show(5)

# tail() returns a list of Row objects instead of printing anything, and a
# notebook only echoes the last expression of a cell, so the rows have to be
# printed explicitly or they are silently discarded
print(*df.tail(5), sep="\n")

# Column names, inferred types and nullability
df.printSchema()

# Last expression of the cell: echoed as a list of (column, type) pairs
df.dtypes


+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

[('OrderID', 'int'),
 ('CustomerName', 'string'),
 ('ProductCategory', 'string'),
 ('Amount', 'double'),
 ('OrderDate', 'date'),
 ('DeliveryStatus', 'string'),
 ('Discount', 'double'),
 ('City', 'string'),
 ('PaymentMode', 'string'),
 ('CustomerSince', 'date')]

In [6]:
# Projection: keep only the order id, customer name and amount
selected_df = df.select(["OrderID", "CustomerName", "Amount"])
selected_df.show()

# Same projection with the Amount column exposed as OrderAmount
renamed_df = selected_df.withColumnRenamed("Amount", "OrderAmount")
renamed_df.show()

# Orders whose Amount is above 500
filtered_df = df.where(col("Amount") > 500)
filtered_df.show()

# Orders placed from one particular city (e.g., "New York")
city_filtered_df = df.where(col("City") == "New York")
city_filtered_df.show()



+-------+------------------+------+
|OrderID|      CustomerName|Amount|
+-------+------------------+------+
|   2824|     Donald Walker|783.04|
|   7912|      Brandon Hall| 905.0|
|   4611|      Donald Booth|657.96|
|   3547|    Phillip Garcia|606.89|
|   8527|      Valerie Gray| 77.87|
|   4150|       Amber Perez|352.37|
|   5554|        Roy Martin|148.33|
|   2169|    Carolyn Daniel| 14.09|
|   6313|       Patty Perez| 79.83|
|   6155|Jonathan Wilkerson|882.68|
|   9830|       Kevin Hurst|870.55|
|   9085| Anthony Rodriguez|921.73|
|   2040|     Kyle Mcdonald|327.52|
|   6573|    Jeffrey Chavez|676.02|
|   2743|  Elizabeth Fowler| 47.06|
|   9837|     Tammy Sellers| 46.15|
|   6038|     David Bradley|348.51|
|   3060|       John Pierce|362.09|
|   4295|   Jennifer Powers|684.26|
|   5061|    George Chapman|251.89|
+-------+------------------+------+
only showing top 20 rows

+-------+------------------+-----------+
|OrderID|      CustomerName|OrderAmount|
+-------+------------------+

In [8]:
# Remove the CustomerSince column
df1 = df.drop("CustomerSince")
df1.show(5)  # first five rows only

# FinalAmount = Amount minus the discounted part (Amount * Discount)
df2 = df.withColumn(
    "FinalAmount",
    col("Amount") - (col("Amount") * col("Discount")),
)
df2.show(5)

# Largest FinalAmount first
sorted_df = df2.sort(col("FinalAmount").desc())
sorted_df.show(5)

# Relabel the "Cancelled" status as "Order Cancelled"; every other status
# (including null) is passed through unchanged by otherwise()
replaced_df = df.withColumn(
    "DeliveryStatus",
    when(col("DeliveryStatus") == "Cancelled", "Order Cancelled")
    .otherwise(col("DeliveryStatus")),
)
replaced_df.show(5)



+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----

In [9]:
# Number of orders in each delivery status
(
    df.groupBy("DeliveryStatus")
    .count()
    .show()
)

# Mean order amount per product category (result column: avg(Amount))
(
    df.groupBy("ProductCategory")
    .agg({"Amount": "avg"})
    .show()
)

# Summed order amount per city, with the generated sum(Amount) column
# renamed to TotalSales
(
    df.groupBy("City")
    .agg({"Amount": "sum"})
    .withColumnRenamed("sum(Amount)", "TotalSales")
    .show()
)


+--------------+-----+
|DeliveryStatus|count|
+--------------+-----+
|      Returned|  117|
|     Cancelled|  149|
|     Delivered|  119|
|       Pending|  115|
+--------------+-----+

+---------------+------------------+
|ProductCategory|       avg(Amount)|
+---------------+------------------+
|        Fashion| 500.6308235294116|
|      Groceries|459.51786407766957|
|    Electronics|           551.745|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+

+----------------+----------+
|            City|TotalSales|
+----------------+----------+
|     Ramseymouth|    761.06|
|East Edwardshire|    291.26|
|      Thomasberg|    882.68|
|     Laurenville|    383.26|
| South Colinstad|    786.27|
|    Lake Douglas|    975.09|
|   Williamsmouth|     10.78|
|      Gordonport|    514.99|
|  West Dawnmouth|      12.8|
|        Seanbury|    814.39|
|     Sheilaville|    981.05|
|       Mollybury|    222.02|
|       Lisaville|     45.69|
|

In [11]:
from pyspark.sql.functions import col, when

# Blank out City on every order whose OrderID is a multiple of 10, so the
# null-handling calls below have something to act on
df_with_nulls = df.withColumn(
    "City",
    when(col("OrderID") % 10 == 0, None).otherwise(col("City")),
)
df_with_nulls.show(5)

# Replace the null cities with the placeholder "Unknown"
filled_df = df_with_nulls.na.fill({"City": "Unknown"})
filled_df.show(5)

# Alternatively, discard the rows whose City is null
dropped_df = df_with_nulls.na.drop(subset=["City"])
dropped_df.show(5)

# Label orders above 800 as "High-Value" and everything else as "Regular"
tagged_df = df.withColumn(
    "CustomerType",
    when(col("Amount") > 800, "High-Value").otherwise("Regular"),
)
tagged_df.show(5)



+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

In [13]:
from pyspark.sql.functions import year, month, datediff, current_date, col

# Derive the calendar year and month of each order from OrderDate
df_date = (
    df
    .withColumn("OrderYear", year(col("OrderDate")))
    .withColumn("OrderMonth", month(col("OrderDate")))
)
df_date.select(["OrderDate", "OrderYear", "OrderMonth"]).show(5)

# Customer loyalty in completed years.
# months_between() follows the calendar, so the value only reaches the next
# whole year on the actual anniversary. Dividing a day count by 365 ignores
# leap days and can report an extra year shortly before the anniversary
# (2020-10-15 -> 2024-10-14 is 1460 days, i.e. "4" years, one day early).
# The cast to int drops the fractional part, leaving whole years.
df_loyalty = df.withColumn(
    "LoyaltyYears",
    (months_between(current_date(), col("CustomerSince")) / 12).cast("int"),
)
df_loyalty.select("CustomerName", "CustomerSince", "LoyaltyYears").show(5)



+----------+---------+----------+
| OrderDate|OrderYear|OrderMonth|
+----------+---------+----------+
|2024-12-26|     2024|        12|
|2024-09-12|     2024|         9|
|2025-01-12|     2025|         1|
|2024-03-24|     2024|         3|
|2024-08-04|     2024|         8|
+----------+---------+----------+
only showing top 5 rows

+--------------+-------------+------------+
|  CustomerName|CustomerSince|LoyaltyYears|
+--------------+-------------+------------+
| Donald Walker|   2020-10-15|           4|
|  Brandon Hall|   2022-03-15|           3|
|  Donald Booth|   2021-08-07|           3|
|Phillip Garcia|   2020-08-08|           4|
|  Valerie Gray|   2022-11-15|           2|
+--------------+-------------+------------+
only showing top 5 rows



In [14]:
from pyspark.sql.functions import year

# Small lookup table that maps a city to its sales region
region_data = [("New York", "East"), ("Los Angeles", "West"), ("Chicago", "Midwest")]
region_df = spark.createDataFrame(region_data, schema=["City", "Region"])
region_df.show()

# Inner join: keeps only the orders whose City appears in the lookup table
inner_join = df.join(region_df, "City", "inner")
inner_join.show(5)

# Left join: keeps every order; Region is null where the City has no match
left_join = df.join(region_df, "City", "left")
left_join.show(5)

# Stack the orders placed in 2023 on top of those placed in 2024
orders_2023 = df.where(year(col("OrderDate")) == 2023)
orders_2024 = df.where(year(col("OrderDate")) == 2024)
union_df = orders_2023.union(orders_2024)
union_df.show(5)


+-----------+-------+
|       City| Region|
+-----------+-------+
|   New York|   East|
|Los Angeles|   West|
|    Chicago|Midwest|
+-----------+-------+

+----+-------+------------+---------------+------+---------+--------------+--------+-----------+-------------+------+
|City|OrderID|CustomerName|ProductCategory|Amount|OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|Region|
+----+-------+------------+---------------+------+---------+--------------+--------+-----------+-------------+------+
+----+-------+------------+---------------+------+---------+--------------+--------+-----------+-------------+------+

+----------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+------+
|            City|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|Region|
+----------------+-------+--------------+---------------+------+----------+--------------+--------+-------

In [15]:
from pyspark.sql.functions import to_json, from_json, schema_of_json, expr

# Serialise every row into one JSON string, stored in a new json_col column;
# struct(*) packs all existing columns into a single struct for to_json()
json_df = df.withColumn(
    "json_col",
    to_json(expr("struct(*)")),
)
json_df.select("json_col").show(n=5, truncate=False)

# Parse the JSON strings back into typed columns.
# The schema is taken from the source DataFrame instead of being inferred
# from one sample row: by default to_json() leaves null fields out, so a
# sampled row can miss columns; inference reads the date columns back as
# plain strings; and first() returns None when the DataFrame is empty.
# Using df.schema also avoids running a Spark job just to fetch a sample.
json_schema = df.schema
df_from_json = (
    json_df
    .select(from_json("json_col", json_schema).alias("data"))
    .select("data.*")
)
df_from_json.show(5)


+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|json_col                                                                                                                                                                                                                                       |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"OrderID":2824,"CustomerName":"Donald Walker","ProductCategory":"Books","Amount":783.04,"OrderDate":"2024-12-26","DeliveryStatus":"Returned","Discount":0.15,"City":"Lake Joyside","PaymentMode":"Credit Card","CustomerSince":"2020-10-15"}  |
|{"OrderID":7912,"CustomerName":

In [16]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Define function
def tag_order(amount):
    """Classify an order amount as "Big", "Medium" or "Small".

    Args:
        amount: Numeric order amount, or None for a missing value.

    Returns:
        "Big" when amount is above 800, "Medium" when it is above 400,
        "Small" otherwise. None is returned for a None amount.
    """
    # Comparing None with a number raises TypeError, so pass missing
    # amounts through as None instead of failing
    if amount is None:
        return None
    if amount > 800:
        return "Big"
    if amount > 400:
        return "Medium"
    return "Small"

# Wrap the plain Python function as a Spark UDF that returns a string
tag_udf = udf(tag_order, returnType=StringType())

# Add an OrderSize column by running the UDF on every order's Amount
tagged_orders_df = df.withColumn(
    "OrderSize",
    tag_udf(col("Amount")),
)
tagged_orders_df.select(["OrderID", "Amount", "OrderSize"]).show(5)


+-------+------+---------+
|OrderID|Amount|OrderSize|
+-------+------+---------+
|   2824|783.04|   Medium|
|   7912| 905.0|      Big|
|   4611|657.96|   Medium|
|   3547|606.89|   Medium|
|   8527| 77.87|    Small|
+-------+------+---------+
only showing top 5 rows

