In [12]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every later cell.
spark = SparkSession.builder.appName("SalesAnalysis").getOrCreate()

# Read the sales CSV: the first line is treated as the header and column
# types are inferred by Spark rather than declared.
df_spark = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Sales_Dataset__500_Records_.csv")
)

# Preview the first five rows.
df_spark.show(5)

# DataFrame.tail() returns a list of Row objects instead of printing, and a
# bare expression in the middle of a cell is never displayed, so the original
# call produced no visible output. Wrap the rows back into a small DataFrame
# (same schema) so the last five rows render as a table like the head does.
spark.createDataFrame(df_spark.tail(5), schema=df_spark.schema).show()

# Column names and the inferred types.
df_spark.printSchema()
import dask.dataframe as dd



+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

In [13]:


#PySpark
from pyspark.sql.functions import col

# Projection: keep only the identifying columns and the order amount.
df_selected = df_spark.select(["OrderID", "CustomerName", "Amount"])

# Same projection with Amount exposed under the name OrderAmount.
df_renamed = df_selected.withColumnRenamed("Amount", "OrderAmount")

# Row filters on the full frame (where() is an alias of filter()).
df_filtered = df_spark.where(col("Amount") > 500)       # orders above 500
df_city = df_spark.where(col("City") == "New York")     # orders from New York only



In [15]:
from pyspark.sql.functions import expr

# Drop CustomerSince, then add FinalAmount = Amount minus the discounted share.
df_mod = (
    df_spark
    .drop("CustomerSince")
    .withColumn("FinalAmount", col("Amount") - (col("Amount") * col("Discount")))
)

# Highest final amounts first (sort() is an alias of orderBy()).
df_sorted = df_mod.sort(col("FinalAmount").desc())

# Relabel cancelled orders; every other delivery status is kept unchanged.
df_replaced = df_mod.withColumn(
    "DeliveryStatus",
    expr("CASE WHEN DeliveryStatus = 'Cancelled' THEN 'Order Cancelled' ELSE DeliveryStatus END"),
)


In [16]:
# Number of orders per delivery status.
(
    df_spark
    .groupBy("DeliveryStatus")
    .count()
    .show()
)

# Average order amount per product category.
(
    df_spark
    .groupBy("ProductCategory")
    .avg("Amount")
    .show()
)

# Total order amount per city.
(
    df_spark
    .groupBy("City")
    .sum("Amount")
    .show()
)


+--------------+-----+
|DeliveryStatus|count|
+--------------+-----+
|      Returned|  117|
|     Cancelled|  149|
|     Delivered|  119|
|       Pending|  115|
+--------------+-----+

+---------------+------------------+
|ProductCategory|       avg(Amount)|
+---------------+------------------+
|        Fashion| 500.6308235294116|
|      Groceries|459.51786407766957|
|    Electronics|           551.745|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+

+----------------+-----------+
|            City|sum(Amount)|
+----------------+-----------+
|     Ramseymouth|     761.06|
|East Edwardshire|     291.26|
|      Thomasberg|     882.68|
|     Laurenville|     383.26|
| South Colinstad|     786.27|
|    Lake Douglas|     975.09|
|   Williamsmouth|      10.78|
|      Gordonport|     514.99|
|  West Dawnmouth|       12.8|
|        Seanbury|     814.39|
|     Sheilaville|     981.05|
|       Mollybury|     222.02|
|       Lisavill

In [17]:
from pyspark.sql.functions import when

# Inject missing values: City becomes NULL for every order whose OrderID is a
# multiple of 50; all other rows keep their original city.
df_null = df_spark.withColumn(
    "City",
    when((col("OrderID") % 50) == 0, None).otherwise(col("City")),
)

# Replace the NULL cities with a placeholder label.
df_filled = df_null.fillna("Unknown", subset=["City"])

# Classify each order by amount: above 800 is "High-Value", the rest "Regular".
df_tagged = df_spark.withColumn(
    "CustomerType",
    when(col("Amount") > 800, "High-Value").otherwise("Regular"),
)


In [18]:
from pyspark.sql.functions import year, month, current_date, datediff, months_between

# Calendar parts of the order date.
df_spark = df_spark.withColumn("Year", year(col("OrderDate")))
df_spark = df_spark.withColumn("Month", month(col("OrderDate")))

# Whole years since the customer signed up. Dividing a day count by 365
# ignores leap days, so the value could tick over shortly before the real
# anniversary; months_between / 12 follows the calendar instead. The cast to
# int truncates the fraction, exactly as the previous expression did.
df_spark = df_spark.withColumn(
    "LoyaltyYears",
    (months_between(current_date(), col("CustomerSince")) / 12).cast("int"),
)


In [19]:
# Small lookup table mapping a city to its region.
region_data = [("New York", "East"), ("Los Angeles", "West")]
region_df = spark.createDataFrame(data=region_data, schema=["City", "Region"])

# Inner join keeps only orders whose city appears in the lookup table;
# left join keeps every order and leaves Region NULL where there is no match.
df_joined_inner = df_spark.join(region_df, "City", "inner")
df_joined_left = df_spark.join(region_df, "City", "left")

# Orders split by calendar year of OrderDate, then stacked back together.
df_2023 = df_spark.where(year("OrderDate") == 2023)
df_2024 = df_spark.where(year("OrderDate") == 2024)
# union() matches columns by position; both halves derive from df_spark,
# so their column order is the same.
df_union = df_2023.union(df_2024)


In [20]:
from pyspark.sql.functions import to_json, struct, from_json, explode, get_json_object
from pyspark.sql.types import StringType

# Serialise every column of each row into one JSON string column, json_data.
json_df = df_spark.withColumn(
    "json_data",
    to_json(struct(*[col(name) for name in df_spark.columns])),
)

# Parse those JSON strings back into a DataFrame; the schema is inferred by
# the JSON reader from the strings themselves.
parsed_df = spark.read.json(json_df.rdd.map(lambda record: record["json_data"]))


In [21]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def tag_order(amount):
    """Classify an order amount into a size label.

    Args:
        amount: Numeric order amount, or None when the value is missing.

    Returns:
        "Big" for amounts above 800, "Medium" for amounts above 400 (up to
        and including 800), "Small" otherwise, and None when ``amount`` is
        None.
    """
    # Spark passes SQL NULL to a Python UDF as None; comparing None with a
    # number raises TypeError and would fail the whole job, so propagate the
    # missing value instead.
    if amount is None:
        return None
    if amount > 800:
        return "Big"
    if amount > 400:
        return "Medium"
    return "Small"

# Wrap the plain Python classifier as a Spark UDF that yields a string column.
tag_udf = udf(tag_order, returnType=StringType())

# Add the label for each order's Amount as a new OrderTag column.
# NOTE: this rebinds df_tagged, which an earlier cell also assigns (the frame
# carrying CustomerType); after this cell the name refers to the OrderTag frame.
df_tagged = df_spark.withColumn(
    "OrderTag",
    tag_udf(col("Amount")),
)
