1. DataFrame Creation and Inspection

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
file_path = "/content/drive/MyDrive/Sales_Dataset__500_Records_.csv"


Pandas

In [3]:
import pandas as pd

pdf = pd.read_csv(file_path)

print("Pandas - First 5 rows:")
print(pdf.head())

print("\nPandas - Last 5 rows:")
print(pdf.tail())

print("\nPandas - Data Types:")
print(pdf.dtypes)


Pandas - First 5 rows:
   OrderID    CustomerName ProductCategory  Amount   OrderDate DeliveryStatus  \
0     2824   Donald Walker           Books  783.04  2024-12-26       Returned   
1     7912    Brandon Hall       Groceries  905.00  2024-09-12      Cancelled   
2     4611    Donald Booth         Fashion  657.96  2025-01-12       Returned   
3     3547  Phillip Garcia         Fashion  606.89  2024-03-24       Returned   
4     8527    Valerie Gray            Toys   77.87  2024-08-04      Delivered   

   Discount              City  PaymentMode CustomerSince  
0      0.15      Lake Joyside  Credit Card    2020-10-15  
1      0.03     New Jamesside       Wallet    2022-03-15  
2      0.01      Lake Roberto       Wallet    2021-08-07  
3      0.15  West Melanieview       Wallet    2020-08-08  
4      0.17         Mariastad         Cash    2022-11-15  

Pandas - Last 5 rows:
     OrderID     CustomerName ProductCategory  Amount   OrderDate  \
495     2930     Jaime Harris         Fashio

Pyspark

In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SalesDataset").getOrCreate()

sdf = spark.read.csv(file_path, header=True, inferSchema=True)

print("PySpark - First 5 rows:")
sdf.show(5)

print("\nPySpark - Last 5 rows (approx):")
sdf.tail(5)

print("\nPySpark - Schema:")
sdf.printSchema()


PySpark - First 5 rows:
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Maria

Dask

In [7]:
import dask.dataframe as dd

ddf = dd.read_csv(file_path)

print("Dask - First 5 rows:")
print(ddf.head(5))

print("\nDask - Last 5 rows:")
print(ddf.tail(5))

print("\nDask - Data Types:")
print(ddf.dtypes)


Dask - First 5 rows:
   OrderID    CustomerName ProductCategory  Amount   OrderDate DeliveryStatus  \
0     2824   Donald Walker           Books  783.04  2024-12-26       Returned   
1     7912    Brandon Hall       Groceries  905.00  2024-09-12      Cancelled   
2     4611    Donald Booth         Fashion  657.96  2025-01-12       Returned   
3     3547  Phillip Garcia         Fashion  606.89  2024-03-24       Returned   
4     8527    Valerie Gray            Toys   77.87  2024-08-04      Delivered   

   Discount              City  PaymentMode CustomerSince  
0      0.15      Lake Joyside  Credit Card    2020-10-15  
1      0.03     New Jamesside       Wallet    2022-03-15  
2      0.01      Lake Roberto       Wallet    2021-08-07  
3      0.15  West Melanieview       Wallet    2020-08-08  
4      0.17         Mariastad         Cash    2022-11-15  

Dask - Last 5 rows:
     OrderID     CustomerName ProductCategory  Amount   OrderDate  \
495     2930     Jaime Harris         Fashion  6

2. Selection, Renaming, and Filtering

In [22]:
from pyspark.sql.functions import col

sdf_selected = sdf.select("OrderID", "CustomerName", "Amount")

sdf_renamed = sdf_selected.withColumnRenamed("Amount", "OrderAmount")

sdf_filtered_amount = sdf.filter(col("Amount") > 500)

sdf_filtered_city = sdf.filter(col("City") == "Lake Roberto")

In [23]:
sdf_renamed.show(5)
sdf_filtered_amount.show(5)
sdf_filtered_city.show(5)

+-------+--------------+-----------+
|OrderID|  CustomerName|OrderAmount|
+-------+--------------+-----------+
|   2824| Donald Walker|     783.04|
|   7912|  Brandon Hall|      905.0|
|   4611|  Donald Booth|     657.96|
|   3547|Phillip Garcia|     606.89|
|   8527|  Valerie Gray|      77.87|
+-------+--------------+-----------+
only showing top 5 rows

+-------+------------------+---------------+------+----------+--------------+--------+----------------+-----------+-----------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|      FinalAmount|
+-------+------------------+---------------+------+----------+--------------+--------+----------------+-----------+-----------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|          665.584|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|   

3. Data Manipulation

In [20]:
from pyspark.sql.functions import col, expr

sdf = sdf.drop("CustomerSince")

sdf = sdf.withColumn("FinalAmount", col("Amount") - (col("Amount") * col("Discount")))

sdf_sorted = sdf.orderBy(col("FinalAmount").desc())

sdf_cleaned = sdf_sorted.withColumn(
    "DeliveryStatus",
    expr("CASE WHEN DeliveryStatus = 'Cancelled' THEN 'Order Cancelled' ELSE DeliveryStatus END")
)

In [21]:
sdf_cleaned.select("OrderID", "Amount", "Discount", "FinalAmount", "DeliveryStatus").show(5)

+-------+------+--------+-----------------+---------------+
|OrderID|Amount|Discount|      FinalAmount| DeliveryStatus|
+-------+------+--------+-----------------+---------------+
|   5573|981.05|    0.02|          961.429|Order Cancelled|
|   8474|968.91|    0.02|         949.5318|Order Cancelled|
|   8889| 998.3|    0.06|938.4019999999999|Order Cancelled|
|   2127|933.32|    0.01|         923.9868|       Returned|
|   9806|993.17|    0.07|         923.6481|Order Cancelled|
+-------+------+--------+-----------------+---------------+
only showing top 5 rows



4. Aggregations and GroupBy

In [18]:
from pyspark.sql.functions import avg, sum, count

delivery_status_counts = sdf_cleaned.groupBy("DeliveryStatus").agg(count("*").alias("OrderCount"))

avg_amount_by_category = sdf_cleaned.groupBy("ProductCategory").agg(avg("Amount").alias("AvgAmount"))

total_sales_by_city = sdf_cleaned.groupBy("City").agg(sum("Amount").alias("TotalSales"))

In [19]:
delivery_status_counts.show()
avg_amount_by_category.show()
total_sales_by_city.show()

+---------------+----------+
| DeliveryStatus|OrderCount|
+---------------+----------+
|       Returned|       117|
|      Delivered|       119|
|Order Cancelled|       149|
|        Pending|       115|
+---------------+----------+

+---------------+------------------+
|ProductCategory|         AvgAmount|
+---------------+------------------+
|        Fashion|500.63082352941205|
|      Groceries|459.51786407767014|
|    Electronics| 551.7450000000002|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+

+----------------+----------+
|            City|TotalSales|
+----------------+----------+
|     Ramseymouth|    761.06|
|East Edwardshire|    291.26|
|    Lake Douglas|    975.09|
|      Thomasberg|    882.68|
| South Colinstad|    786.27|
|     Laurenville|    383.26|
|        Seanbury|    814.39|
|      Gordonport|    514.99|
|  West Dawnmouth|      12.8|
|   Williamsmouth|     10.78|
|     Sheilaville|    981.05|
|       Molly

5. Null Handling & Update

In [13]:
from pyspark.sql.functions import when
import random

sdf_with_nulls = sdf_cleaned.withColumn(
    "City",
    when((col("City").isNotNull()) & (col("Amount") % 10 == 0), None).otherwise(col("City"))
)


In [14]:
sdf_fillna = sdf_with_nulls.fillna({"City": "Unknown"})

sdf_dropna = sdf_with_nulls.dropna(subset=["City"])


In [15]:
from pyspark.sql.functions import when

sdf_tagged = sdf_fillna.withColumn(
    "CustomerType",
    when(col("Amount") > 800, "High").otherwise("Regular")
)


In [16]:
sdf_tagged.select("OrderID", "Amount", "City", "CustomerType").show(10)


+-------+------+--------------+------------+
|OrderID|Amount|          City|CustomerType|
+-------+------+--------------+------------+
|   5573|981.05|   Sheilaville|        High|
|   8474|968.91|    Riverafort|        High|
|   8889| 998.3|    Johnsonton|        High|
|   2127|933.32|   Cherylhaven|        High|
|   9806|993.17|  New Seanstad|        High|
|   5593|961.35| Alexisborough|        High|
|   2120|948.84|  Jenniferberg|        High|
|   5949|918.14|East Scottfort|        High|
|   1422| 973.2|    Alfredview|        High|
|   2904|922.29|   Stokesmouth|        High|
+-------+------+--------------+------------+
only showing top 10 rows



6. Date & Time Functions

In [25]:
sdf_original = spark.read.csv(file_path, header=True, inferSchema=True)

from pyspark.sql.functions import to_date, year, month, current_date, months_between, round, col

sdf_dates = sdf_original.withColumn("OrderDate", to_date(col("OrderDate"), "yyyy-MM-dd")) \
                        .withColumn("CustomerSince", to_date(col("CustomerSince"), "yyyy-MM-dd"))

sdf_dates = sdf_dates.withColumn("OrderYear", year(col("OrderDate"))) \
                     .withColumn("OrderMonth", month(col("OrderDate")))

sdf_dates = sdf_dates.withColumn(
    "LoyaltyYears",
    round(months_between(current_date(), col("CustomerSince")) / 12, 1)
)

In [26]:
sdf_dates.select("CustomerName", "OrderDate", "OrderYear", "OrderMonth", "CustomerSince", "LoyaltyYears").show(10)

+------------------+----------+---------+----------+-------------+------------+
|      CustomerName| OrderDate|OrderYear|OrderMonth|CustomerSince|LoyaltyYears|
+------------------+----------+---------+----------+-------------+------------+
|     Donald Walker|2024-12-26|     2024|        12|   2020-10-15|         4.6|
|      Brandon Hall|2024-09-12|     2024|         9|   2022-03-15|         3.2|
|      Donald Booth|2025-01-12|     2025|         1|   2021-08-07|         3.8|
|    Phillip Garcia|2024-03-24|     2024|         3|   2020-08-08|         4.8|
|      Valerie Gray|2024-08-04|     2024|         8|   2022-11-15|         2.6|
|       Amber Perez|2024-01-13|     2024|         1|   2022-01-13|         3.4|
|        Roy Martin|2024-03-04|     2024|         3|   2023-04-29|         2.1|
|    Carolyn Daniel|2023-10-07|     2023|        10|   2021-05-09|         4.1|
|       Patty Perez|2023-06-27|     2023|         6|   2021-04-25|         4.1|
|Jonathan Wilkerson|2024-10-14|     2024

7. Joins and Unions

In [27]:
from pyspark.sql import Row

city_region_data = [
    Row(City="New York", Region="East"),
    Row(City="Los Angeles", Region="West"),
    Row(City="Chicago", Region="Midwest"),
    Row(City="Houston", Region="South"),
    Row(City="Phoenix", Region="West")
]

city_region_df = spark.createDataFrame(city_region_data)
city_region_df.show()


+-----------+-------+
|       City| Region|
+-----------+-------+
|   New York|   East|
|Los Angeles|   West|
|    Chicago|Midwest|
|    Houston|  South|
|    Phoenix|   West|
+-----------+-------+



In [28]:
inner_join_df = sdf_original.join(city_region_df, on="City", how="inner")
print("Inner Join Result:")
inner_join_df.show(5)

left_join_df = sdf_original.join(city_region_df, on="City", how="left")
print("Left Join Result:")
left_join_df.show(5)


Inner Join Result:
+----+-------+------------+---------------+------+---------+--------------+--------+-----------+-------------+------+
|City|OrderID|CustomerName|ProductCategory|Amount|OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|Region|
+----+-------+------------+---------------+------+---------+--------------+--------+-----------+-------------+------+
+----+-------+------------+---------------+------+---------+--------------+--------+-----------+-------------+------+

Left Join Result:
+----------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+------+
|            City|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|Region|
+----------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+------+
| Port Jesseville|   4150|   Amber Perez|          Books|352.37|2024-01-13|     Cancelled| 

In [29]:
from pyspark.sql.functions import year

sdf_dates = sdf_original.withColumn("OrderDate", to_date(col("OrderDate"), "yyyy-MM-dd"))

orders_2023 = sdf_dates.filter(year(col("OrderDate")) == 2023)
orders_2024 = sdf_dates.filter(year(col("OrderDate")) == 2024)

union_df = orders_2023.union(orders_2024)

print("Union of 2023 and 2024 orders:")
union_df.show(5)


Union of 2023 and 2024 orders:
+-------+--------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|   2169|Carolyn Daniel|    Electronics| 14.09|2023-10-07|     Delivered|    0.25|         Grayside|Credit Card|   2021-05-09|
|   6313|   Patty Perez|      Groceries| 79.83|2023-06-27|     Cancelled|    0.12|      Richardland|Credit Card|   2021-04-25|
|   2040| Kyle Mcdonald|           Toys|327.52|2023-12-15|      Returned|    0.06|Lake Jenniferside|     Wallet|   2021-07-21|
|   6038| David Bradley|        Fashion|348.51|2023-08-03|      Returned|    0.23|    Lake Toddland|        UPI|   2022-09-07|
|   3060|   John Pierce|           Toys|362.09|2023-12-25|      Returned|    0.0

8. Complex JSON Simulation (Advanced)


In [30]:
from pyspark.sql.functions import to_json, struct

json_df = sdf_original.withColumn(
    "json_str",
    to_json(struct([sdf_original[col] for col in sdf_original.columns]))
)

json_df.select("OrderID", "json_str").show(3, truncate=False)


+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|OrderID|json_str                                                                                                                                                                                                                                     |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|2824   |{"OrderID":2824,"CustomerName":"Donald Walker","ProductCategory":"Books","Amount":783.04,"OrderDate":"2024-12-26","DeliveryStatus":"Returned","Discount":0.15,"City":"Lake Joyside","PaymentMode":"Credit Card","CustomerSince":"2020-10-15"}|
|7912   

In [31]:
json_strings = json_df.select("json_str")

json_loaded_df = spark.read.json(json_strings.rdd.map(lambda r: r[0]))

json_loaded_df.printSchema()
json_loaded_df.show(3, truncate=False)


root
 |-- Amount: double (nullable = true)
 |-- City: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- CustomerSince: string (nullable = true)
 |-- DeliveryStatus: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- OrderDate: string (nullable = true)
 |-- OrderID: long (nullable = true)
 |-- PaymentMode: string (nullable = true)
 |-- ProductCategory: string (nullable = true)

+------+-------------+-------------+-------------+--------------+--------+----------+-------+-----------+---------------+
|Amount|City         |CustomerName |CustomerSince|DeliveryStatus|Discount|OrderDate |OrderID|PaymentMode|ProductCategory|
+------+-------------+-------------+-------------+--------------+--------+----------+-------+-----------+---------------+
|783.04|Lake Joyside |Donald Walker|2020-10-15   |Returned      |0.15    |2024-12-26|2824   |Credit Card|Books          |
|905.0 |New Jamesside|Brandon Hall |2022-03-15   |Cancelled     |0.03    |2024-09-12|7912

In [32]:
from pyspark.sql.functions import get_json_object

json_df.select(
    "OrderID",
    get_json_object("json_str", "$.ProductCategory").alias("ProductCategory"),
    get_json_object("json_str", "$.DeliveryStatus").alias("DeliveryStatus")
).show(5)


+-------+---------------+--------------+
|OrderID|ProductCategory|DeliveryStatus|
+-------+---------------+--------------+
|   2824|          Books|      Returned|
|   7912|      Groceries|     Cancelled|
|   4611|        Fashion|      Returned|
|   3547|        Fashion|      Returned|
|   8527|           Toys|     Delivered|
+-------+---------------+--------------+
only showing top 5 rows



9. Applying Functions

In [35]:
import pandas as pd

pandas_df = pd.read_csv('/content/drive/MyDrive/Sales_Dataset__500_Records_.csv')

def tag_order(amount):
    if amount > 1000:
        return "Big"
    elif amount > 500:
        return "Medium"
    else:
        return "Small"

pandas_df['OrderTag'] = pandas_df['Amount'].apply(tag_order)

pandas_df[['OrderID', 'Amount', 'OrderTag']].head(10)


Unnamed: 0,OrderID,Amount,OrderTag
0,2824,783.04,Medium
1,7912,905.0,Medium
2,4611,657.96,Medium
3,3547,606.89,Medium
4,8527,77.87,Small
5,4150,352.37,Small
6,5554,148.33,Small
7,2169,14.09,Small
8,6313,79.83,Small
9,6155,882.68,Medium


In [36]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def tag_order(amount):
    if amount is None:
        return "Unknown"
    if amount > 1000:
        return "Big"
    elif amount > 500:
        return "Medium"
    else:
        return "Small"

tag_order_udf = udf(tag_order, StringType())

sdf_tagged = sdf_original.withColumn("OrderTag", tag_order_udf("Amount"))

sdf_tagged.select("OrderID", "Amount", "OrderTag").show(10)


+-------+------+--------+
|OrderID|Amount|OrderTag|
+-------+------+--------+
|   2824|783.04|  Medium|
|   7912| 905.0|  Medium|
|   4611|657.96|  Medium|
|   3547|606.89|  Medium|
|   8527| 77.87|   Small|
|   4150|352.37|   Small|
|   5554|148.33|   Small|
|   2169| 14.09|   Small|
|   6313| 79.83|   Small|
|   6155|882.68|  Medium|
+-------+------+--------+
only showing top 10 rows

