In [0]:
# Source data: /FileStore/tables/train.csv

In [0]:
from pyspark.sql import SparkSession

# Split the sales CSV by order month: for each calendar month (1-12), show a
# preview of the key order/shipping columns and write those rows out as CSV.

# Initialize (or reuse) the Spark session
spark = SparkSession.builder.appName("Sales Dataset").getOrCreate()

# Load the CSV data; the first row is the header and column types are inferred
data = "/FileStore/tables/train.csv"
df = spark.read.format("csv").option("header", "true").option("inferschema", "true").load(data)

# Register DataFrame as a SQL temporary view
df.createOrReplaceTempView("sales_data")

# Columns to keep in every per-month result. This list is the single source of
# truth for the SQL projections below, so the queries cannot drift from it.
important_columns = ["Order ID", "Order Date", "Ship Date", "Ship Mode"]

# Column names contain spaces, so each one is backtick-quoted for Spark SQL.
select_list = ", ".join(f"`{column}`" for column in important_columns)

# Temporary view with the important columns plus a derived 'Order Month' column
spark.sql(f"""
    CREATE OR REPLACE TEMP VIEW sales_data_with_month AS
    SELECT {select_list}, MONTH(`Order Date`) AS `Order Month`
    FROM sales_data
""")

# Run one query per calendar month, keeping only the important columns
for month_num in range(1, 13):
    # month_num comes from range(), not from user input, so interpolating it
    # into the SQL text is safe here.
    sql_query = f"""
        SELECT {select_list}
        FROM sales_data_with_month
        WHERE `Order Month` = {month_num}
    """

    # Execute the query and display a preview (show() prints the first 20 rows)
    print(f"Displaying data for Month: {month_num}")
    month_df = spark.sql(sql_query)
    month_df.show(truncate=False)

    # Save this month's rows as CSV. Spark's default save mode raises an error
    # when the target path already exists, which made this cell fail on every
    # re-run; "overwrite" keeps the cell idempotent.
    # NOTE(review): the output name hard-codes "2024" although rows from all
    # years are grouped by month number - confirm the intended naming.
    month_df.write.mode("overwrite").csv(f'sales_data_2024_month_{month_num}_important.csv', header=True)


Displaying data for Month: 1
+--------------+----------+----------+--------------+
|Order ID      |Order Date|Ship Date |Ship Mode     |
+--------------+----------+----------+--------------+
|CA-2017-121755|2017-01-16|2017-01-20|Second Class  |
|CA-2017-121755|2017-01-16|2017-01-20|Second Class  |
|CA-2016-149587|2016-01-31|2016-02-05|Second Class  |
|CA-2016-149587|2016-01-31|2016-02-05|Second Class  |
|CA-2016-149587|2016-01-31|2016-02-05|Second Class  |
|CA-2016-146262|2016-01-02|2016-01-09|Standard Class|
|CA-2016-146262|2016-01-02|2016-01-09|Standard Class|
|CA-2016-146262|2016-01-02|2016-01-09|Standard Class|
|CA-2016-146262|2016-01-02|2016-01-09|Standard Class|
|CA-2016-146262|2016-01-02|2016-01-09|Standard Class|
|CA-2017-111010|2017-01-22|2017-01-28|Standard Class|
|CA-2018-157252|2018-01-20|2018-01-23|Second Class  |
|US-2016-101399|2016-01-17|2016-01-24|Standard Class|
|CA-2018-127432|2018-01-22|2018-01-27|Standard Class|
|CA-2018-127432|2018-01-22|2018-01-27|Standard Class|