In [0]:
%run "../utils/mount_configuration"

In [0]:
# Load the processed ticker price data, stored in Delta format.
# `processed_folder_path` is not defined in this cell; presumably it is provided
# by the notebook pulled in via %run above -- verify if that include changes.
tickers_price_path = f"{processed_folder_path}/tickers_price"
tickets_price_df = (
    spark.read
    .format("delta")
    .load(tickers_price_path)
)

In [0]:
# Preview the freshly loaded price data before any columns are derived from it.
display(tickets_price_df)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Goal: for every week present in the price data, find the first date that has a
# row (Monday when available, otherwise the earliest later day of that week).

# 1. Adding 'day_of_week', 'year' and 'week_of_year' columns.
#    weekofyear() returns ISO-8601 week numbers (weeks start on Monday), so it has
#    to be paired with the ISO week-year rather than the calendar year. With the
#    calendar year, 2018-12-31 would get (year=2018, week=1) and land in the same
#    partition as 2018-01-01..07, silently dropping that week from the result.
#    The ISO week-year is the calendar year of the week's Thursday.
week_start = F.to_date(F.date_trunc("week", F.col("date")))  # Monday of the row's week
iso_week_year = F.year(F.date_add(week_start, 3))            # Thursday decides the year

tickets_price_df = (
    tickets_price_df
    .withColumn("day_of_week", F.date_format("date", "E"))
    .withColumn("year", iso_week_year)
    .withColumn("week_of_year", F.weekofyear("date"))
)

# 2. Creating 'priority' column (Monday -> 1, Tuesday -> 2, ..., Sunday -> 7).
#    Each when() yields NULL unless its day matches, so coalesce() picks the single
#    matching value; rows with an unrecognised/NULL day keep a NULL priority.
day_abbreviations = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
tickets_price_df = tickets_price_df.withColumn(
    "priority",
    F.coalesce(*[
        F.when(F.col("day_of_week") == day_abbr, day_priority)
        for day_priority, day_abbr in enumerate(day_abbreviations, start=1)
    ]),
)

# 3. Creating a window per (ISO year, ISO week); every partition now holds exactly
#    one Monday-to-Sunday week, ordered from its earliest weekday onwards.
window_spec = Window.partitionBy("year", "week_of_year").orderBy("priority", "date")

# 4. Assigning rank for each row within each week
tickets_price_df = tickets_price_df.withColumn("rank", F.row_number().over(window_spec))

# 5. Filtering to keep only the earliest day in each week (rank = 1)
result_df = tickets_price_df.filter(F.col("rank") == 1)

# 6. Selecting only 'year', 'week_of_year', 'date' columns.
#    Note: 'year' is the ISO week-year, so for a week that straddles New Year it
#    can differ from the calendar year of 'date' (e.g. 2019-12-30 -> year 2020, week 1).
final_result = result_df.select("year", "week_of_year", "date")

# Displaying the result
display(final_result)

In [0]:
# Persist the weekly first-date lookup as a JSON-format table, replacing whatever
# the table held before (mode "overwrite").
ticker_dates_writer = (
    final_result.write
    .mode("overwrite")
    .format("json")
)
ticker_dates_writer.saveAsTable("engineering_presentation.ticker_details_dates")

In [0]:
%sql
-- Read back the table written by the previous cell to check its contents.
SELECT * FROM engineering_presentation.ticker_details_dates;