## Dates and Timestamps

In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start a new
# session registered under this notebook's application name.
spark = (
    SparkSession.builder
    .appName('date and time')
    .getOrCreate()
)

In [2]:
# In-memory sample rows. Both columns start out as plain strings:
# an ISO-style date and a "yyyy-MM-dd HH:mm:ss" timestamp.
columns = ["date", "timestamp"]
data = [
    ("2025-01-01", "2025-01-01 15:30:00"),
    ("2025-01-02", "2025-01-02 10:00:00"),
]

df = spark.createDataFrame(data, columns)

# Display the rows first, then the schema Spark inferred for them.
df.show()
df.printSchema()

+----------+-------------------+
|      date|          timestamp|
+----------+-------------------+
|2025-01-01|2025-01-01 15:30:00|
|2025-01-02|2025-01-02 10:00:00|
+----------+-------------------+

root
 |-- date: string (nullable = true)
 |-- timestamp: string (nullable = true)



Convert the string `date` column to a date type and the string `timestamp` column to a timestamp type

In [3]:
from pyspark.sql.functions import to_date, to_timestamp

# Parse both string columns in place. No format pattern is passed, so the
# conversion relies on Spark's default date/timestamp parsing.
df = (
    df
    .withColumn("date", to_date("date"))
    .withColumn("timestamp", to_timestamp("timestamp"))
)

# Confirm the columns are no longer strings.
df.printSchema()

root
 |-- date: date (nullable = true)
 |-- timestamp: timestamp (nullable = true)



Extract Parts of Dates and Timestamps

In [4]:
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second

# Calendar fields come from the date column, clock fields from the
# timestamp column; each extracted field becomes its own output column.
df.select(
    year(df["date"]).alias("Year"),
    month(df["date"]).alias("Month"),
    dayofmonth(df["date"]).alias("Day"),
    hour(df["timestamp"]).alias("Hour"),
    minute(df["timestamp"]).alias("Minute"),
    second(df["timestamp"]).alias("Second"),
).show()

+----+-----+---+----+------+------+
|Year|Month|Day|Hour|Minute|Second|
+----+-----+---+----+------+------+
|2025|    1|  1|  15|    30|     0|
|2025|    1|  2|  10|     0|     0|
+----+-----+---+----+------+------+



Filtering and Comparing Dates

In [5]:
from pyspark.sql.functions import lit

# Keep only the rows dated strictly after 2025-01-01.
# The literal is cast to DATE explicitly so this is a date-to-date
# comparison. A bare string literal would rely on Spark's implicit
# string/date coercion, which is version- and configuration-dependent.
df.filter(df["date"] > lit("2025-01-01").cast("date")).show()

+----------+-------------------+
|      date|          timestamp|
+----------+-------------------+
|2025-01-02|2025-01-02 10:00:00|
+----------+-------------------+



Formatting Dates and Timestamps

In [6]:
from pyspark.sql.functions import date_format

# Render the date with slash separators and keep only the time-of-day
# portion of the timestamp; both results are strings.
df.select(
    date_format(df["date"], "yyyy/MM/dd").alias("FormattedDate"),
    date_format(df["timestamp"], "HH:mm:ss").alias("FormattedTime"),
).show()

+-------------+-------------+
|FormattedDate|FormattedTime|
+-------------+-------------+
|   2025/01/01|     15:30:00|
|   2025/01/02|     10:00:00|
+-------------+-------------+



Adding or subtracting days

In [7]:
from pyspark.sql.functions import date_add, date_sub

# Show each date next to the date ten days later and ten days earlier.
df.select(
    df["date"],
    date_add(df["date"], 10).alias("DatePlus10Days"),
    date_sub(df["date"], 10).alias("DateMinus10Days"),
).show()

+----------+--------------+---------------+
|      date|DatePlus10Days|DateMinus10Days|
+----------+--------------+---------------+
|2025-01-01|    2025-01-11|     2024-12-22|
|2025-01-02|    2025-01-12|     2024-12-23|
+----------+--------------+---------------+



Handling Time Intervals

In [8]:
from pyspark.sql.functions import current_date, datediff

# Number of days from each row's date up to today. current_date() is
# evaluated when the cell runs, so DaysSince differs from one day to the next.
df.withColumn(
    "DaysSince",
    datediff(current_date(), df["date"]),
).show()

+----------+-------------------+---------+
|      date|          timestamp|DaysSince|
+----------+-------------------+---------+
|2025-01-01|2025-01-01 15:30:00|      101|
|2025-01-02|2025-01-02 10:00:00|      100|
+----------+-------------------+---------+



Using SQL syntax for dates

In [9]:
# Register the DataFrame as a temporary view named "dates" so it can be
# queried with SQL, then derive the year using the SQL YEAR() function.
df.createOrReplaceTempView("dates")
spark.sql(
    "SELECT date, YEAR(date) as Year FROM dates"
).show()

+----------+----+
|      date|Year|
+----------+----+
|2025-01-01|2025|
|2025-01-02|2025|
+----------+----+



### Handle Missing Date Data

Create a DataFrame with missing dates

In [10]:
# Sample rows in which two people have no date (None is shown as NULL).
# Note: this rebinds `data`, `columns` and `df` from the earlier cells.
columns = ["Name", "Date"]
data = [
    ("John", None),
    ("Sarah", "2025-01-01"),
    ("Mike", None),
]
df = spark.createDataFrame(data, columns)

# Display the starting rows and schema before any conversion.
print("Original DataFrame:")
df.show()
df.printSchema()

Original DataFrame:
+-----+----------+
| Name|      Date|
+-----+----------+
| John|      NULL|
|Sarah|2025-01-01|
| Mike|      NULL|
+-----+----------+

root
 |-- Name: string (nullable = true)
 |-- Date: string (nullable = true)



Convert string to Date

In [11]:
from pyspark.sql.functions import to_date

# Parse the string Date column into a date type, replacing the column
# in place. Rows whose Date is NULL remain NULL after the conversion.
df = df.withColumn("Date", to_date("Date"))

# Display the rows and the updated schema.
print("DataFrame After Casting Date Column:")
df.show()
df.printSchema()

DataFrame After Casting Date Column:
+-----+----------+
| Name|      Date|
+-----+----------+
| John|      NULL|
|Sarah|2025-01-01|
| Mike|      NULL|
+-----+----------+

root
 |-- Name: string (nullable = true)
 |-- Date: date (nullable = true)



Fill NULL with today's date

In [12]:
from pyspark.sql.functions import coalesce, current_date, to_date, when

# Fill missing dates with today's date.
# coalesce() returns its first non-NULL argument, so rows that already
# have a Date keep it and only NULL rows fall back to current_date().
# This is the idiomatic equivalent of when(isNull, ...).otherwise(...).
df_filled = df.withColumn("Date", coalesce(df["Date"], current_date()))

# Show the updated DataFrame
df_filled.show()

+-----+----------+
| Name|      Date|
+-----+----------+
| John|2025-04-12|
|Sarah|2025-01-01|
| Mike|2025-04-12|
+-----+----------+

