In [0]:
import os
import sys
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .appName("Date Function")\
        .getOrCreate()

In [0]:
data=[["1","2020-02-01"],["2","2019-03-01"],["3","2021-04-05"]]
df=spark.createDataFrame(data,["id","input"])
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- input: string (nullable = true)



#### Date Methods

##### current_date()

In [0]:
#current date returns the current date
from pyspark.sql.functions import current_date
df.select(current_date().alias("currentDate")).show()

+-----------+
|currentDate|
+-----------+
| 2024-03-30|
| 2024-03-30|
| 2024-03-30|
+-----------+



##### to_date()

In [0]:
# to_date converts the string type to date type
from pyspark.sql.functions import to_date,col
df1=df.select(col("input"),
              to_date(col("input"),"dd-MM-yyyy").alias("to_date"))
df1.show()

+----------+-------+
|     input|to_date|
+----------+-------+
|2020-02-01|   null|
|2019-03-01|   null|
|2021-04-05|   null|
+----------+-------+



In [0]:
df1.printSchema()

root
 |-- input: string (nullable = true)
 |-- to_date: date (nullable = true)



In [0]:
df.withColumn("new_input",to_date("input")).printSchema()

root
 |-- id: string (nullable = true)
 |-- input: string (nullable = true)
 |-- new_input: date (nullable = true)



##### dayofweek()

In [0]:
from pyspark.sql.functions import dayofweek
df2=df.select(col("input"),dayofweek("input").alias("Day_of_week"))
df2.printSchema()
df2.show()

root
 |-- input: string (nullable = true)
 |-- Day_of_week: integer (nullable = true)

+----------+-----------+
|     input|Day_of_week|
+----------+-----------+
|2020-02-01|          7|
|2019-03-01|          6|
|2021-04-05|          2|
+----------+-----------+



##### dayofmonth

In [0]:
from pyspark.sql.functions import dayofmonth
df.select(col("input"),dayofmonth("input").alias("Day_of_Month")).show()

+----------+------------+
|     input|Day_of_Month|
+----------+------------+
|2020-02-01|           1|
|2019-03-01|           1|
|2021-04-05|           5|
+----------+------------+



##### dayofyear

In [0]:
from pyspark.sql.functions import dayofyear
df.withColumn("Day_Of_Year",dayofyear("input")).show()

+---+----------+-----------+
| id|     input|Day_Of_Year|
+---+----------+-----------+
|  1|2020-02-01|         32|
|  2|2019-03-01|         60|
|  3|2021-04-05|         95|
+---+----------+-----------+



##### weekofyear

In [0]:
from pyspark.sql.functions import weekofyear
df.withColumn("Week_of_year",weekofyear("input")).show()

+---+----------+------------+
| id|     input|Week_of_year|
+---+----------+------------+
|  1|2020-02-01|           5|
|  2|2019-03-01|           9|
|  3|2021-04-05|          14|
+---+----------+------------+



##### yer,month,quarter

In [0]:
# quarter returns the quarter number(upto 3 months - 1st quarter, 4 to 6 months - 2nd quarter, ...)
from pyspark.sql.functions import year, month, quarter
df.withColumn("Year", year("input"))\
    .withColumn("Month", month("input"))\
    .withColumn("Quarter", quarter("input"))\
    .show()

+---+----------+----+-----+-------+
| id|     input|Year|Month|Quarter|
+---+----------+----+-----+-------+
|  1|2020-02-01|2020|    2|      1|
|  2|2019-03-01|2019|    3|      1|
|  3|2021-04-05|2021|    4|      2|
+---+----------+----+-----+-------+



##### last_day()

In [0]:
from pyspark.sql.functions import last_day
df.withColumn("lastDay",last_day("input")).show()

+---+----------+----------+
| id|     input|   lastDay|
+---+----------+----------+
|  1|2020-02-01|2020-02-29|
|  2|2019-03-01|2019-03-31|
|  3|2021-04-05|2021-04-30|
+---+----------+----------+



##### next_day()

In [0]:
from pyspark.sql.functions import next_day
df.withColumn("nextDay",next_day("input","Monday")).show()

+---+----------+----------+
| id|     input|   nextDay|
+---+----------+----------+
|  1|2020-02-01|2020-02-03|
|  2|2019-03-01|2019-03-04|
|  3|2021-04-05|2021-04-12|
+---+----------+----------+



##### add_months,date_add,date_sub

In [0]:
from pyspark.sql.functions import add_months, date_add, date_sub
df.withColumn("add_month", add_months("input", 3))\
    .withColumn("add_month_with-", add_months("input", -3))\
    .withColumn("date_add", date_add("input", 4)) \
    .withColumn("date_sub", date_sub("input", 2)) \
    .show()

+---+----------+----------+---------------+----------+----------+
| id|     input| add_month|add_month_with-|  date_add|  date_sub|
+---+----------+----------+---------------+----------+----------+
|  1|2020-02-01|2020-05-01|     2019-11-01|2020-02-05|2020-01-30|
|  2|2019-03-01|2019-06-01|     2018-12-01|2019-03-05|2019-02-27|
|  3|2021-04-05|2021-07-05|     2021-01-05|2021-04-09|2021-04-03|
+---+----------+----------+---------------+----------+----------+



##### datediff

In [0]:
from pyspark.sql.functions import datediff
df.select(col("input"),\
    datediff(current_date(),col("input")).alias("date_difference")).show()

+----------+---------------+
|     input|date_difference|
+----------+---------------+
|2020-02-01|           1519|
|2019-03-01|           1856|
|2021-04-05|           1090|
+----------+---------------+



##### months_between

In [0]:
from pyspark.sql.functions import months_between
df.withColumn("months_between", months_between(current_date(), col("input"), False)).show()
df.withColumn("months_between", months_between(current_date(), col("input"), True)).show()

+---+----------+------------------+
| id|     input|    months_between|
+---+----------+------------------+
|  1|2020-02-01|49.935483870967744|
|  2|2019-03-01|60.935483870967744|
|  3|2021-04-05|35.806451612903224|
+---+----------+------------------+

+---+----------+--------------+
| id|     input|months_between|
+---+----------+--------------+
|  1|2020-02-01|   49.93548387|
|  2|2019-03-01|   60.93548387|
|  3|2021-04-05|   35.80645161|
+---+----------+--------------+



##### trunc

In [0]:
from pyspark.sql.functions import trunc
df.withColumn("truncatedYear",trunc("input","Year"))\
    .withColumn("truncatedMonth",trunc("input","Month"))\
        .withColumn("trunctedDay",trunc("input","Week"))\
            .show()

+---+----------+-------------+--------------+-----------+
| id|     input|truncatedYear|truncatedMonth|trunctedDay|
+---+----------+-------------+--------------+-----------+
|  1|2020-02-01|   2020-01-01|    2020-02-01| 2020-01-27|
|  2|2019-03-01|   2019-01-01|    2019-03-01| 2019-02-25|
|  3|2021-04-05|   2021-01-01|    2021-04-01| 2021-04-05|
+---+----------+-------------+--------------+-----------+



##### date_trunc

In [0]:
from pyspark.sql.functions import date_trunc
df.withColumn("truncatedYear",date_trunc("Year","input"))\
    .withColumn("truncatedMonth",date_trunc("Month","input"))\
        .withColumn("truncatedDay",date_trunc("Day","input"))\
            .withColumn("truncatedQurter",date_trunc("Quarter","input"))\
            .show()

+---+----------+-------------------+-------------------+-------------------+-------------------+
| id|     input|      truncatedYear|     truncatedMonth|       truncatedDay|    truncatedQurter|
+---+----------+-------------------+-------------------+-------------------+-------------------+
|  1|2020-02-01|2020-01-01 00:00:00|2020-02-01 00:00:00|2020-02-01 00:00:00|2020-01-01 00:00:00|
|  2|2019-03-01|2019-01-01 00:00:00|2019-03-01 00:00:00|2019-03-01 00:00:00|2019-01-01 00:00:00|
|  3|2021-04-05|2021-01-01 00:00:00|2021-04-01 00:00:00|2021-04-05 00:00:00|2021-04-01 00:00:00|
+---+----------+-------------------+-------------------+-------------------+-------------------+



In [0]:
## Remaining Most possible values : ‘second’, ‘minute’, ‘hour’, ‘week’
from pyspark.sql.functions import date_trunc
df.withColumn("truncatedWeek", date_trunc("week", "input"))\
    .withColumn("truncatedHour", date_trunc("hour", "input"))\
    .withColumn("truncatedMinute", date_trunc("minute", "input"))\
    .withColumn("truncatedSecond", date_trunc("second", "input"))\
    .show()

+---+----------+-------------------+-------------------+-------------------+-------------------+
| id|     input|      truncatedWeek|      truncatedHour|    truncatedMinute|    truncatedSecond|
+---+----------+-------------------+-------------------+-------------------+-------------------+
|  1|2020-02-01|2020-01-27 00:00:00|2020-02-01 00:00:00|2020-02-01 00:00:00|2020-02-01 00:00:00|
|  2|2019-03-01|2019-02-25 00:00:00|2019-03-01 00:00:00|2019-03-01 00:00:00|2019-03-01 00:00:00|
|  3|2021-04-05|2021-04-05 00:00:00|2021-04-05 00:00:00|2021-04-05 00:00:00|2021-04-05 00:00:00|
+---+----------+-------------------+-------------------+-------------------+-------------------+



##### form_unixtime

In [0]:
from pyspark.sql.functions import from_unixtime

data = [(1612345678,),
        (1623456789,),
        (1634567890,)]
columns = ["unix_timestamp"]
df2 = spark.createDataFrame(data, columns)

# Using from_unixtime to convert Unix timestamps to timestamps
df2=df2.withColumn("timestamp",from_unixtime("unix_timestamp"))
df2.show()


+--------------+-------------------+
|unix_timestamp|          timestamp|
+--------------+-------------------+
|    1612345678|2021-02-03 09:47:58|
|    1623456789|2021-06-12 00:13:09|
|    1634567890|2021-10-18 14:38:10|
+--------------+-------------------+



##### unix_timestamp

In [0]:
from pyspark.sql.functions import unix_timestamp
df.withColumn('timestamp',unix_timestamp("input")).show()

+---+----------+---------+
| id|     input|timestamp|
+---+----------+---------+
|  1|2020-02-01|     null|
|  2|2019-03-01|     null|
|  3|2021-04-05|     null|
+---+----------+---------+



#### Time Stamp Methods

In [0]:
data=[["1","2020-02-01 11:01:19.06"],["2","2019-03-01 12:01:19.406"],["3","2021-03-01 12:01:19.406"]]
df3=spark.createDataFrame(data,["id","input"])

In [0]:
data=[["1","02-01-2020 11 01 19 06"],["2","03-01-2019 12 01 19 406"],["3","03-01-2021 12 01 19 406"]]
df2=spark.createDataFrame(data,["id","input"])

##### current_timestamp

In [0]:
from pyspark.sql.functions import current_timestamp
df2.withColumn("Current_Time", current_timestamp()).show(truncate = False)

+---+-----------------------+-----------------------+
|id |input                  |Current_Time           |
+---+-----------------------+-----------------------+
|1  |02-01-2020 11 01 19 06 |2024-03-30 15:28:58.862|
|2  |03-01-2019 12 01 19 406|2024-03-30 15:28:58.862|
|3  |03-01-2021 12 01 19 406|2024-03-30 15:28:58.862|
+---+-----------------------+-----------------------+



##### to_timestamp

In [0]:
from pyspark.sql.functions import to_timestamp
df2.withColumn("Coss_TS",to_timestamp("input", "dd-MM-yyyy HH mm ss SSS")).show(truncate = False)

+---+-----------------------+-----------------------+
|id |input                  |Coss_TS                |
+---+-----------------------+-----------------------+
|1  |02-01-2020 11 01 19 06 |2020-01-02 11:01:19.06 |
|2  |03-01-2019 12 01 19 406|2019-01-03 12:01:19.406|
|3  |03-01-2021 12 01 19 406|2021-01-03 12:01:19.406|
+---+-----------------------+-----------------------+



##### hour

In [0]:
from pyspark.sql.functions import hour
df3.withColumn("hour",hour("input")).show()

+---+--------------------+----+
| id|               input|hour|
+---+--------------------+----+
|  1|2020-02-01 11:01:...|  11|
|  2|2019-03-01 12:01:...|  12|
|  3|2021-03-01 12:01:...|  12|
+---+--------------------+----+



##### minutes

In [0]:
from pyspark.sql.functions import minute
df3.withColumn("min",minute("input")).show(truncate=False)

+---+-----------------------+---+
|id |input                  |min|
+---+-----------------------+---+
|1  |2020-02-01 11:01:19.06 |1  |
|2  |2019-03-01 12:01:19.406|1  |
|3  |2021-03-01 12:01:19.406|1  |
+---+-----------------------+---+



##### seconds

In [0]:
from pyspark.sql.functions import second
df3.withColumn("sec",second("input")).show(truncate=False)

+---+-----------------------+---+
|id |input                  |sec|
+---+-----------------------+---+
|1  |2020-02-01 11:01:19.06 |19 |
|2  |2019-03-01 12:01:19.406|19 |
|3  |2021-03-01 12:01:19.406|19 |
+---+-----------------------+---+



#### Date Format

In [0]:
data = [("2022-01-15 08:30:45",),
        ("2022-02-20 12:15:30",),
        ("2022-03-25 18:45:15",)]
columns = ["timestamp"]
df4 = spark.createDataFrame(data, columns)

# Convert the string timestamp to a timestamp type
df4 = df4.withColumn("timestamp", col("timestamp").cast("timestamp"))

In [0]:
from pyspark.sql.functions import date_format
df4.withColumn("new_format", date_format("timestamp", "dd-MM-yyyy")).show()

+-------------------+----------+
|          timestamp|new_format|
+-------------------+----------+
|2022-01-15 08:30:45|15-01-2022|
|2022-02-20 12:15:30|20-02-2022|
|2022-03-25 18:45:15|25-03-2022|
+-------------------+----------+



In [0]:
df4.withColumn("year", date_format("timestamp", "yyyy")).show()

+-------------------+----+
|          timestamp|year|
+-------------------+----+
|2022-01-15 08:30:45|2022|
|2022-02-20 12:15:30|2022|
|2022-03-25 18:45:15|2022|
+-------------------+----+



In [0]:
df4.withColumn("month",date_format("timestamp","MM")).show()

+-------------------+-----+
|          timestamp|month|
+-------------------+-----+
|2022-01-15 08:30:45|   01|
|2022-02-20 12:15:30|   02|
|2022-03-25 18:45:15|   03|
+-------------------+-----+



In [0]:
df4.withColumn("short_month",date_format("timestamp","MMM")).show()

+-------------------+-----------+
|          timestamp|short_month|
+-------------------+-----------+
|2022-01-15 08:30:45|        Jan|
|2022-02-20 12:15:30|        Feb|
|2022-03-25 18:45:15|        Mar|
+-------------------+-----------+



In [0]:
df4.withColumn("full_month",date_format("timestamp","MMMM")).show()

+-------------------+----------+
|          timestamp|full_month|
+-------------------+----------+
|2022-01-15 08:30:45|   January|
|2022-02-20 12:15:30|  February|
|2022-03-25 18:45:15|     March|
+-------------------+----------+



In [0]:
df4.withColumn("am_pm",date_format("timestamp",'a')).show()

+-------------------+-----+
|          timestamp|am_pm|
+-------------------+-----+
|2022-01-15 08:30:45|   AM|
|2022-02-20 12:15:30|   PM|
|2022-03-25 18:45:15|   PM|
+-------------------+-----+



#### Timestamp Format

In [0]:
df4 = df4.withColumn("timestamp", col("timestamp").cast("timestamp"))

# a in the format explains it if it is AM or PM by converting the 24h into 12h
df4 = df4.withColumn("am_pm", date_format("timestamp", "yyyy-MM-dd hh:mm:ss a"))
df4.show(truncate=False)

+-------------------+----------------------+
|timestamp          |am_pm                 |
+-------------------+----------------------+
|2022-01-15 08:30:45|2022-01-15 08:30:45 AM|
|2022-02-20 12:15:30|2022-02-20 12:15:30 PM|
|2022-03-25 18:45:15|2022-03-25 06:45:15 PM|
+-------------------+----------------------+



In [0]:
df4.withColumn("formatted_date",date_format("timestamp","dd-MM-yyyy HH:mm:ss")).show()

+-------------------+--------------------+-------------------+
|          timestamp|               am_pm|     formatted_date|
+-------------------+--------------------+-------------------+
|2022-01-15 08:30:45|2022-01-15 08:30:...|15-01-2022 08:30:45|
|2022-02-20 12:15:30|2022-02-20 12:15:...|20-02-2022 12:15:30|
|2022-03-25 18:45:15|2022-03-25 06:45:...|25-03-2022 18:45:15|
+-------------------+--------------------+-------------------+



In [0]:
df4.withColumn("abbrev_month",date_format("timestamp","dd/MMM/yyyy HH:mm:ss")).show()

+-------------------+--------------------+--------------------+
|          timestamp|               am_pm|        abbrev_month|
+-------------------+--------------------+--------------------+
|2022-01-15 08:30:45|2022-01-15 08:30:...|15/Jan/2022 08:30:45|
|2022-02-20 12:15:30|2022-02-20 12:15:...|20/Feb/2022 12:15:30|
|2022-03-25 18:45:15|2022-03-25 06:45:...|25/Mar/2022 18:45:15|
+-------------------+--------------------+--------------------+



In [0]:
df4.withColumn("month_year",date_format("timestamp","MMM-yyyy HH:mm:ss a")).show()

+-------------------+--------------------+--------------------+
|          timestamp|               am_pm|          month_year|
+-------------------+--------------------+--------------------+
|2022-01-15 08:30:45|2022-01-15 08:30:...|Jan-2022 08:30:45 AM|
|2022-02-20 12:15:30|2022-02-20 12:15:...|Feb-2022 12:15:30 PM|
|2022-03-25 18:45:15|2022-03-25 06:45:...|Mar-2022 18:45:15 PM|
+-------------------+--------------------+--------------------+



In [0]:
df4.withColumn("day_of_week",date_format("timestamp","EEEE")).show()

+-------------------+--------------------+-----------+
|          timestamp|               am_pm|day_of_week|
+-------------------+--------------------+-----------+
|2022-01-15 08:30:45|2022-01-15 08:30:...|   Saturday|
|2022-02-20 12:15:30|2022-02-20 12:15:...|     Sunday|
|2022-03-25 18:45:15|2022-03-25 06:45:...|     Friday|
+-------------------+--------------------+-----------+



In [0]:
df4.withColumn("day_of_week",date_format("timestamp","E")).show()

+-------------------+--------------------+-----------+
|          timestamp|               am_pm|day_of_week|
+-------------------+--------------------+-----------+
|2022-01-15 08:30:45|2022-01-15 08:30:...|        Sat|
|2022-02-20 12:15:30|2022-02-20 12:15:...|        Sun|
|2022-03-25 18:45:15|2022-03-25 06:45:...|        Fri|
+-------------------+--------------------+-----------+

