In [0]:
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder \
          .appName('SparkByExamples.com') \
          .getOrCreate()
data = [("1","2019-07-01"),("2","2019-06-24"),("3","2019-08-24")]

df=spark.createDataFrame(data=data,schema=["id","date"])

from pyspark.sql.functions import *

df.select(
      col("date"),
      current_date().alias("current_date"),
      datediff(current_date(),col("date")).alias("datediff")
    ).show()

df.withColumn("datesDiff", datediff(current_date(),col("date"))) \
  .withColumn("montsDiff", months_between(current_date(),col("date"))) \
  .withColumn("montsDiff_round",round(months_between(current_date(),col("date")),2)) \
  .withColumn("yearsDiff",months_between(current_date(),col("date"))/lit(12)) \
  .withColumn("yearsDiff_round",round(months_between(current_date(),col("date"))/lit(12),2)) \
  .show()

data2 = [("1","07-01-2019"),("2","06-24-2019"),("3","08-24-2019")]  
df2=spark.createDataFrame(data=data2,schema=["id","date"])
df2.select(
    to_date(col("date"),"MM-dd-yyyy").alias("date"),
    current_date().alias("endDate")
    )

#SQL

spark.sql("select round(months_between('2019-07-01',current_date())/12,2) as years_diff").show()

+----------+------------+--------+
|      date|current_date|datediff|
+----------+------------+--------+
|2019-07-01|  2023-06-19|    1449|
|2019-06-24|  2023-06-19|    1456|
|2019-08-24|  2023-06-19|    1395|
+----------+------------+--------+

+---+----------+---------+-----------+---------------+------------------+---------------+
| id|      date|datesDiff|  montsDiff|montsDiff_round|         yearsDiff|yearsDiff_round|
+---+----------+---------+-----------+---------------+------------------+---------------+
|  1|2019-07-01|     1449|47.58064516|          47.58|3.9650537633333336|           3.97|
|  2|2019-06-24|     1456|47.83870968|          47.84|3.9865591400000002|           3.99|
|  3|2019-08-24|     1395|45.83870968|          45.84|3.8198924733333333|           3.82|
+---+----------+---------+-----------+---------------+------------------+---------------+

+----------+
|years_diff|
+----------+
|     -3.97|
+----------+



In [0]:
#It imports the necessary modules: SparkSession from pyspark.sql and functions (col, current_date, datediff, months_between, round, lit, to_date) from pyspark.sql.functions.
#It creates a SparkSession object named spark using SparkSession.builder.appName('SparkByExamples.com').getOrCreate().
#It creates a DataFrame df with two columns: "id" and "date", and three rows of data.
#It uses various date-related functions from pyspark.sql.functions to perform operations on the "date" column of the DataFrame. The functions used include current_date, datediff, months_between, round, lit, to_date.
#It shows the result of each select operation, displaying the computed values.
#The code demonstrates how to calculate the difference between dates, both in terms of days (datediff) and months (months_between). It also showcases how to round the computed values and perform additional calculations, such as dividing the months difference by 12 to get the years difference.

#Additionally, the code includes an example with a different date format ("MM-dd-yyyy") where to_date is used to convert the string to a date format.

#Finally, the code shows an SQL query that calculates the years difference between a specific date and the current date using the months_between function and the round function for rounding.