In [0]:
from pyspark.sql import SparkSession

# Build the SparkSession used by every cell below; getOrCreate() hands back
# an already-running session when one exists instead of starting a new one.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)


# Sample rows of (id, timestamp string). Both fields are plain strings here;
# the timestamp text is parsed into a real timestamp column further down.
dates = [
    ("1", "2019-07-01 12:01:19.111"),
    ("2", "2019-06-24 12:01:19.222"),
    ("3", "2019-11-16 16:44:55.406"),
    ("4", "2019-11-16 16:50:59.406"),
]

df = spark.createDataFrame(
    schema=["id", "from_timestamp"],
    data=dates,
)

from pyspark.sql.functions import *
# Approach 1: cast both timestamps to long and subtract to get the elapsed
# seconds between each row's from_timestamp and the query's current time.
df2 = (
    df.withColumn('from_timestamp', to_timestamp(col('from_timestamp')))
      .withColumn('end_timestamp', current_timestamp())
      .withColumn(
          'DiffInSeconds',
          col("end_timestamp").cast("long") - col('from_timestamp').cast("long"),
      )
)
df2.show(truncate=False)

# Approach 2: the same elapsed-seconds calculation, but going through
# unix_timestamp() instead of casting the timestamp columns to long.
(
    df.withColumn('from_timestamp', to_timestamp(col('from_timestamp')))
      .withColumn('end_timestamp', current_timestamp())
      .withColumn(
          'DiffInSeconds',
          unix_timestamp("end_timestamp") - unix_timestamp('from_timestamp'),
      )
      .show(truncate=False)
)

# Derive coarser units from df2's DiffInSeconds column, one table per unit.
# NOTE: `round` here is the Spark column function brought in by the wildcard
# import from pyspark.sql.functions, not Python's builtin round().
for unit_column, seconds_per_unit in (('DiffInMinutes', 60), ('DiffInHours', 3600)):
    df2.withColumn(
        unit_column,
        round(col('DiffInSeconds') / seconds_per_unit),
    ).show(truncate=False)
  
# Difference between two values when the input holds only a time of day
# (no date part). The strings are parsed with an explicit "HH:mm:ss.SSS"
# pattern because they are not in the default timestamp format.
data = [("12:01:19.000", "13:01:19.000"),
        ("12:01:19.000", "12:02:19.000"),
        ("16:44:55.406", "17:44:55.406"),
        ("16:50:59.406", "16:44:59.406")]
df3 = spark.createDataFrame(data=data, schema=["from_timestamp", "to_timestamp"])

# Elapsed time is computed as to_timestamp - from_timestamp, matching the
# end-minus-start convention used for the other DiffInSeconds columns in this
# notebook. The result is positive when to_timestamp is the later value; the
# last sample row has to_timestamp earlier than from_timestamp, so it yields
# a negative difference.
(
    df3.withColumn("from_timestamp", to_timestamp(col("from_timestamp"), "HH:mm:ss.SSS"))
       .withColumn("to_timestamp", to_timestamp(col("to_timestamp"), "HH:mm:ss.SSS"))
       .withColumn(
           "DiffInSeconds",
           col("to_timestamp").cast("long") - col("from_timestamp").cast("long"),
       )
       .withColumn("DiffInMinutes", round(col("DiffInSeconds") / 60))
       .withColumn("DiffInHours", round(col("DiffInSeconds") / 3600))
       .show(truncate=False)
)
   
# Difference between a timestamp given in a non-default format
# ("MM-dd-yyyy HH:mm:ss.SSS") and the current time, in several units.


df3 = spark.createDataFrame(
        data=[("1","07-01-2019 12:01:19.406")], 
        schema=["id","input_timestamp"]
        )
# DiffInDays divides by the number of seconds in a day. The divisor must be
# parenthesised: `x/24*3600` evaluates left to right as `(x/24)*3600`, which
# is not a day count.
df3.withColumn("input_timestamp",to_timestamp(col("input_timestamp"),"MM-dd-yyyy HH:mm:ss.SSS")) \
    .withColumn("current_timestamp",current_timestamp()) \
    .withColumn("DiffInSeconds",current_timestamp().cast("long") - col("input_timestamp").cast("long")) \
    .withColumn("DiffInMinutes",round(col("DiffInSeconds")/60)) \
    .withColumn("DiffInHours",round(col("DiffInSeconds")/3600)) \
    .withColumn("DiffInDays",round(col("DiffInSeconds")/(24*3600))) \
    .show(truncate=False)
    
# Spark SQL equivalents: subtract unix_timestamp() values of two literal
# timestamps, then scale the result to minutes and hours.
seconds_query = "select unix_timestamp('2019-07-02 12:01:19') - unix_timestamp('2019-07-01 12:01:19') DiffInSeconds"
minutes_query = "select (unix_timestamp('2019-07-02 12:01:19') - unix_timestamp('2019-07-01 12:01:19'))/60 DiffInMinutes"
hours_query = "select (unix_timestamp('2019-07-02 12:01:19') - unix_timestamp('2019-07-01 12:01:19'))/3600 DiffInHours"

for sql_text in (seconds_query, minutes_query, hours_query):
    spark.sql(sql_text).show()

+---+-----------------------+-----------------------+-------------+
|id |from_timestamp         |end_timestamp          |DiffInSeconds|
+---+-----------------------+-----------------------+-------------+
|1  |2019-07-01 12:01:19.111|2023-06-21 16:54:32.462|125383993    |
|2  |2019-06-24 12:01:19.222|2023-06-21 16:54:32.462|125988793    |
|3  |2019-11-16 16:44:55.406|2023-06-21 16:54:32.462|113443777    |
|4  |2019-11-16 16:50:59.406|2023-06-21 16:54:32.462|113443413    |
+---+-----------------------+-----------------------+-------------+

+---+-----------------------+-----------------------+-------------+
|id |from_timestamp         |end_timestamp          |DiffInSeconds|
+---+-----------------------+-----------------------+-------------+
|1  |2019-07-01 12:01:19.111|2023-06-21 16:54:33.373|125383994    |
|2  |2019-06-24 12:01:19.222|2023-06-21 16:54:33.373|125988794    |
|3  |2019-11-16 16:44:55.406|2023-06-21 16:54:33.373|113443778    |
|4  |2019-11-16 16:50:59.406|2023-06-21 16:54:3

In [0]:
#The code imports necessary modules, including SparkSession from pyspark.sql and various functions related to timestamp manipulation.

#A SparkSession is created with the application name set to 'SparkByExamples.com'.

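#Input DataFrames are created for different timestamp scenarios: df (full date-time strings) and df3, which is defined twice (first with time-only strings, then with a single date-time string in MM-dd-yyyy format). df2 is derived from df.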

#Operations on df:

#The from_timestamp column is converted to a timestamp type using the to_timestamp() function.
#The end_timestamp column is set to the current timestamp using the current_timestamp() function.
#The DiffInSeconds column is computed by calculating the difference between end_timestamp and from_timestamp in seconds.
#The same difference is then recomputed on df using unix_timestamp(), and the differences in minutes and hours are derived from the DiffInSeconds column of df2.

#Operations on df3:

#The from_timestamp and to_timestamp columns are converted to timestamp types using the to_timestamp() function.
#The DiffInSeconds column is computed by calculating the difference between from_timestamp and to_timestamp in seconds.
#Similar calculations are performed to obtain differences in minutes and hours (and, for the second df3, in days against the current timestamp).
#The DataFrames df2, df3, and their calculated columns are displayed using the show() method.

#SQL operations:

#The code uses Spark SQL to perform similar calculations as above, using the unix_timestamp() function to convert timestamps to Unix timestamps and then calculating the differences in seconds, minutes, and hours.
#Overall, this code showcases different ways to work with timestamps in PySpark. It demonstrates how to convert string representations of timestamps to actual timestamp types, calculate differences between timestamps, and perform various operations using both DataFrame functions and Spark SQL.
