In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round
from pyspark.sql.functions import to_timestamp, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

schema = StructType([
            StructField("input_timestamp", StringType(), True)])

dates = ['2019-07-01 12:01:19.111',
    '2019-06-24 12:01:19.222',
    '2019-11-16 16:44:55.406',
    '2019-11-16 16:50:59.406']

df = spark.createDataFrame(list( zip(dates)), schema=schema)

df.withColumn('input_timestamp',to_timestamp(col('input_timestamp')))\
  .withColumn('current_timestamp', current_timestamp().alias('current_timestamp'))\
  .withColumn('DiffInSeconds',current_timestamp().cast(LongType()) - col('input_timestamp').cast(LongType()))\
  .withColumn('DiffInMinutes',round(col('DiffInSeconds')/60))\
  .withColumn('DiffInHours',round(col('DiffInSeconds')/3600))\
  .withColumn('DiffInDays',round(col('DiffInSeconds')/24*3600))\
  .show()

+--------------------+--------------------+-------------+-------------+-----------+-------------+
|     input_timestamp|   current_timestamp|DiffInSeconds|DiffInMinutes|DiffInHours|   DiffInDays|
+--------------------+--------------------+-------------+-------------+-----------+-------------+
|2019-07-01 12:01:...|2023-06-21 16:17:...|    125381800|    2089697.0|    34828.0|  1.880727E10|
|2019-06-24 12:01:...|2023-06-21 16:17:...|    125986600|    2099777.0|    34996.0|  1.889799E10|
|2019-11-16 16:44:...|2023-06-21 16:17:...|    113441584|    1890693.0|    31512.0|1.70162376E10|
|2019-11-16 16:50:...|2023-06-21 16:17:...|    113441220|    1890687.0|    31511.0| 1.7016183E10|
+--------------------+--------------------+-------------+-------------+-----------+-------------+



In [0]:
#The code begins by importing the necessary libraries and creating a SparkSession using SparkSession.builder.appName('SparkByExamples.com').getOrCreate().

#A schema is defined using StructType and StructField, specifying the name and data type of the column.

#The dates list contains timestamp values as strings.

#The DataFrame df is created using spark.createDataFrame(list(zip(dates)), schema=schema). The zip() function combines the elements from the dates list, and the resulting list of tuples is passed to createDataFrame() along with the specified schema.

#The DataFrame operations are performed in a chain:

#a. The withColumn() function is used to convert the "input_timestamp" column from string to timestamp format using to_timestamp(col('input_timestamp')).

#b. The withColumn() function is used to add a new column named "current_timestamp" containing the current timestamp value using current_timestamp().alias('current_timestamp').

#c. The withColumn() function is used to calculate the time difference in seconds between the "current_timestamp" and "input_timestamp" columns. It subtracts the "input_timestamp" column from the "current_timestamp" column and casts the result to LongType().

#d. The withColumn() function is used to calculate the time difference in minutes by dividing the "DiffInSeconds" column by 60 and rounding the result using round(col('DiffInSeconds')/60).

#e. The withColumn() function is used to calculate the time difference in hours by dividing the "DiffInSeconds" column by 3600 (60 seconds * 60 minutes) and rounding the result.

#f. The withColumn() function is used to calculate the time difference in days by dividing the "DiffInSeconds" column by (24 * 3600) (24 hours * 60 minutes * 60 seconds) and rounding the result.

#g. Finally, the show() function is called on the DataFrame to display the resulting columns and their values.

#Overall, the code showcases how to manipulate and calculate time differences using PySpark DataFrame operations. It converts string timestamps to actual timestamp data types, calculates time differences in seconds, minutes, hours, and days, and displays the results in a tabular format.
