In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import row_number, lead
from pyspark.sql.window import Window

## SPARK SESSION

In [2]:
# Build (or reuse) the notebook-wide Spark session, with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("presentation")
    .enableHiveSupport()
    .getOrCreate()
)

## 1) Load the data, register a temporary view, and inspect the schema

In [None]:
# Load the DataFrame from the CSV file; the first row supplies the column names.
# No schema inference is requested, so the columns are read as strings.
df = spark.read.csv("train.csv", header=True)

# Register the DataFrame as a temporary view so it can be queried with SQL
df.createOrReplaceTempView("train_view")

# Run an SQL query on the temporary view and display the first rows
result = spark.sql("SELECT * FROM train_view")
result.show()

# Inspect the table schema -- three alternative ways, each one displayed.
# a) List the column names. The statement is SHOW COLUMNS (plural);
#    "SHOW COLUMN" is rejected by the SQL parser.
spark.sql("SHOW COLUMNS FROM train_view").show()

# b) Select zero rows and print the schema of the (empty) result.
spark.sql("SELECT * FROM train_view LIMIT 0").printSchema()

# c) DESCRIBE lists each column with its data type.
result = spark.sql("DESCRIBE train_view")
result.show()

## 2) Rename a column using a SQL query and DataFrame dot notation

In [None]:
# Rename `id` to `train_id` with a SQL alias.
# `train_view` was registered right after the CSV was loaded and is deliberately
# not re-registered here: doing so from `df` after the rebinding below would
# replace the view's `id` column with `train_id`, and re-running this cell
# would then fail on `SELECT id`.
# NOTE: `df` is rebound to the renamed frame; the row_number() cell further
# down reads `train_id` from `df`.
df = spark.sql("SELECT id AS train_id, station, time FROM train_view")
df.show()

In [None]:
# Rename `id` to `train_id` with DataFrame dot notation.
# Start from the `train_view` temp view, which still exposes the original `id`
# column. `df` was rebound by the SQL rename cell and no longer has `id`, so
# calling withColumnRenamed("id", ...) on it would silently do nothing.
df2 = spark.table("train_view").withColumnRenamed("id", "train_id")
df2.show()

In [None]:
# Rename `id` to `train_id` with select() + alias().
# Read from the `train_view` temp view, which still has the `id` column;
# `df` was rebound by the SQL rename cell to a frame without `id`, so
# selecting col("id") from it cannot be resolved.
df3 = spark.table("train_view").select(
    col("id").alias("train_id"),
    "station",
    "time",
)
df3.show()

## 3) Adding an ID using Window Functions


In [None]:
# Add a row number per train with ROW_NUMBER().
# `train_view` exposes the key column as `id`, not `train_id`, so the rename is
# done in a CTE first. The generated `id` restarts at 1 for every `train_id`
# and follows the ordering of `time` within that group.
sql_query = """
WITH renamed AS (
    SELECT id AS train_id, station, time
    FROM train_view
)
SELECT train_id, station, time,
       ROW_NUMBER() OVER (PARTITION BY train_id ORDER BY time) AS id
FROM renamed
"""

# Execute the SQL query using Spark SQL
df4 = spark.sql(sql_query)
df4.show()

In [None]:
# Window over the rows of each `train_id` group, ordered by `time`.
# `df` carries `train_id` at this point because the SQL rename cell rebound it.
window_spec = (
    Window
    .partitionBy("train_id")
    .orderBy("time")
)

# DataFrame-API form of ROW_NUMBER() OVER (...): the new `id` column numbers
# the rows inside each window partition, starting at 1.
df5 = df.withColumn(
    "id",
    row_number().over(window_spec),
)

df5.show()

## 4) Adding a new column with the time until the next stop

In [None]:
# --- Query texts -----------------------------------------------------------

# Step 1: pair every row with the `time` of the following row that shares its
# `id` (rows ordered by `time`); the last row of each group gets NULL.
sql_query = """
SELECT id, station, time,
       LEAD(time, 1) OVER (PARTITION BY id ORDER BY time) AS time_next
FROM train_view
"""

# Step 2: convert both 'HH:mm' values with UNIX_TIMESTAMP and subtract them.
sql_query2 = """
SELECT id, station, time, time_next,
       UNIX_TIMESTAMP(time_next, 'HH:mm') - UNIX_TIMESTAMP(time, 'HH:mm') AS time_difference
FROM train_view_with_time_next
"""

# --- Execution -------------------------------------------------------------

# The intermediate result is exposed as a temp view so step 2 can query it.
df_with_time_next = spark.sql(sql_query)
df_with_time_next.createOrReplaceTempView("train_view_with_time_next")

df_with_time_difference = spark.sql(sql_query2)
df_with_time_difference.show()

In [None]:
# DataFrame-API version of the time-until-next-stop calculation.
# Start from the `train_view` temp view, which still exposes the original `id`
# column; `df` was rebound earlier to a frame keyed by `train_id`, so
# partitioning it by "id" cannot be resolved.
stops = spark.table("train_view")

# Window over the rows sharing the same `id`, ordered by `time`
window_spec = Window.partitionBy("id").orderBy("time")

# "time_next" is the `time` of the following row in the window
# (null on the last row of each partition)
df_with_time_next = stops.withColumn("time_next", lead("time", 1).over(window_spec))

# Difference between the next and the current stop: both "HH:mm" strings are
# converted with unix_timestamp (seconds) and subtracted, then cast to int.
df_with_time_difference = df_with_time_next.withColumn(
    "time_difference",
    (
        F.unix_timestamp("time_next", "HH:mm")
        - F.unix_timestamp("time", "HH:mm")
    ).cast("int"),
)

df_with_time_difference.show()