In [0]:
# Echo the active session object so its type is visible in the cell output.
# NOTE(review): `spark` is not created anywhere in the visible cells — presumably
# it is injected by the notebook runtime; confirm before running elsewhere.
spark

<pyspark.sql.connect.session.SparkSession at 0xff68ff08b050>

In [0]:
# Create the working database only if it is missing (safe to re-run), then
# switch the session to it so unqualified table names resolve inside hotel_db.
spark.sql("CREATE DATABASE IF NOT EXISTS hotel_db")
spark.sql("USE hotel_db")


DataFrame[]

In [0]:
# List the databases visible to this session; hotel_db should appear in the output.
spark.sql("SHOW DATABASES").show()


+------------------+
|      databaseName|
+------------------+
|           default|
|          hotel_db|
|information_schema|
+------------------+



In [0]:
# Sample hotel records as (hotel_id, name, city, country, lat, lng) tuples.
# Hotel B has no coordinates (None), which surfaces as NULL in the DataFrame.
hotels = [
    (1, "Hotel A", "New York", "US", 40.7, -74.0),
    (2, "Hotel B", "Paris", "FR", None, None),
    (3, "Hotel C", "London", "UK", 51.5, -0.1),
    (4, "Hotel D", "New York", "US", 40.7, -74.0),
]

# Column names, kept for any cell that refers to `cols`.
cols = ["hotel_id", "name", "city", "country", "lat", "lng"]

# Declare the column types explicitly instead of relying on inference from the
# Python values: the data contains None, and inferred types would otherwise
# depend on which rows happen to carry a value.
hotel_schema = (
    "hotel_id BIGINT, name STRING, city STRING, country STRING, "
    "lat DOUBLE, lng DOUBLE"
)

hotel_df = spark.createDataFrame(hotels, schema=hotel_schema)

# Guard against the schema string and the column list drifting apart.
assert hotel_df.columns == cols, "hotel_schema and cols disagree"

hotel_df.show()


+--------+-------+--------+-------+----+-----+
|hotel_id|   name|    city|country| lat|  lng|
+--------+-------+--------+-------+----+-----+
|       1|Hotel A|New York|     US|40.7|-74.0|
|       2|Hotel B|   Paris|     FR|NULL| NULL|
|       3|Hotel C|  London|     UK|51.5| -0.1|
|       4|Hotel D|New York|     US|40.7|-74.0|
+--------+-------+--------+-------+----+-----+



DataFrame operations

In [0]:
# Projection: keep the identifying columns and leave out the coordinates.
hotel_df.select(
    ["hotel_id", "name", "city", "country"]
).show()

+--------+-------+--------+-------+
|hotel_id|   name|    city|country|
+--------+-------+--------+-------+
|       1|Hotel A|New York|     US|
|       2|Hotel B|   Paris|     FR|
|       3|Hotel C|  London|     UK|
|       4|Hotel D|New York|     US|
+--------+-------+--------+-------+



In [0]:
# Row filter expressed as a SQL predicate string; `where` is an alias of `filter`.
hotel_df.where("city = 'New York'").show()


+--------+-------+--------+-------+----+-----+
|hotel_id|   name|    city|country| lat|  lng|
+--------+-------+--------+-------+----+-----+
|       1|Hotel A|New York|     US|40.7|-74.0|
|       4|Hotel D|New York|     US|40.7|-74.0|
+--------+-------+--------+-------+----+-----+



Aggregations

In [0]:
# Number of hotels per city.
(
    hotel_df
    .groupBy("city")
    .count()
    .show()
)


+--------+-----+
|    city|count|
+--------+-----+
|New York|    2|
|   Paris|    1|
|  London|    1|
+--------+-----+



Joins 

In [0]:
# One temperature reading per city as (city, date, temp) tuples.
# The date is kept as a plain string here, not a date type.
weather = [
    ("New York", "2025-01-01", 5),
    ("Paris", "2025-01-01", 7),
    ("London", "2025-01-01", 3),
]

weather_df = spark.createDataFrame(
    data=weather,
    schema=["city", "date", "temp"],
)


In [0]:
# Inner equi-join on the shared "city" column; naming the column (rather than
# passing an expression) keeps a single "city" column in the result.
hotel_df.join(weather_df, on="city", how="inner").show()


+--------+--------+-------+-------+----+-----+----------+----+
|    city|hotel_id|   name|country| lat|  lng|      date|temp|
+--------+--------+-------+-------+----+-----+----------+----+
|New York|       1|Hotel A|     US|40.7|-74.0|2025-01-01|   5|
|   Paris|       2|Hotel B|     FR|NULL| NULL|2025-01-01|   7|
|  London|       3|Hotel C|     UK|51.5| -0.1|2025-01-01|   3|
|New York|       4|Hotel D|     US|40.7|-74.0|2025-01-01|   5|
+--------+--------+-------+-------+----+-----+----------+----+



Window functions


In [0]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Window spec: one partition per city, rows ordered by date within it.
w = (
    Window
    .partitionBy("city")
    .orderBy("date")
)

# row_number() assigns 1, 2, 3, ... within each partition. The output column is
# called "rank", but this is a sequential row number, not F.rank().
weather_df.select(
    "*",
    F.row_number().over(w).alias("rank"),
).show()


+--------+----------+----+----+
|    city|      date|temp|rank|
+--------+----------+----+----+
|  London|2025-01-01|   3|   1|
|New York|2025-01-01|   5|   1|
|   Paris|2025-01-01|   7|   1|
+--------+----------+----+----+



In [0]:
# Register the DataFrame as a temporary view so it can be queried by name from
# SQL; createOrReplace makes the cell safe to re-run.
hotel_df.createOrReplaceTempView('hotel_df')
# As the cell's last expression this displays the resulting DataFrame's schema
# repr only; no .show() is called, so no rows are printed.
spark.sql("select * from hotel_df")

DataFrame[hotel_id: bigint, name: string, city: string, country: string, lat: double, lng: double]