In [21]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Data Aggregation") \
    .getOrCreate()

listings = spark.read.csv("data/listings.csv.gz",
    header=True,
    inferSchema=True,
    sep=",",
    quote='"',
    escape='"',
    multiLine=True,
    mode="PERMISSIVE"
)

In [22]:
listings \
    .groupby(listings.property_type) \
    .count() \
    .show(truncate=False)

+-------------------------------+-----+
|property_type                  |count|
+-------------------------------+-----+
|Private room in loft           |46   |
|Entire chalet                  |1    |
|Entire rental unit             |14414|
|Private room in minsu          |1    |
|Shared room in hostel          |16   |
|Private room in condo          |564  |
|Room in boutique hotel         |47   |
|Room in bed and breakfast      |6    |
|Private room in casa particular|5    |
|Entire cabin                   |1    |
|Private room in nature lodge   |1    |
|Entire guest suite             |21   |
|Private room in home           |83   |
|Entire place                   |10   |
|Camper/RV                      |2    |
|Tiny home                      |16   |
|Entire vacation home           |272  |
|Private room in camper/rv      |1    |
|Private room in hostel         |25   |
|Lighthouse                     |1    |
+-------------------------------+-----+
only showing top 20 rows


In [23]:
import pyspark.sql.functions as F

listings \
    .groupby(listings.property_type) \
    .agg(
        F.count('property_type').alias('count')
    ) \
    .orderBy('count', ascending=[False]) \
    .show(truncate=False)

+---------------------------------+-----+
|property_type                    |count|
+---------------------------------+-----+
|Entire rental unit               |14414|
|Entire condo                     |4319 |
|Private room in rental unit      |1480 |
|Private room in condo            |564  |
|Entire loft                      |444  |
|Entire home                      |367  |
|Entire vacation home             |272  |
|Entire serviced apartment        |244  |
|Private room in bed and breakfast|184  |
|Room in hotel                    |146  |
|Private room in home             |83   |
|Room in boutique hotel           |47   |
|Private room in loft             |46   |
|Private room in guesthouse       |28   |
|Private room in hostel           |25   |
|Private room in vacation home    |23   |
|Entire guest suite               |21   |
|Private room in villa            |21   |
|Shared room in rental unit       |19   |
|Shared room in hostel            |16   |
+---------------------------------

In [24]:
listings \
    .groupby(listings.property_type) \
    .agg(
        F.count('property_type').alias('count'),
        F.avg('review_scores_location')
    ) \
    .orderBy('count', ascending=[False]) \
    .show(truncate=False)

+---------------------------------+-----+---------------------------+
|property_type                    |count|avg(review_scores_location)|
+---------------------------------+-----+---------------------------+
|Entire rental unit               |14414|4.717307471500011          |
|Entire condo                     |4319 |4.749171656686611          |
|Private room in rental unit      |1480 |4.669619410085644          |
|Private room in condo            |564  |4.723645621181265          |
|Entire loft                      |444  |4.711267942583738          |
|Entire home                      |367  |4.7258688524590164         |
|Entire vacation home             |272  |4.741479999999996          |
|Entire serviced apartment        |244  |4.790233644859812          |
|Private room in bed and breakfast|184  |4.626814159292035          |
|Room in hotel                    |146  |4.547125000000004          |
|Private room in home             |83   |4.68063492063492           |
|Room in boutique ho

In [25]:
reviews = spark.read.csv("data/reviews.csv.gz",
    header=True,
    inferSchema=True,
    sep=",",
    quote='"',
    escape='"',
    multiLine=True,
    mode="PERMISSIVE"
)

                                                                                

In [17]:
for field in reviews.schema:
    print(field)

StructField('listing_id', LongType(), True)
StructField('id', LongType(), True)
StructField('date', DateType(), True)
StructField('reviewer_id', IntegerType(), True)
StructField('reviewer_name', StringType(), True)
StructField('comments', StringType(), True)


In [27]:
listings_reviews = listings.join(
    reviews, listings.id == reviews.listing_id, how='inner'
)

In [28]:
listings_reviews \
    .groupBy('id') \
    .agg(
        F.count('id').alias('num_reviews')
    ) \
    .show()

{"ts": "2026-01-31 00:39:58.482", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[AMBIGUOUS_REFERENCE] Reference `id` is ambiguous, could be: [`id`, `id`]. SQLSTATE: 42704", "context": {"file": "java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)", "line": "", "fragment": "col", "errorClass": "AMBIGUOUS_REFERENCE"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o245.agg.\n: org.apache.spark.sql.AnalysisException: [AMBIGUOUS_REFERENCE] Reference `id` is ambiguous, could be: [`id`, `id`]. SQLSTATE: 42704\n\tat org.apache.spark.sql.errors.QueryCompilationErrors$.ambiguousReferenceError(QueryCompilationErrors.scala:2232)\n\tat org.apache.spark.sql.catalyst.expressions.package$AttributeSeq.resolve(package.scala:356)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveChildren(LogicalPlan.scala:164)\n\tat org.apache.spark.sql.catalyst.analysis.ColumnResolutionHelper.$anonfun$resolveExpressionByP

AnalysisException: [AMBIGUOUS_REFERENCE] Reference `id` is ambiguous, could be: [`id`, `id`]. SQLSTATE: 42704

In [29]:
reviews_per_listing = listings_reviews \
    .groupBy(listings.id, listings.name) \
    .agg(
        F.count(reviews.id).alias('num_reviews')
    ) \
    .orderBy('num_reviews', ascending=False) \
    .show(truncate=False)

[Stage 39:>                                                         (0 + 1) / 1]

+--------+-------------------------------------------------+-----------+
|id      |name                                             |num_reviews|
+--------+-------------------------------------------------+-----------+
|833602  |Beautiful flat in Milano center !!!              |1395       |
|5098399 |Sant'Ambrogio Central&Quiet                      |1143       |
|1482235 |Flat in Milan's Shopping District                |1132       |
|15961311|[Navigli-Tortona] Modern flat with Wi-Fi and AC  |1123       |
|2701066 |Room with a beautiful garden                     |1118       |
|6271080 |Apartment in the heart of Milan                  |1093       |
|20761132|MilanRentals - Soperga- Milan Central Station    |1065       |
|5257587 |Naviglio : The sound of Silence                  |1041       |
|19221782|DUOMO Luxury with Terrace in Prestigious Building|1011       |
|768969  |Central Milan Wifi Flat & Breakfast              |994        |
|6350702 |Central Station charming flat, wifi      

                                                                                