In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('Data Aggregation and Joining') \
    .getOrCreate()

In [2]:
london_listing = spark \
    .read \
    .csv(path='data/airbnb-london-listings.csv.gz',
        header=True,
        inferSchema=True,
        sep=',',
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE")

In [3]:
london_listing \
    .groupBy(london_listing.property_type) \
    .count() \
    .show(truncate=False)

+----------------------------------+-----+
|property_type                     |count|
+----------------------------------+-----+
|Private room in lighthouse        |2    |
|Private room in loft              |153  |
|Private room in earthen home      |2    |
|Entire chalet                     |4    |
|Earthen home                      |1    |
|Farm stay                         |4    |
|Entire rental unit                |41215|
|Shared room in hostel             |66   |
|Shared room                       |1    |
|Private room in condo             |3189 |
|Room in boutique hotel            |217  |
|Private room in religious building|4    |
|Room in bed and breakfast         |18   |
|Private room in casa particular   |56   |
|Private room in bungalow          |63   |
|Entire cabin                      |43   |
|Entire guesthouse                 |221  |
|Hut                               |3    |
|Private room in nature lodge      |4    |
|Entire guest suite                |177  |
+----------

In [4]:
import pyspark.sql.functions as F

london_listing \
    .groupby(london_listing.property_type) \
    .agg(
        F.count(london_listing.property_type).alias('count')
    ) \
    .orderBy('count', ascending=[False]) \
    .show(truncate=False)

+----------------------------------+-----+
|property_type                     |count|
+----------------------------------+-----+
|Entire rental unit                |41215|
|Private room in rental unit       |14464|
|Private room in home              |11704|
|Entire home                       |9120 |
|Entire condo                      |8250 |
|Private room in condo             |3189 |
|Entire serviced apartment         |1874 |
|Private room in townhouse         |1195 |
|Room in hotel                     |1113 |
|Entire townhouse                  |1058 |
|Private room in bed and breakfast |491  |
|Private room in guesthouse        |377  |
|Entire loft                       |341  |
|Entire guesthouse                 |221  |
|Room in boutique hotel            |217  |
|Entire guest suite                |177  |
|Private room in guest suite       |174  |
|Private room in loft              |153  |
|Private room in serviced apartment|132  |
|Private room                      |103  |
+----------

In [5]:
london_listing \
    .groupby(london_listing.property_type) \
    .agg(
        F.count(london_listing.property_type).alias('count'), 
        F.avg(london_listing.review_scores_location)
    ) \
    .orderBy('count', ascending=[False]) \
    .show(truncate=False)

+----------------------------------+-----+---------------------------+
|property_type                     |count|avg(review_scores_location)|
+----------------------------------+-----+---------------------------+
|Entire rental unit                |41215|4.727437046043664          |
|Private room in rental unit       |14464|4.726647667299953          |
|Private room in home              |11704|4.694735943407702          |
|Entire home                       |9120 |4.727462068965489          |
|Entire condo                      |8250 |4.770284788770394          |
|Private room in condo             |3189 |4.778348954578206          |
|Entire serviced apartment         |1874 |4.722610024449879          |
|Private room in townhouse         |1195 |4.766042471042479          |
|Room in hotel                     |1113 |4.608689075630254          |
|Entire townhouse                  |1058 |4.81762931034483           |
|Private room in bed and breakfast |491  |4.724728915662653          |
|Priva

In [6]:
london_reviews = spark.read.csv(path='data/airbnb-london-reviews.csv.gz',
                               header=True,
                               inferSchema=True,
                               sep=',',
                               quote='"',
                               escape='"',
                               multiLine=True,
                               mode='PERMISSIVE')

In [7]:
list(london_reviews.schema)

[StructField('listing_id', LongType(), True),
 StructField('id', LongType(), True),
 StructField('date', DateType(), True),
 StructField('reviewer_id', IntegerType(), True),
 StructField('reviewer_name', StringType(), True),
 StructField('comments', StringType(), True)]

In [8]:
london_listing_reviews = london_reviews \
    .join(
        other=london_listing,
        on=london_reviews.id == london_listing.id,
        how='inner'
    )

In [9]:
reviews_per_listings = london_listing_reviews \
    .groupby(london_listing.id, london_listing.name) \
    .agg(
        F.count(london_reviews.id).alias('num_reviews')
    ) \
    .orderBy('num_reviews', ascending=[False]) \
    .show(truncate=False)

+--------+--------------------------------------------------+-----------+
|id      |name                                              |num_reviews|
+--------+--------------------------------------------------+-----------+
|19825807|Lovely one bed garden flat in central Brixton     |1          |
|34115250|Quiet and cosy double bedroom in Shoreditch       |1          |
|39762289|Modern 1-bed  Flat - Brunswick Centre Russell Sq  |1          |
|13244001|Fabulous, Contemporary Garden Apartment in Balham |1          |
|17959833|Lovely private room with own bathroom             |1          |
|48805574|Homely 1-bed flat in zone 2 Nunhead               |1          |
|22063260|Light-filled quiet flat in lively Islington       |1          |
|38831578|Quiet room in residential area of Bow             |1          |
|21844951|Fantastic 2 Bedroom House in Queens Park          |1          |
|3654381 |Bright and spacious 1 bed flat                    |1          |
|17715368|Studio in Greenwich         

In [10]:
spark.stop()