In [None]:
from pyspark.sql import SparkSession, functions as F

# Obtain the notebook's SparkSession: getOrCreate() reuses an already-running
# session if there is one, otherwise it starts a new one under this app name.
spark = (
    SparkSession.builder
    .appName('SparkSQL Example')
    .getOrCreate()
)

In [2]:
# Reader options shared by both Airbnb CSV extracts. Keeping them in one place
# guarantees the listings and reviews files are parsed identically.
CSV_READ_OPTIONS = dict(
    header=True,        # first record holds the column names
    inferSchema=True,   # let Spark derive column types from the data
    sep=',',
    quote='"',
    escape='"',         # a quote inside a quoted field is escaped by doubling it
    multiLine=True,     # quoted fields may span several physical lines
    mode='PERMISSIVE',  # keep malformed records instead of failing the read
)

# Listings: one row per Airbnb listing.
london_listings_df = spark.read.csv(
    path='./data/airbnb-london-listings.csv.gz',
    **CSV_READ_OPTIONS,
)

# Reviews: joined to listings through their `listing_id` column further down.
london_reviews_df = spark.read.csv(
    path='./data/airbnb-london-reviews.csv.gz',
    **CSV_READ_OPTIONS,
)

In [3]:
# Inner join: keep only listing/review pairs where the review's `listing_id`
# matches the listing's `id`. Joining on an expression (rather than a column
# name) keeps the columns of both frames in the result.
london_listings_reviews = london_listings_df.join(
    london_reviews_df,
    london_listings_df['id'] == london_reviews_df['listing_id'],
    'inner',
)

In [4]:
# Count reviews per listing and rank listings by that count, highest first.
# The aggregation is kept separate from the display call: DataFrame.show()
# returns None, so chaining it onto the assignment would leave
# `reviews_per_listings` bound to None instead of the aggregated DataFrame.
reviews_per_listings = (
    london_listings_reviews
    .groupBy(london_listings_df.id, london_listings_df.name)
    .agg(
        # Both joined frames carry an `id` column; count the review ids.
        F.count(london_reviews_df.id).alias('reviews_count')
    )
    .orderBy('reviews_count', ascending=False)
)

reviews_per_listings.show(10, truncate=False)

+--------+--------------------------------------------------+-------------+
|id      |name                                              |reviews_count|
+--------+--------------------------------------------------+-------------+
|47408549|Double Room+ Ensuite                              |1902         |
|43120947|Private double room with en suite facilities      |1647         |
|19670926|Locke Studio Apartment at Leman Locke             |1443         |
|2126708 |London's best transport hub 5 mins walk! Safe too!|1142         |
|46233904|Superior Studio, avg size 23.5 msq                |1002         |
|2659707 |Large Room + Private Bathroom, E3.                |998          |
|27833488|S - Heathrow Airport Terminal 2 3 4 5 Hatton Cross|951          |
|4748665 |Single bedroom near London Stratford              |933          |
|42081759|Micro Studio at Locke at Broken Wharf             |914          |
|5266466 |Large London Room, Ensuite Bathroom,TV & Breakfast|909          |
+--------+--------------------------------------------------+-------------+

In [5]:
# Expose both DataFrames to Spark SQL under short view names so they can be
# referenced from SQL text passed to spark.sql().
london_reviews_df.createOrReplaceTempView(name='reviews')
london_listings_df.createOrReplaceTempView(name='listings')

In [6]:
# SQL version of the reviews-per-listing ranking, run against the `listings`
# and `reviews` temp views. The sort key is the `reviews_count` alias rather
# than an ordinal position, so the ordering stays correct if the SELECT list
# is ever reordered and does not depend on `spark.sql.orderByOrdinal`.
query = """
SELECT
    l.id,
    l.name,
    COUNT(r.id) AS reviews_count
FROM
    listings AS l
INNER JOIN
    reviews AS r
ON
    l.id = r.listing_id
GROUP BY
    l.id, l.name
ORDER BY
    reviews_count DESC
"""

reviews_per_listing = spark.sql(sqlQuery=query)
reviews_per_listing.show(10, truncate=False)

+--------+--------------------------------------------------+-------------+
|id      |name                                              |reviews_count|
+--------+--------------------------------------------------+-------------+
|47408549|Double Room+ Ensuite                              |1902         |
|43120947|Private double room with en suite facilities      |1647         |
|19670926|Locke Studio Apartment at Leman Locke             |1443         |
|2126708 |London's best transport hub 5 mins walk! Safe too!|1142         |
|46233904|Superior Studio, avg size 23.5 msq                |1002         |
|2659707 |Large Room + Private Bathroom, E3.                |998          |
|27833488|S - Heathrow Airport Terminal 2 3 4 5 Hatton Cross|951          |
|4748665 |Single bedroom near London Stratford              |933          |
|42081759|Micro Studio at Locke at Broken Wharf             |914          |
|5266466 |Large London Room, Ensuite Bathroom,TV & Breakfast|909          |
+--------+--------------------------------------------------+-------------+

In [7]:
# Stop the SparkSession now that both versions of the query have been shown.
# Run this cell last: earlier cells that use `spark` need a live session.
spark.stop()