In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('Airbnb QandA Processing Data') \
    .getOrCreate()

In [2]:
london_listings = spark.read.csv(
    path='./data/airbnb-london-listings.csv.gz',
    header=True,
    inferSchema=True,
    sep=',',
    quote='"',
    escape='"',
    multiLine=True,
    mode='PERMISSIVE'
)

In [3]:
london_listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: int

In [4]:
# 1. Get a non-null picture URL for any property ("picture_url" field). Select any non-null picture URL

london_listings.filter(
    london_listings \
        .picture_url \
        .isNotNull()
) \
.select('picture_url') \
.limit(1) \
.show(truncate=False)

+------------------------------------------------------------------------------------------------------+
|picture_url                                                                                           |
+------------------------------------------------------------------------------------------------------+
|https://a0.muscache.com/pictures/miso/Hosting-13913/original/d755aa6d-cebb-4464-80be-2722c921e8d5.jpeg|
+------------------------------------------------------------------------------------------------------+



In [5]:
# 2. Get number of properties that get more than 10 reviews per month
london_listings \
    .filter(
        (london_listings.reviews_per_month > 10)
    ) \
    .count()

66

In [6]:
# 3. Get properties that have more bathrooms than bedrooms
london_listings \
    .filter(
        (london_listings.bathrooms > london_listings.bedrooms)
    ) \
    .select('name', 'bathrooms', 'bedrooms') \
    .show(truncate=False)

+-------------------------------------------------+---------+--------+
|name                                             |bathrooms|bedrooms|
+-------------------------------------------------+---------+--------+
|Cosy Double studio in Zone 2 Hammersmith (1)     |1.5      |1       |
|LONDON DETACHED HOUSE*ElecGates etc              |2.0      |1       |
|Designer room Park Views 4 mins zone 1 station   |1.5      |1       |
|Maisonette in Central London Zone 1              |1.5      |1       |
|West London,loft ensuite, 5min2tube              |1.5      |1       |
|Shoreditch Loft                                  |1.5      |1       |
|Five minute walk to South Bank                   |1.5      |1       |
|Stunning double room own bathroom                |4.0      |1       |
|Also five minutes to South Bank                  |1.5      |1       |
|Studio 20 min from center                        |1.0      |0       |
|Spacious luxury 2 bedroom apartment              |1.5      |1       |
|Singl

In [7]:
# 4. Get properties where the price is greater than 5,000. Collect the result as a Python list
from pyspark.sql.functions import regexp_replace

price_cleaned_df = london_listings \
    .withColumn('price_cleaned', regexp_replace('price', '[$,]', '').cast('float'))

In [8]:
price_cleaned_result = price_cleaned_df \
    .filter(
        (price_cleaned_df.price_cleaned > 5000)
    ) \
    .select('name', 'price') \
    .collect()

In [9]:
price_cleaned_result

[Row(name='Room in a cosy flat. Central, clean', price='$8,000.00'),
 Row(name='Spacious Private Ground Floor Room', price='$6,309.00'),
 Row(name='No Longer Available', price='$53,588.00'),
 Row(name='Bright & airy DoubleBed with EnSuite in Zone 2!', price='$74,100.00'),
 Row(name='The Apartments by The Sloane Club, Two Bedroom Apt', price='$7,377.00'),
 Row(name='The Apartments by The Sloane Club, L 2 Bedroom Apt', price='$7,377.00'),
 Row(name='Single room. 7ft x 9ft - Over looking garden', price='$6,523.00'),
 Row(name='Close To London Eye (TUR)', price='$6,666.00'),
 Row(name='Beautiful 2 BR flat in Kilburn with free parking', price='$6,000.00'),
 Row(name='Semi-detached mews house in Knightsbridge.', price='$7,019.00'),
 Row(name='Affordable Spacious  Room on the edge of the city', price='$6,000.00'),
 Row(name='Henry’s Townhouse, London', price='$6,500.00'),
 Row(name='City Suite', price='$5,353.00'),
 Row(name='Hyde Park Suite', price='$5,653.00'),
 Row(name='SHORT WALK TO LOND

In [10]:
# 5. Get a list of properties with the following characteristics:
# * price < 150
# * more than 20 reviews
# * review_scores_rating > 4.5
# Consider using the "&" operator

price_cleaned_df.filter(
    (price_cleaned_df.price_cleaned < 150) &
    (price_cleaned_df.number_of_reviews > 20) &
    (price_cleaned_df.review_scores_rating > 4.5)
) \
    .select('name', 'price', 'number_of_reviews', 'review_scores_rating') \
    .show(truncate=False)

+-------------------------------------------------+-------+-----------------+--------------------+
|name                                             |price  |number_of_reviews|review_scores_rating|
+-------------------------------------------------+-------+-----------------+--------------------+
|Holiday London DB Room Let-on going              |$70.00 |55               |4.85                |
|Bright Chelsea  Apartment. Chelsea!              |$149.00|97               |4.8                 |
|You are GUARANTEED to love this                  |$90.00 |730              |4.87                |
|SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST       |$61.00 |387              |4.77                |
|SPACIOUS ROOM IN CONTEMPORARY STYLE FLAT         |$49.00 |72               |4.97                |
|Room with a view, shared flat,  central  Bankside|$96.00 |137              |4.7                 |
|You Will Save Money Here                         |$71.00 |639              |4.89                |
|Quiet Com

In [11]:
# 6. Get a list of properties with the following characteristics:
# * price < 150 OR more than one bathroom
# Use the "|" operator to implement the OR operator

price_cleaned_df.filter(
    (price_cleaned_df.price_cleaned < 150) |
    (price_cleaned_df.bathrooms > 1)
) \
    .select('name', 'price', 'bathrooms') \
    .show(truncate=False)

+--------------------------------------------------+-------+---------+
|name                                              |price  |bathrooms|
+--------------------------------------------------+-------+---------+
|Holiday London DB Room Let-on going               |$70.00 |1.0      |
|Bright Chelsea  Apartment. Chelsea!               |$149.00|1.0      |
|Very Central Modern 3-Bed/2 Bath By Oxford St W1  |$411.00|2.0      |
|Kew Gardens 3BR house in cul-de-sac               |$280.00|1.5      |
|You are GUARANTEED to love this                   |$90.00 |0.0      |
|SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST        |$61.00 |1.0      |
|Short Term Home                                   |$340.00|2.0      |
|SPACIOUS ROOM IN CONTEMPORARY STYLE FLAT          |$49.00 |1.0      |
|Room with a view, shared flat,  central  Bankside |$96.00 |1.0      |
|You Will Save Money Here                          |$71.00 |1.0      |
|Quiet Comfortable Room in Fulham                  |$48.00 |1.0      |
|Room 

In [12]:
# 7. Get the highest listing price in this dataset
# Consider using the "max" function from "pyspark.sql.functions

from pyspark.sql.functions import max

price_cleaned_df \
    .select(max(price_cleaned_df.price_cleaned)) \
    .show()

+------------------+
|max(price_cleaned)|
+------------------+
|         1085147.0|
+------------------+



In [13]:
# 8. Get the name and a price of property with the highest price
# Try to use "collect" method to get the highest price first, and then use it in a "filter" call

max_price = price_cleaned_df \
    .select(max(price_cleaned_df.price_cleaned).alias('max_price')) \
    .collect()
max_price

price_cleaned_df.filter(
    (price_cleaned_df.price_cleaned == max_price[0]['max_price'])
) \
    .select('name', 'price') \
    .show(truncate=False)

+---------------------------------------------+-------------+
|name                                         |price        |
+---------------------------------------------+-------------+
|Lux 2 Bed in Canary Wharf close to Excel & O2|$1,085,147.00|
+---------------------------------------------+-------------+



In [14]:
# 9. Get the number of hosts in the dataset

london_listings \
    .select(london_listings.host_name) \
    .distinct() \
    .count()

16673

In [15]:
# 10. Get listings with a first review in 2024
# Consider using the "year" function from "pyspark.sql.functions"

from  pyspark.sql.functions import year

london_listings \
    .filter(
        year(london_listings.first_review) == 2024
    ) \
    .select('name', 'price', 'first_review') \
    .show(truncate=False)

+--------------------------------------------------+---------+------------+
|name                                              |price    |first_review|
+--------------------------------------------------+---------+------------+
|Close to Wimbledon All England Tennis -huge double|$86.00   |2024-08-11  |
|one Double bed room with en-suite facilities      |$53.00   |2024-03-21  |
|Bridgerton inspired cottage core apartment        |$91.00   |2024-09-14  |
|Sm double room  with own bathroom                 |$38.00   |2024-06-04  |
|Central, modern pied-a-terre                      |$189.00  |2024-11-29  |
|Superlux flat in Knightsbridge                    |$1,350.00|2024-01-01  |
|The Pink House, Notting Hill                      |NULL     |2024-07-14  |
|Stylish garden flat in Hackney                    |$197.00  |2024-09-15  |
|Luxurious Flat in South Kensington                |$234.00  |2024-06-19  |
|Double Standard Room (Ensuite)                    |NULL     |2024-09-01  |
|Single En-s

In [16]:
spark.stop()