In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that is already
# running (getOrCreate), under a recognisable application name.
spark = (
    SparkSession.builder
    .appName("Processing Airbnb Data")
    .getOrCreate()
)

In [2]:
# Load the gzipped London listings CSV into a DataFrame.
# - header/inferSchema: take column names from the first row and let Spark
#   infer the column types.
# - quote/escape are both '"' and multiLine is on, so quoted fields may
#   contain doubled quotes and embedded newlines.
# - PERMISSIVE mode keeps malformed records instead of failing the read.
london_listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("./data/airbnb-london-listings.csv.gz")
)

In [3]:
# Project only the location review score and preview it
# (show() prints the first 20 rows by default).
review_locations = london_listings.select(london_listings["review_scores_location"])
review_locations.show()

+----------------------+
|review_scores_location|
+----------------------+
|                  4.78|
|                  4.93|
|                  4.89|
|                   4.6|
|                  4.85|
|                   4.9|
|                  4.77|
|                  4.53|
|                  4.79|
|                  4.79|
|                   4.5|
|                  4.64|
|                  4.84|
|                  4.86|
|                   4.0|
|                  4.75|
|                  NULL|
|                  4.66|
|                  4.67|
|                   5.0|
+----------------------+
only showing top 20 rows



In [4]:
# Keep listings whose location score is above 4.5, then narrow to the
# columns of interest.
high_score_listings = (
    london_listings
    .where(london_listings["review_scores_location"] > 4.5)
    .select("id", "price", "name", "review_scores_location")
)

# Only the displayed copy drops rows containing a null in any selected column;
# high_score_listings itself still holds them.
high_score_listings.na.drop().show(truncate=False)

+-----+-------+--------------------------------------------------+----------------------+
|id   |price  |name                                              |review_scores_location|
+-----+-------+--------------------------------------------------+----------------------+
|13913|$70.00 |Holiday London DB Room Let-on going               |4.78                  |
|15400|$149.00|Bright Chelsea  Apartment. Chelsea!               |4.93                  |
|17402|$411.00|Very Central Modern 3-Bed/2 Bath By Oxford St W1  |4.89                  |
|36274|$210.00|Bright 1 bedroom apt off brick lane in Shoreditch |4.85                  |
|36299|$280.00|Kew Gardens 3BR house in cul-de-sac               |4.9                   |
|36660|$90.00 |You are GUARANTEED to love this                   |4.77                  |
|38605|$61.00 |SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST        |4.53                  |
|38610|$340.00|Short Term Home                                   |4.79                  |
|38995|$49

In [5]:
# List the StructField entries (name, type, nullability) of the selection.
high_score_listings.schema.fields

[StructField('id', LongType(), True),
 StructField('price', StringType(), True),
 StructField('name', StringType(), True),
 StructField('review_scores_location', DoubleType(), True)]

In [6]:
from pyspark.sql.functions import regexp_replace

# Derive a numeric price column: strip the '$' and ',' characters from the
# textual price and cast what remains to float.
price_num_df = london_listings.withColumn(
    "price_num",
    regexp_replace("price", "[$,]", "").cast("float"),
)

# Inspect the schema entry of the new column.
price_num_df.schema["price_num"]

StructField('price_num', FloatType(), True)

In [7]:
# Preview the derived numeric price next to the listing name, without
# truncating long names.
(
    price_num_df
    .select("price_num", "name")
    .show(truncate=False)
)

+---------+-------------------------------------------------+
|price_num|name                                             |
+---------+-------------------------------------------------+
|70.0     |Holiday London DB Room Let-on going              |
|149.0    |Bright Chelsea  Apartment. Chelsea!              |
|411.0    |Very Central Modern 3-Bed/2 Bath By Oxford St W1 |
|NULL     |Battersea live/work artist house                 |
|210.0    |Bright 1 bedroom apt off brick lane in Shoreditch|
|280.0    |Kew Gardens 3BR house in cul-de-sac              |
|90.0     |You are GUARANTEED to love this                  |
|61.0     |SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST       |
|340.0    |Short Term Home                                  |
|49.0     |SPACIOUS ROOM IN CONTEMPORARY STYLE FLAT         |
|NULL     |Stylish bedsit in Notting Hill ish flat.         |
|213.0    |2 Double bed apartment in quiet area North London|
|NULL     |Room in maisonette in chiswick                   |
|96.0   

In [8]:
# Filter using Column expressions: numeric price under 100 AND location
# score above 4.5. Each comparison needs its own parentheses because `&`
# binds tighter than `<` / `>` in Python.
(
    price_num_df
    .where(
        (price_num_df["price_num"] < 100)
        & (price_num_df["review_scores_location"] > 4.5)
    )
    .select("name", "price", "review_scores_location")
    .show(truncate=False)
)

+--------------------------------------------------+------+----------------------+
|name                                              |price |review_scores_location|
+--------------------------------------------------+------+----------------------+
|Holiday London DB Room Let-on going               |$70.00|4.78                  |
|You are GUARANTEED to love this                   |$90.00|4.77                  |
|SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST        |$61.00|4.53                  |
|SPACIOUS ROOM IN CONTEMPORARY STYLE FLAT          |$49.00|4.79                  |
|Room with a view, shared flat,  central  Bankside |$96.00|4.86                  |
|You Will Save Money Here                          |$71.00|4.75                  |
|Quiet Comfortable Room in Fulham                  |$48.00|4.66                  |
|Room with a garden                                |$76.00|5.0                   |
|Pleasant Single Room in zone 1.                   |$50.00|4.77                  |
|Cos

In [9]:
# Same filter as the Column-expression variant, written as a SQL condition string.
(
    price_num_df
    .where('price_num < 100 AND review_scores_location > 4.5')
    .select("name", "price", "review_scores_location")
    .show(truncate=False)
)

+--------------------------------------------------+------+----------------------+
|name                                              |price |review_scores_location|
+--------------------------------------------------+------+----------------------+
|Holiday London DB Room Let-on going               |$70.00|4.78                  |
|You are GUARANTEED to love this                   |$90.00|4.77                  |
|SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST        |$61.00|4.53                  |
|SPACIOUS ROOM IN CONTEMPORARY STYLE FLAT          |$49.00|4.79                  |
|Room with a view, shared flat,  central  Bankside |$96.00|4.86                  |
|You Will Save Money Here                          |$71.00|4.75                  |
|Quiet Comfortable Room in Fulham                  |$48.00|4.66                  |
|Room with a garden                                |$76.00|5.0                   |
|Pleasant Single Room in zone 1.                   |$50.00|4.77                  |
|Cos

In [10]:
# Show the distinct property types present in the listings.
(
    london_listings
    .select(london_listings["property_type"])
    .distinct()
    .show(truncate=False)
)

+----------------------------------+
|property_type                     |
+----------------------------------+
|Private room in lighthouse        |
|Private room in loft              |
|Private room in earthen home      |
|Entire chalet                     |
|Earthen home                      |
|Farm stay                         |
|Entire rental unit                |
|Shared room in hostel             |
|Shared room                       |
|Private room in condo             |
|Room in boutique hotel            |
|Private room in religious building|
|Room in bed and breakfast         |
|Private room in casa particular   |
|Private room in bungalow          |
|Entire cabin                      |
|Entire guesthouse                 |
|Hut                               |
|Private room in nature lodge      |
|Entire guest suite                |
+----------------------------------+
only showing top 20 rows



In [11]:
# Show the distinct (property_type, room_type) combinations.
(
    london_listings
    .select(london_listings["property_type"], london_listings["room_type"])
    .distinct()
    .show(truncate=False)
)

+----------------------------------+---------------+
|property_type                     |room_type      |
+----------------------------------+---------------+
|Room in hostel                    |Hotel room     |
|Private room in casa particular   |Private room   |
|Dome                              |Entire home/apt|
|Entire serviced apartment         |Entire home/apt|
|Private room in loft              |Private room   |
|Private room in villa             |Private room   |
|Farm stay                         |Entire home/apt|
|Room in hotel                     |Hotel room     |
|Shared room in rental unit        |Shared room    |
|Private room in guest suite       |Private room   |
|Room in rental unit               |Hotel room     |
|Room in serviced apartment        |Hotel room     |
|Private room in serviced apartment|Private room   |
|Private room in hostel            |Private room   |
|Shared room                       |Shared room    |
|Private room in yurt              |Private ro

In [12]:
# Persist the distinct property types as CSV under ./data/created/.
# No header option is set here.
# NOTE(review): overwrite mode replaces whatever already exists at this
# path - confirm nothing else is stored directly in ./data/created/.
(
    london_listings
    .select(london_listings["property_type"])
    .distinct()
    .write
    .mode("overwrite")
    .csv("./data/created/")
)

In [13]:
# Shut down the Spark session; cells above need a new session to run again.
spark.stop()