In [2]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session (getOrCreate returns an existing one if present).
spark = (
    SparkSession.builder
    .appName("Read Inside Airbnb data")
    .getOrCreate()
)

# Read the gzipped listings CSV with a header row and inferred column types.
# Quote and escape characters are both '"', and multiLine=True lets a quoted
# field contain line breaks without splitting the record.
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("data/listings.csv.gz")
)

                                                                                

In [3]:
# Print the column names, types and nullability of the loaded listings DataFrame.
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: int

In [6]:
# Show a single non-null picture URL in full (truncate=False keeps the whole string).
(
    listings
    .where(listings["picture_url"].isNotNull())
    .select("picture_url")
    .limit(1)
    .show(truncate=False)
)

+-------------------------------------------------------------------------+
|picture_url                                                              |
+-------------------------------------------------------------------------+
|https://a0.muscache.com/pictures/623d63f8-56cf-4bd0-af95-fb50c5abf6af.jpg|
+-------------------------------------------------------------------------+



In [14]:
# Number of listings whose reviews_per_month value exceeds 10.
(
    listings
    .where(listings["reviews_per_month"] > 10)
    .count()
)

101

In [15]:
# Listings that report more bathrooms than bedrooms, with names shown untruncated.
(
    listings
    .where(listings["bathrooms"] > listings["bedrooms"])
    .select("name", "bathrooms", "bedrooms")
    .show(truncate=False)
)

+-------------------------------------------------+---------+--------+
|name                                             |bathrooms|bedrooms|
+-------------------------------------------------+---------+--------+
|matrimoni red room on Canals center              |2.5      |1       |
|Charming Terrace Flat with Veranda, 15 min Centre|1.0      |0       |
|Luxury B&B Antica Brera                          |2.0      |1       |
|AtticCozy-With Rooftop Terrace                   |2.5      |1       |
|Centric studio Brera [Downtown Milan]            |1.0      |0       |
|TOP POSITION  Bed and Breakfast                  |2.0      |1       |
|LARGE DOUBLE OR TWIN ROOM                        |1.5      |1       |
|2 B&B in Milan with trees & metro                |2.0      |1       |
|Elegant, quiet and luminous flat                 |2.0      |1       |
|Glamour next to Leonardo's Cenacolo              |1.5      |1       |
|Beautiful loft in the Navigli area               |2.5      |2       |
|LOFT-

In [32]:
from pyspark.sql.functions import regexp_replace

# Derive a numeric price from the raw `price` strings (e.g. '$11,400.00'):
# strip the '$' and ',' characters, then cast what is left to a number.
# The cast targets 'double' rather than 'float': Spark's float is 32-bit
# single precision, so prices with cents would pick up visible rounding
# error once collected into Python floats (which are 64-bit).
price_num = listings.withColumn(
    'price_numeric',
    regexp_replace('price', '[$,]', '').cast('double'),
)

# Collect name and original price string for every listing priced above 5000.
res = (
    price_num
    .filter(price_num.price_numeric > 5000)
    .select('name', 'price')
    .collect()
)

res

[Row(name='Panoramic duplex penthouse in Milan city center', price='$11,400.00'),
 Row(name='Food Loft by Simone Rugiati', price='$5,706.00'),
 Row(name='Comfort Queen Single - Bianca Maria Palace Hotel', price='$5,001.00'),
 Row(name='Bianca Maria Palace Hotel, Deluxe Doppia/Twin', price='$8,990.00'),
 Row(name='Exclusive Corner - Bianca Maria Palace Hotel', price='$5,001.00'),
 Row(name='Family Room - Bianca Maria Palace Hotel', price='$5,001.00'),
 Row(name='Gamma TEST no prenotazioni', price='$9,006.00'),
 Row(name='San Babila orangerie double bedroom apartment', price='$16,199.00'),
 Row(name='Grazioso e comodo appartamento accanto alla Metro', price='$65,500.00'),
 Row(name='Sweet Chloe Milano', price='$9,715.00'),
 Row(name='Il bilocale degli architetti', price='$9,000.00'),
 Row(name='Crocetta10', price='$9,000.00'),
 Row(name='Beautiful Studio, near Loreto Metro - (M2 Loreto)', price='$6,900.00'),
 Row(name='[Milano Centro] Green oasis few steps from Duomo', price='$6,879.00')

In [36]:
# Listings that satisfy all three conditions of the SQL predicate below:
# parsed price under 150, more than 20 reviews, and a rating above 4.5.
(
    price_num
    .where('price_numeric < 150 AND number_of_reviews > 20 AND review_scores_rating > 4.5')
    .select('name', 'price_numeric', 'number_of_reviews', 'review_scores_rating')
    .show(truncate=False)
)

+--------------------------------------------------+-------------+-----------------+--------------------+
|name                                              |price_numeric|number_of_reviews|review_scores_rating|
+--------------------------------------------------+-------------+-----------------+--------------------+
|Nico & Cinzia's Pink Suite!                       |86.0         |39               |4.53                |
|Nice room with private bathroom                   |36.0         |690              |4.65                |
|Brand new apartment in Milan                      |60.0         |126              |4.85                |
|Piccolo Attico Duomo // 3-18 months stay contract |143.0        |79               |4.91                |
|Apartment corso Buenos Aires milan                |71.0         |137              |4.68                |
|Stay in Style Solari                              |95.0         |108              |4.77                |
|Charming Terrace Flat with Veranda, 15 min Ce

In [35]:
# Listings matching either condition of the SQL predicate below:
# parsed price under 150, or more than one bathroom.
(
    price_num
    .where('price_numeric < 150 OR bathrooms > 1')
    .select('name', 'price_numeric', 'bathrooms')
    .show(truncate=False)
)

+-------------------------------------------------+-------------+---------+
|name                                             |price_numeric|bathrooms|
+-------------------------------------------------+-------------+---------+
|Nico & Cinzia's Pink Suite!                      |86.0         |1.0      |
|Nice room with private bathroom                  |36.0         |1.0      |
|Great comfortable & quite APT. x 4 people!       |250.0        |2.0      |
|Flat " Chiesa Rossa2 in Milan                    |80.0         |1.0      |
|Open Space Loft Navigli Area Milano              |63.0         |1.0      |
|Brand new apartment in Milan                     |60.0         |1.0      |
|Piccolo Attico Duomo // 3-18 months stay contract|143.0        |1.0      |
|Apartment corso Buenos Aires milan               |71.0         |1.0      |
|Stay in Style Solari                             |95.0         |1.0      |
|matrimoni red room on Canals center              |44.0         |2.5      |
|downtown, R

In [41]:
from pyspark.sql.functions import max

# Display the largest parsed price. `max` here is the PySpark aggregate imported
# at the top of this cell; that import shadows Python's built-in max.
(
    price_num
    .select(max('price_numeric'))
    .show()
)

+------------------+
|max(price_numeric)|
+------------------+
|           65500.0|
+------------------+



In [47]:
from pyspark.sql.functions import max

# Compute the maximum parsed price under the alias 'max_price' and collect it,
# which yields a Python list of Row objects.
res = price_num.select(
    max('price_numeric').alias('max_price'),
).collect()

res

[Row(max_price=65500.0)]

In [48]:
# Take the 'max_price' field of the first collected Row as a plain Python value.
max_price = res[0]['max_price']
# Bare expression so the notebook displays the value.
max_price

65500.0

In [50]:
# Show the listing(s) whose parsed price equals the previously computed maximum,
# alongside the original price string.
(
    price_num
    .where(price_num["price_numeric"] == max_price)
    .select("name", "price")
    .show(truncate=False)
)

+-------------------------------------------------+----------+
|name                                             |price     |
+-------------------------------------------------+----------+
|Grazioso e comodo appartamento accanto alla Metro|$65,500.00|
+-------------------------------------------------+----------+



In [51]:
# Count the distinct values in the host_name column.
(
    listings
    .select("host_name")
    .distinct()
    .count()
)

                                                                                

2745

In [53]:
from pyspark.sql.functions import year

# Listings whose first_review date falls in the year 2024; show up to 20 rows
# without truncating the names.
(
    listings
    .where(year(listings["first_review"]) == 2024)
    .select("name", "first_review")
    .show(n=20, truncate=False)
)

+--------------------------------------------------+------------+
|name                                              |first_review|
+--------------------------------------------------+------------+
|Fiera, Rho fiera,Salone del Mobile                |2024-03-17  |
|Luxury top floor in porta Venezia                 |2024-04-21  |
|Charming Central Isola Apartment with Terrace     |2024-07-17  |
|Cozy 1 bedroom flat in Gambara area               |2024-10-03  |
|Home° Portello°SanSiro°FieraAllianzMiCo°CityLife  |2024-06-23  |
|Cozy apartment near city center                   |2024-04-21  |
|Monolocale vicino Porta Romana                    |2024-03-31  |
|Cosy flat next to Bocconi / Fondazione Prada      |2024-09-01  |
|Green Open Space                                  |2024-01-11  |
|Your place close to SALONE !                      |2024-04-22  |
|Bilocale sui Navigli - metro M2                   |2024-04-01  |
|Casa e terrazzo verde in centro                   |2024-04-19  |
|Sunny stu