In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Reading Airbnb data from insideairbnb.com") \
    .getOrCreate()

listings = spark.read.csv("data/listings.csv.gz",
    header=True,
    inferSchema=True,
    sep=",",
    quote='"',
    escape='"',
    multiLine=True,
    mode="PERMISSIVE"
)

26/01/29 15:59:26 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

In [2]:
review_locations = listings.select(listings.review_scores_location)
review_locations.show()

+----------------------+
|review_scores_location|
+----------------------+
|                  4.69|
|                  4.45|
|                  4.34|
|                  4.18|
|                  4.43|
|                  4.75|
|                  4.29|
|                  4.78|
|                   4.9|
|                  4.89|
|                   4.7|
|                  4.64|
|                   4.0|
|                  4.65|
|                  4.45|
|                  4.25|
|                   4.9|
|                  4.83|
|                  4.75|
|                  4.89|
+----------------------+
only showing top 20 rows


In [4]:
listings \
    .select(listings.review_scores_location) \
    .show()

+----------------------+
|review_scores_location|
+----------------------+
|                  4.69|
|                  4.45|
|                  4.34|
|                  4.18|
|                  4.43|
|                  4.75|
|                  4.29|
|                  4.78|
|                   4.9|
|                  4.89|
|                   4.7|
|                  4.64|
|                   4.0|
|                  4.65|
|                  4.45|
|                  4.25|
|                   4.9|
|                  4.83|
|                  4.75|
|                  4.89|
+----------------------+
only showing top 20 rows


In [10]:
#GET FROM LISTINGS ANY DATA WITH SCORE ABOVE 4.5 AND SEPARATE WITH FILTER

high_score_listings = listings \
    .filter(listings.review_scores_location > 4.5) \
    .select('id', 'price', 'name', 'review_scores_location')

high_score_listings.show(20, truncate=False)

+------+-------+--------------------------------------------------+----------------------+
|id    |price  |name                                              |review_scores_location|
+------+-------+--------------------------------------------------+----------------------+
|23986 |$180.00|" Characteristic Milanese flat"                   |4.69                  |
|101087|$80.00 |Flat " Chiesa Rossa2 in Milan                     |4.75                  |
|122514|$60.00 |Brand new apartment in Milan                      |4.78                  |
|141833|$143.00|Piccolo Attico Duomo // 3-18 months stay contract |4.9                   |
|143329|$71.00 |Apartment corso Buenos Aires milan                |4.89                  |
|153925|$95.00 |Stay in Style Solari                              |4.7                   |
|155720|$44.00 |matrimoni red room on Canals center               |4.64                  |
|171017|$61.00 |Charming Terrace Flat with Veranda, 15 min Centre |4.65                  |

In [9]:
high_score_listings.dropna().show(20, truncate=False) #DROPNA -> DROPS ANY NULL DATA FROM THE LIST

+------+-------+--------------------------------------------------+----------------------+
|id    |price  |name                                              |review_scores_location|
+------+-------+--------------------------------------------------+----------------------+
|23986 |$180.00|" Characteristic Milanese flat"                   |4.69                  |
|101087|$80.00 |Flat " Chiesa Rossa2 in Milan                     |4.75                  |
|122514|$60.00 |Brand new apartment in Milan                      |4.78                  |
|141833|$143.00|Piccolo Attico Duomo // 3-18 months stay contract |4.9                   |
|143329|$71.00 |Apartment corso Buenos Aires milan                |4.89                  |
|153925|$95.00 |Stay in Style Solari                              |4.7                   |
|155720|$44.00 |matrimoni red room on Canals center               |4.64                  |
|171017|$61.00 |Charming Terrace Flat with Veranda, 15 min Centre |4.65                  |

In [11]:
high_score_listings.schema['price']

StructField('price', StringType(), True)

In [13]:
# create a column called price_num, replace every $ and , to null and cast the string price to a float type

from pyspark.sql.functions import regexp_replace

price_num_df = listings \
    .withColumn('price_num', regexp_replace('price', '[$,]', '').cast('float')) \

price_num_df.schema['price_num']

StructField('price_num', FloatType(), True)

In [14]:
price_num_df \
    .select('price_num', 'name') \
    .show(20, truncate=False)

+---------+-------------------------------------------------+
|price_num|name                                             |
+---------+-------------------------------------------------+
|180.0    |" Characteristic Milanese flat"                  |
|NULL     |Giacinto Cosy & clean flat near MM1              |
|86.0     |Nico & Cinzia's Pink Suite!                      |
|36.0     |Nice room with private bathroom                  |
|250.0    |Great comfortable & quite APT. x 4 people!       |
|80.0     |Flat " Chiesa Rossa2 in Milan                    |
|63.0     |Open Space Loft Navigli Area Milano              |
|60.0     |Brand new apartment in Milan                     |
|143.0    |Piccolo Attico Duomo // 3-18 months stay contract|
|71.0     |Apartment corso Buenos Aires milan               |
|95.0     |Stay in Style Solari                             |
|44.0     |matrimoni red room on Canals center              |
|96.0     |downtown, Rho Fiera in 30 min directly by train  |
|61.0   

In [15]:
price_num_df.filter((price_num_df.price_num < 100) & (price_num_df.review_scores_location > 4.5)) \
    .select('name', 'price', 'review_scores_location') \
    .show(truncate=False)

+--------------------------------------------------+------+----------------------+
|name                                              |price |review_scores_location|
+--------------------------------------------------+------+----------------------+
|Flat " Chiesa Rossa2 in Milan                     |$80.00|4.75                  |
|Brand new apartment in Milan                      |$60.00|4.78                  |
|Apartment corso Buenos Aires milan                |$71.00|4.89                  |
|Stay in Style Solari                              |$95.00|4.7                   |
|matrimoni red room on Canals center               |$44.00|4.64                  |
|Charming Terrace Flat with Veranda, 15 min Centre |$61.00|4.65                  |
|private room with private bathroom                |$90.00|4.9                   |
|A corner of London in Milan                       |$99.00|4.75                  |
|Stanza Blu in Milan - Navigli Area                |$43.00|4.63                  |
|Cen

In [16]:
price_num_df.filter('price_num < 100 AND review_scores_location > 4.5') \
    .select('name', 'price', 'review_scores_location') \
    .show(truncate=False)

+--------------------------------------------------+------+----------------------+
|name                                              |price |review_scores_location|
+--------------------------------------------------+------+----------------------+
|Flat " Chiesa Rossa2 in Milan                     |$80.00|4.75                  |
|Brand new apartment in Milan                      |$60.00|4.78                  |
|Apartment corso Buenos Aires milan                |$71.00|4.89                  |
|Stay in Style Solari                              |$95.00|4.7                   |
|matrimoni red room on Canals center               |$44.00|4.64                  |
|Charming Terrace Flat with Veranda, 15 min Centre |$61.00|4.65                  |
|private room with private bathroom                |$90.00|4.9                   |
|A corner of London in Milan                       |$99.00|4.75                  |
|Stanza Blu in Milan - Navigli Area                |$43.00|4.63                  |
|Cen

In [18]:
# LIST ALL TYPES OF PROPERTIES AND DISTINCT() REMOVE DUPLICATE ROWS

listings \
    .select(listings.property_type) \
    .distinct() \
    .show(truncate=False)

[Stage 16:>                                                         (0 + 1) / 1]

+-------------------------------+
|property_type                  |
+-------------------------------+
|Private room in loft           |
|Entire chalet                  |
|Entire rental unit             |
|Private room in minsu          |
|Shared room in hostel          |
|Private room in condo          |
|Room in boutique hotel         |
|Room in bed and breakfast      |
|Private room in casa particular|
|Entire cabin                   |
|Private room in nature lodge   |
|Entire guest suite             |
|Private room in home           |
|Entire place                   |
|Camper/RV                      |
|Tiny home                      |
|Entire vacation home           |
|Private room in camper/rv      |
|Private room in hostel         |
|Lighthouse                     |
+-------------------------------+
only showing top 20 rows


                                                                                

In [22]:
# SAVE THE RESULT IN A FILE ON /DATA

listings \
    .select(listings.property_type) \
    .distinct() \
    .write \
    .csv('data/property_types')

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/home/msr/Data-Engineering-Bootcamp/projects/airbnb-data-filtering/data/property_types already exists. Set mode as "overwrite" to overwrite the existing path. SQLSTATE: 42K04