In [66]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, translate

In [4]:
# Build (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("Spark_ML_APP")
    .enableHiveSupport()  # Hive support is needed to create tables
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and lower log verbosity to WARN.
sc = spark.sparkContext
sc.setLogLevel("WARN")

In [8]:
# Relative path to the sf-airbnb CSV file read by the cells below.
filepath = "../data/sf-airbnb.csv"

In [34]:
# Read the CSV once to capture its column layout as a reusable schema.
# NOTE: "samplingRatio" only influences schema inference, which is switched on by
# the "inferSchema" option. That option is not set here, so every column comes
# back as a string (see the printed schema below).
sample_df = (
    spark.read
    .option("samplingRatio", 0.001)  # key was misspelled "samplingRation" and silently ignored
    .option("header", True)
    .option("multiLine", True)  # a single record may span several lines
    .csv(filepath)
)
sample_df.printSchema()
airbnb_schema = sample_df.schema
print(len(airbnb_schema.fieldNames())) # -> 106


root
 |-- id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: string (nullable = true)
 |-- last_scraped: string (nullable = true)
 |-- name: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- space: string (nullable = true)
 |-- description: string (nullable = true)
 |-- experiences_offered: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- notes: string (nullable = true)
 |-- transit: string (nullable = true)
 |-- access: string (nullable = true)
 |-- interaction: string (nullable = true)
 |-- house_rules: string (nullable = true)
 |-- thumbnail_url: string (nullable = true)
 |-- medium_url: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- xl_picture_url: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable

In [60]:
# Peek at the first `amenities` value. first() fetches a single row instead of
# collecting the whole column to the driver just to index element 0.
# The value starts with a stray double quote and is cut off after the second
# item, i.e. we are losing data when the file is read without an escape option.
sample_df.select('amenities').first()

Row(amenities='"{TV,""Cable TV""')

In [55]:
# Load the sf-airbnb data directly from the CSV file, applying the schema
# captured from the sample read.
#   multiLine -> a single record may span multiple lines of the file.
#   escape    -> use '"' as the escape character inside quoted values, so the
#                doubled quotes in fields such as `amenities` are parsed
#                correctly instead of truncating the value.
airbnb_df = (
    spark.read
    .schema(airbnb_schema)
    .option("header", True)
    .option("multiLine", True)
    .option("escape", '"')
    .csv(filepath)
)

In [59]:
# Check the first `amenities` value again, now read with escape='"'.
# first() fetches a single row rather than collecting the entire column.
airbnb_df.select('amenities').first()

Row(amenities='{TV,"Cable TV",Internet,Wifi,Kitchen,"Pets live on this property",Dog(s),Heating,"Family/kid friendly",Washer,Dryer,"Smoke detector","Carbon monoxide detector","First aid kit",Essentials,Shampoo,"24-hour check-in",Hangers,"Hair dryer",Iron,"Laptop friendly workspace","Self check-in",Keypad,"Private entrance","Pack ’n Play/travel crib","Room-darkening shades"}')

In [61]:
# The dataset has many columns; keep only the subset used in the book.
# The order below is the column order of the resulting DataFrame.
columnsToKeep = (
    # host / booking attributes
    [
        "host_is_superhost",
        "cancellation_policy",
        "instant_bookable",
        "host_total_listings_count",
    ]
    # location
    + ["neighbourhood_cleansed", "latitude", "longitude"]
    # property description
    + [
        "property_type",
        "room_type",
        "accommodates",
        "bathrooms",
        "bedrooms",
        "beds",
        "bed_type",
        "minimum_nights",
    ]
    # reviews
    + ["number_of_reviews"]
    + [
        "review_scores_" + aspect
        for aspect in (
            "rating",
            "accuracy",
            "cleanliness",
            "checkin",
            "communication",
            "location",
            "value",
        )
    ]
    # price column
    + ["price"]
)

In [62]:
# Narrow the frame to the chosen columns, mark it as cached, and run a count()
# action on the cached frame before displaying it.
airbnb_df = airbnb_df.select(*columnsToKeep)
airbnb_df.cache()
airbnb_df.count()
display(airbnb_df)

DataFrame[host_is_superhost: string, cancellation_policy: string, instant_bookable: string, host_total_listings_count: string, neighbourhood_cleansed: string, latitude: string, longitude: string, property_type: string, room_type: string, accommodates: string, bathrooms: string, bedrooms: string, beds: string, bed_type: string, minimum_nights: string, number_of_reviews: string, review_scores_rating: string, review_scores_accuracy: string, review_scores_cleanliness: string, review_scores_checkin: string, review_scores_communication: string, review_scores_location: string, review_scores_value: string, price: string]

Several quantitative columns are stored as strings, e.g. host_total_listings_count, bedrooms, price, etc.

In [63]:
# Inspect the first host_total_listings_count value (still a string here).
# first() fetches a single row rather than collecting the entire column.
airbnb_df.select("host_total_listings_count").first()

Row(host_total_listings_count='1')

In [64]:
# Inspect the first price value: a string with a leading "$".
# first() fetches a single row rather than collecting the entire column.
airbnb_df.select("price").first()

Row(price='$170.00')

In [70]:
# Convert a few string columns to numeric types:
#   - host_total_listings_count, bedrooms -> int
#   - price -> double, after removing the "$" and "," characters
# More cleaning is possible, but this is only a proof of concept.
airbnb_df = (
    airbnb_df
    .withColumn("host_total_listings_count", col("host_total_listings_count").cast("int"))
    .withColumn("bedrooms", col("bedrooms").cast("int"))
    .withColumn("price", translate(col("price"), "$,", "").cast("double"))
)
display(airbnb_df)


DataFrame[host_is_superhost: string, cancellation_policy: string, instant_bookable: string, host_total_listings_count: int, neighbourhood_cleansed: string, latitude: string, longitude: string, property_type: string, room_type: string, accommodates: string, bathrooms: string, bedrooms: int, beds: string, bed_type: string, minimum_nights: string, number_of_reviews: string, review_scores_rating: string, review_scores_accuracy: string, review_scores_cleanliness: string, review_scores_checkin: string, review_scores_communication: string, review_scores_location: string, review_scores_value: string, price: double]

In [77]:
# Summary statistics for the numeric price column only.
(
    airbnb_df
    .select("price")
    .summary()
    .show()
)

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|              7151|
|   mean| 213.6540344007831|
| stddev|313.28222046853125|
|    min|               0.0|
|    25%|             100.0|
|    50%|             150.0|
|    75%|             235.0|
|    max|           10000.0|
+-------+------------------+



In [79]:
# Summary statistics for every kept column; the statistics listed are the ones
# summary() computes when called with no arguments.
airbnb_df.summary(
    "count", "mean", "stddev", "min", "25%", "50%", "75%", "max"
).show()

[Stage 58:>                                                         (0 + 1) / 1]

+-------+-----------------+-------------------+----------------+-------------------------+----------------------+--------------------+--------------------+-------------+---------------+------------------+------------------+------------------+------------------+--------+------------------+-----------------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+------------------+
|summary|host_is_superhost|cancellation_policy|instant_bookable|host_total_listings_count|neighbourhood_cleansed|            latitude|           longitude|property_type|      room_type|      accommodates|         bathrooms|          bedrooms|              beds|bed_type|    minimum_nights|number_of_reviews|review_scores_rating|review_scores_accuracy|review_scores_cleanliness|review_scores_checkin|review_scores_communication|review_scores_location|review_scores_value|             price|
+-------+-----------

                                                                                