In [0]:
airbnb_df = spark.read.format('csv').option("header","true").option("inferSchema","true").load('/FileStore/tables/listings-1.csv')
display(airbnb_df.limit(5))

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2818,Quiet Garden View Room & Super Fast WiFi,3159,Daniel,,Oostelijk Havengebied - Indische Buurt,52.36575,4.94142,Private room,59,3,278,2020-02-14,1.98,1.0,0
20168,Studio with private bathroom in the centre 1,59484,Alexander,,Centrum-Oost,52.36424,4.89396,Private room,236,1,340,2020-04-09,2.63,2.0,0
25428,Lovely apt in City Centre (w.lift) near Jordaan,56142,Joan,,Centrum-West,52.37297,4.88339,Entire home/apt,125,14,5,2020-02-09,0.15,1.0,58
27886,"Romantic, stylish B&B houseboat in canal district",97647,Flip,,Centrum-West,52.38761,4.89188,Private room,138,2,219,2020-07-25,2.05,1.0,158
28871,Comfortable double room,124245,Edwin,,Centrum-West,52.36719,4.89092,Private room,75,2,336,2020-09-20,2.72,2.0,340


In [0]:
airbnb_df.count()

In [0]:
airbnb_df.dtypes

In [0]:
#changing the schema of original dataset by casting the particular columns

from pyspark.sql.types import IntegerType
airbnb_df = airbnb_df.withColumn("host_id", airbnb_df["host_id"].cast(IntegerType())).withColumn("id",airbnb_df["id"].cast(IntegerType()))
airbnb_df.dtypes

In [0]:
#adding new columns storing true/false values for integerType check on columns -> id/host_id 

import pyspark.sql.functions as F

airbnb_df = airbnb_df.withColumn(
  "value_host_id",
  F.col("host_id").cast("int").isNotNull()).withColumn("value_id",F.col("id").cast("int").isNotNull())

airbnb_df.printSchema()

In [0]:
from pyspark.sql.functions import when, count, col
null_df = airbnb_df.select([count(when(col(c).isNull(), c)).alias(c) for c in 
           airbnb_df.columns])
display(null_df)

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,value_host_id,value_id
26,35,53,104,18781,31,26,26,26,26,26,29,2316,2313,26,48,0,0


In [0]:
temp = airbnb_df.na.fill(-1)
display(temp.filter((temp.id == -1) & (temp.host_id == -1)))
#airbnb_df.count()

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,value_host_id,value_id
-1,1344947.0,-1,,De Pijp - Rivierenbuurt,52.34426,4.88722,Private room,155.0,2.0,90,2020-08-15,0.87,2.0,353.0,-1,False,False
-1,29582869.0,-1,,Noord-West,52.41495,4.88927,Entire home/apt,190.0,7.0,1,2016-08-16,0.02,1.0,0.0,-1,False,False
-1,58563168.0,-1,,Zuid,52.35344,4.87972,Entire home/apt,280.0,1.0,6,2017-05-06,0.11,1.0,0.0,-1,False,False
-1,79619999.0,-1,,Noord-Oost,52.39515,4.95879,Entire home/apt,35.0,1.0,12,2017-08-22,0.24,1.0,0.0,-1,False,False
-1,123847675.0,-1,,De Aker - Nieuw Sloten,52.34776,4.77764,Entire home/apt,185.0,5.0,3,2018-08-03,0.07,1.0,0.0,-1,False,False
-1,28290949.0,-1,,De Baarsjes - Oud-West,52.37031,4.86755,Entire home/apt,221.0,2.0,13,2019-04-18,0.31,1.0,285.0,-1,False,False
-1,129374601.0,-1,,Bos en Lommer,52.38095,4.85233,Entire home/apt,75.0,1.0,1,2017-08-12,0.03,1.0,0.0,-1,False,False
-1,96060378.0,-1,,Noord-Oost,52.40619,4.93352,Entire home/apt,187.0,4.0,4,2018-11-04,0.1,1.0,167.0,-1,False,False
-1,4379827.0,-1,,Buitenveldert - Zuidas,52.32245,4.8591,Private room,80.0,1.0,20,2020-08-16,0.52,1.0,62.0,-1,False,False
-1,38505482.0,-1,,Zuid,52.34564,4.86213,Entire home/apt,90.0,3.0,0,,,1.0,282.0,-1,False,False


In [0]:
#display(temp.limit(5))
cleaned_airbnb_df = temp.filter((temp.id != -1) & (temp.host_id != -1))
cleaned_airbnb_df.count()

In [0]:
cleaned_airbnb_df.count()

In [0]:
'''airbnb_df.dropna(subset=("id","host_id"))
airbnb_df.count()'''

In [0]:
# creating DFs to store details where "id" & "host_id" is not of integer type

null_ids = airbnb_df.filter(F.col("value_id") == False )
null_host_id = airbnb_df.filter(F.col("value_host_id") == False )


In [0]:
#neighbourhood_group column is not completely null

display(airbnb_df.filter("neighbourhood_group is not null"))

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,value_host_id,value_id
,1344947,,,De Pijp - Rivierenbuurt,52.34426,4.88722,Private room,155.0,2,90,2020-08-15,0.87,2,353.0,,False,False
4713270.0,"""Houseboot """"Aphrodite""""",,36472542.0,Jamie,,Oud-Noord,52.40193,4.90184,Entire home/apt,149,2,9.0,2018-12-30,0.2,1.0,False,True
,29582869,,,Noord-West,52.41495,4.88927,Entire home/apt,190.0,7,1,2016-08-16,0.02,1,0.0,,False,False
,58563168,,,Zuid,52.35344,4.87972,Entire home/apt,280.0,1,6,2017-05-06,0.11,1,0.0,,False,False
,79619999,,,Noord-Oost,52.39515,4.95879,Entire home/apt,35.0,1,12,2017-08-22,0.24,1,0.0,,False,False
,123847675,,,De Aker - Nieuw Sloten,52.34776,4.77764,Entire home/apt,185.0,5,3,2018-08-03,0.07,1,0.0,,False,False
,28290949,,,De Baarsjes - Oud-West,52.37031,4.86755,Entire home/apt,221.0,2,13,2019-04-18,0.31,1,285.0,,False,False
,129374601,,,Bos en Lommer,52.38095,4.85233,Entire home/apt,75.0,1,1,2017-08-12,0.03,1,0.0,,False,False
18983615.0,"""Family Boathouse """"Bonnie""""",,517215.0,Rental Valley,,Centrum-West,52.3766,4.88607,Entire home/apt,90,1,15.0,2020-08-21,0.38,4.0,False,True
,96060378,,,Noord-Oost,52.40619,4.93352,Entire home/apt,187.0,4,4,2018-11-04,0.1,1,167.0,,False,False


In [0]:
#count(host_id) > 1 dataframe
airbnb_df.createOrReplaceTempView("host_id_count")
host_id_count = spark.sql("select host_id,count(*) as count from host_id_count group by host_id having count > 1 ")
display(host_id_count.limit(10))

host_id,count
55288093,3
231241796,2
21167882,14
56741226,2
15530883,3
39451523,2
107762360,2
118345810,3
130154926,2
151062320,2


In [0]:
import pyspark.sql.functions as f

max_booking_host_id = host_id_count.agg({"host_id":"max"}).collect()[0]
print (max_booking_host_id)

In [0]:
display(airbnb_df.filter(f.col("host_id") == 'Hotel'))

id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
"""",353565632,Hotel,,Centrum-Oost,52.36656,4.91127,Private room,41,1,32,2020-10-02,10.55,5,188.0,
"Single bed with private bathroom / toilet TV / coffee / tea / hairdryer / wardrobe / heating""",353565632,Hotel,,Centrum-Oost,52.36619,4.90889,Private room,31,1,46,2020-09-28,15.33,5,1.0,
