In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Bound after the wildcard imports so nothing they export can overwrite F.
from pyspark.sql import functions as F

In [0]:
# Load the silver-layer table that every aggregate in this notebook is derived from.
df = spark.read.table("airbnb1.silver.silver_data")

In [0]:
# Preview the source rows (same display() call form used by the other cells).
display(df)

#### Count of listings per `price_category`

In [0]:
# Number of rows in each price_category, largest group first.
price_category_count = (
    df.groupBy("price_category")
    .count()
    .orderBy(desc("count"))
)
display(price_category_count)

In [0]:
# Persist the counts as a Delta table in the gold schema, replacing previous contents.
(
    price_category_count.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("airbnb1.gold.price_category_count")
)

#### Pricing by room type

In [0]:
# Price statistics (average rounded to 2 decimals, min, max, row count) per room_type.
# The aggregates are called through the F namespace so it is explicit that these
# are Spark column functions and not the Python builtins round/min/max, which the
# wildcard import of pyspark.sql.functions silently shadows.
price_by_room_type = (
    df.groupBy("room_type")
    .agg(
        F.round(F.avg("price"), 2).alias("avg_price"),
        F.min("price").alias("min_price"),
        F.max("price").alias("max_price"),
        F.count("*").alias("count"),
    )
)
display(price_by_room_type)

In [0]:
# Persist the per-room-type price statistics as a gold Delta table (overwrite on re-run).
(
    price_by_room_type.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("airbnb1.gold.price_by_room_type")
)

#### Listing count per rating category

In [0]:
# Bucket the numeric rating into a label and attach it as rating_category.
# when() branches are evaluated in order, so each branch only needs its lower bound.
# A NULL rating makes every comparison NULL (treated as "no match"), which
# previously sent such rows to otherwise("Very Bad"); they now get their own
# label so missing ratings are not counted as very bad ones.
# NOTE(review): confirm whether silver_data can contain NULL ratings and whether
# "Not Rated" is the label downstream consumers should see.
df = df.withColumn(
    "rating_category",
    when(col("rating").isNull(), "Not Rated")
    .when(col("rating") >= 90, "Excellent")
    .when(col("rating") >= 80, "Good")
    .when(col("rating") >= 60, "Bad")
    .otherwise("Very Bad"),
)
display(df)

In [0]:
# Number of rows per rating_category, most frequent first; shown and then
# saved as a gold Delta table (overwritten on each run).
rating_count = (
    df.groupBy("rating_category")
    .count()
    .orderBy(desc("count"))
)
display(rating_count)
(
    rating_count.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("airbnb1.gold.rating_count")
)

#### Price statistics by rating category

In [0]:
# Price statistics (max, min, average rounded to 2 decimals, row count) per rating_category.
# Aggregates go through the F namespace so they cannot be confused with the
# Python builtins max/min/round that the wildcard import shadows.
price_rating = (
    df.groupBy("rating_category")
    .agg(
        F.max("price").alias("max_price"),
        F.min("price").alias("min_price"),
        F.round(F.avg("price"), 2).alias("avg_price"),
        F.count("*").alias("count"),
    )
)
display(price_rating)
# Persist as a gold Delta table, replacing previous contents.
(
    price_rating.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("airbnb1.gold.price_rating")
)

In [0]:
# Price statistics (max, min, average rounded to 2 decimals, row count) per host_name.
# Aggregates go through the F namespace so they cannot be confused with the
# Python builtins max/min/round that the wildcard import shadows.
# NOTE(review): host_name may not be unique per host; if the silver table carries
# a host identifier, grouping on it would avoid merging different hosts that
# share a name — confirm against the silver schema.
host_price = (
    df.groupBy("host_name")
    .agg(
        F.max("price").alias("max_price"),
        F.min("price").alias("min_price"),
        F.round(F.avg("price"), 2).alias("avg_price"),
        F.count("*").alias("count"),
    )
)
display(host_price)
# Persist as a gold Delta table, replacing previous contents.
(
    host_price.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("airbnb1.gold.host_price")
)


#### Count of listings per cancellation policy

In [0]:
# Number of rows per cancellation_policy, most frequent first; shown and then
# saved as a gold Delta table (overwritten on each run).
cancellation_count = (
    df.groupBy("cancellation_policy")
    .agg(count("*").alias("Cancellation_count"))
    .orderBy(col("Cancellation_count").desc())
)
display(cancellation_count)
(
    cancellation_count.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("airbnb1.gold.cancellation_count")
)

#### Maximum and minimum nights by room type

In [0]:
# Largest maximum_nights and smallest minimum_nights per room_type.
# Aggregates go through the F namespace so they cannot be confused with the
# Python builtins max/min that the wildcard import shadows.
nights_roomtype = (
    df.groupBy("room_type")
    .agg(
        F.max("maximum_nights").alias("max_nights"),
        F.min("minimum_nights").alias("min_nights"),
    )
)
display(nights_roomtype)
# Every other aggregate in this notebook is saved to the gold schema; this one
# was only displayed. Persist it the same way, using the variable name as the
# table name like the sibling cells do.
# NOTE(review): confirm the target table name with downstream consumers.
(
    nights_roomtype.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("airbnb1.gold.nights_roomtype")
)


In [0]:
# List the column names currently on df (this includes the derived
# rating_category column added earlier) before selecting a subset from it.
df.columns

In [0]:
# Keep only the listing attributes and the price column for the ml_df table.
ml_df = df.select(
    "property_type",
    "room_type",
    "accommodates",
    "bedrooms",
    "bathrooms",
    "number_of_reviews",
    "beds",
    "price",
)

In [0]:
# Preview the selected columns (same display() call form used by the other cells).
display(ml_df)

In [0]:
# Persist the selected columns as a gold Delta table, replacing previous contents.
(
    ml_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("airbnb1.gold.ml_df")
)