# Aggregating DataFrames in PySpark HW

First let's start up our PySpark instance

In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("aggregate").getOrCreate()

spark

## Read in the dataFrame for this Notebook

In [4]:
airbnb = spark.read.csv('../data/nyc_air_bnb.csv',inferSchema=True,header=True)


## About this dataset

This dataset describes the listing activity and metrics for Air BNB bookers in NYC, NY for 2019. Each line in the dataset is a booking. 

**Source:** https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/data

Let's go ahead and view the first few records of the dataset so we know what we are working with.

In [5]:
airbnb.limit(5).toPandas()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


Now print the schema so we can make sure all the variables have the correct types

In [6]:
airbnb.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- number_of_reviews: string (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- availability_365: integer (nullable = true)



Notice here that some of the columns that are obviously numeric have been incorrectly identified as "strings". Let's edit that. Otherwise we cannot aggregate any of the numeric columns.

In [8]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

df = airbnb.withColumn("id",airbnb.id.cast(IntegerType()))\
        .withColumn("host_id",airbnb.host_id.cast(IntegerType()))\
        .withColumn("latitude",airbnb.latitude.cast(FloatType()))\
        .withColumn("longitude",airbnb.longitude.cast(FloatType()))\
        .withColumn("price",airbnb.price.cast(IntegerType()))\
        .withColumn("minimum_nights",airbnb.minimum_nights.cast(IntegerType()))\
        .withColumn("number_of_reviews",airbnb.number_of_reviews.cast(IntegerType()))\
        .withColumn("reviews_per_month",airbnb.reviews_per_month.cast(FloatType()))\
        .withColumn("calculated_host_listings_count",airbnb.calculated_host_listings_count.cast(IntegerType()))\
        .withColumn("last_review",airbnb.last_review.cast(DateType()))


### Alright now we are ready to dig in!


### 1. How many rows are in this dataset?

In [9]:
#number of rows = counts
df.count()

49079

In [12]:
df.createOrReplaceTempView("airbnb") #for sql practice

### 2. How many total reviews does each host have?

In [10]:
df.columns

['id',
 'name',
 'host_id',
 'host_name',
 'neighbourhood_group',
 'neighbourhood',
 'latitude',
 'longitude',
 'room_type',
 'price',
 'minimum_nights',
 'number_of_reviews',
 'last_review',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [16]:
spark.sql("SELECT host_id,sum(number_of_reviews) as review from airbnb group by host_id").show()

+--------+------+
| host_id|review|
+--------+------+
|  291112|    35|
| 1384111|   103|
| 1597481|    13|
| 2108853|    18|
| 2429432|    27|
| 2530670|   134|
| 3432742|     2|
| 1360296|    13|
| 2124690|     1|
| 6414252|     1|
| 9637768|    47|
| 9947836|    87|
| 9430366|   159|
| 7974574|     5|
| 5907325|    32|
|13749425|   198|
| 5771331|    20|
| 9784206|   204|
| 4702135|     0|
|19239110|    25|
+--------+------+
only showing top 20 rows



In [21]:
df.groupBy("host_id").agg(sum("number_of_reviews").alias("reviews")).show()

+--------+-------+
| host_id|reviews|
+--------+-------+
|  291112|     35|
| 1384111|    103|
| 1597481|     13|
| 2108853|     18|
| 2429432|     27|
| 2530670|    134|
| 3432742|      2|
| 1360296|     13|
| 2124690|      1|
| 6414252|      1|
| 9637768|     47|
| 9947836|     87|
| 9430366|    159|
| 7974574|      5|
| 5907325|     32|
|13749425|    198|
| 5771331|     20|
| 9784206|    204|
| 4702135|      0|
|19239110|     25|
+--------+-------+
only showing top 20 rows



### 3. Show the min and max of all the numeric variables in the dataset

In [26]:
df.select("price","minimum_nights","reviews_per_month","calculated_host_listings_count","availability_365").summary("min","max").toPandas()

Unnamed: 0,summary,price,minimum_nights,reviews_per_month,calculated_host_listings_count,availability_365
0,min,-74,0,0.0,0,0
1,max,10000,1250,58.5,365,365


In [30]:
spark.sql("SELECT min(price),min(calculated_host_listings_count) from airbnb").show() #inefficient..

+----------+-----------------------------------+
|min(price)|min(calculated_host_listings_count)|
+----------+-----------------------------------+
|       -74|                                  0|
+----------+-----------------------------------+



### 4. Which host had the highest number of reviews?

Only display the top result.

Bonus: format the column names

In [31]:
#sql
spark.sql("SELECT host_id,host_name from airbnb where number_of_reviews = (select max(number_of_reviews) from airbnb)").show()

+--------+---------+
| host_id|host_name|
+--------+---------+
|47621202|     Dona|
+--------+---------+



In [53]:
df.select("host_id","host_name").orderBy(df.number_of_reviews.desc()).limit(1).show()

#not quite accurate
                                        



+--------+---------+
| host_id|host_name|
+--------+---------+
|47621202|     Dona|
+--------+---------+



### 5. On average, how many nights did most hosts specify for a minimum?

In [64]:
#using expr
df.select(expr("round(avg(minimum_nights))").alias("average minimum stay")).show()

+--------------------+
|average minimum stay|
+--------------------+
|                 7.0|
+--------------------+



In [91]:
#using summary
df.select(df.minimum_nights).summary("mean").select(round("minimum_nights").alias("minimum")).show()

+-------+
|minimum|
+-------+
|    7.0|
+-------+



In [84]:
#sql
spark.sql("SELECT round(AVG(MINIMUM_NIGHTS)) as MINIMUM FROM AIRBNB").show()

+-------+
|MINIMUM|
+-------+
|    7.0|
+-------+



### 6. What is the most expensive neighborhood to stay in on average?

Note: only show the one result

In [121]:
#using df
df.groupBy("neighbourhood").agg(mean(df.price)).orderBy(desc("avg(price)")).limit(1).show()

+--------------+----------+
| neighbourhood|avg(price)|
+--------------+----------+
|Fort Wadsworth|     800.0|
+--------------+----------+



In [127]:
#sql
spark.sql("select neighbourhood,avg(price) from airbnb group by neighbourhood order by avg(price) desc limit 1").show()

+--------------+----------+
| neighbourhood|avg(price)|
+--------------+----------+
|Fort Wadsworth|     800.0|
+--------------+----------+



### 7. Display a two by two table that shows the average prices by room type (private and shared only) and neighborhood group (Manhattan and Brooklyn only)

In [164]:
df.select(regexp_extract(df.room_type,"[a-zA-Z _]*",0)).distinct().show()

+-----------------------------------------+
|regexp_extract(room_type, [a-zA-Z _]*, 0)|
+-----------------------------------------+
|                              Shared room|
|                                     null|
|                             Private room|
|                                         |
|                              Entire home|
|                                   Howard|
+-----------------------------------------+



In [260]:
#df
print(df.filter((df.room_type  == "Shared room") | (df.room_type== "Private room"))\
        .groupBy("room_type")\
        .pivot("neighbourhood_group",["Manhattan","Brooklyn"])\
        .mean("price").toPandas())


#or...
print(df.filter("room_type in ('Shared room','Private room')").groupBy("room_type").pivot("neighbourhood_group",['Manhattan' , 'Brooklyn']).mean("price").toPandas())

      room_type   Manhattan   Brooklyn
0   Shared room   89.069038  50.527845
1  Private room  116.054003  76.472340
      room_type   Manhattan   Brooklyn
0   Shared room   89.069038  50.527845
1  Private room  116.054003  76.472340


In [221]:
df.columns

['id',
 'name',
 'host_id',
 'host_name',
 'neighbourhood_group',
 'neighbourhood',
 'latitude',
 'longitude',
 'room_type',
 'price',
 'minimum_nights',
 'number_of_reviews',
 'last_review',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [248]:
#sql

spark.sql("""SELECT * FROM 
                (SELECT room_type,neighbourhood_group,price from airbnb 
                WHERE room_type in ('Shared room','Private room') and 
                neighbourhood_group in ('Manhattan' , 'Brooklyn')
                ) as air
    
    PIVOT (
        avg(price) AS p
        FOR neighbourhood_group IN ('Manhattan' , 'Brooklyn'  )) 
        
    
    """).toPandas()



#much more difficult..


Unnamed: 0,room_type,Manhattan,Brooklyn
0,Shared room,89.069038,50.527845
1,Private room,116.054003,76.47234


In [250]:
#just to give you a better idea of how you pivot things..

spark.sql("""SELECT * FROM VALUES 
(101,'A',1000.01),
(101,'B',2000),
(101,'C',5000),
(102,'A',2000.01),
(102,'B',4000.1),
(103,'A',2000.01),
(103,'B',4000.1),
(101,'A',3000.01)
AS Sales(Employee,Product,Amount)
PIVOT (
SUM(Amount) AS amt, COUNT(Amount) AS cnt
FOR Product IN ( 'A' AS a, 'B' as b, 'C' AS c)
)
""").show()



#before you select, things must be there for the taking 
#so PIVOT clause can access. 


+--------+-------+-----+-------+-----+-------+-----+
|Employee|  a_amt|a_cnt|  b_amt|b_cnt|  c_amt|c_cnt|
+--------+-------+-----+-------+-----+-------+-----+
|     101|4000.02|    2|2000.00|    1|5000.00|    1|
|     103|2000.01|    1|4000.10|    1|   null| null|
|     102|2000.01|    1|4000.10|    1|   null| null|
+--------+-------+-----+-------+-----+-------+-----+



### Alright that's all folks!

### Great job!