In [1]:
import findspark

In [2]:
# SPARK_HOME for this environment is /opt/manual/spark; hand it to findspark
# so that the pyspark package can be imported in the cells below.
findspark.init(spark_home="/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F

In [4]:
# Create (or reuse) the SparkSession for this notebook: application name
# "Aggregations", local master with two threads.
spark = (
    SparkSession.builder
    .appName("Aggregations")
    .master("local[2]")
    .getOrCreate()
)

In [5]:
# Data source: https://www.kaggle.com/jiashenliu/515k-hotel-reviews-data-in-europe

In [6]:
#! wget -P ~/datasets \
#https://github.com/erkansirin78/datasets/raw/master/Hotel_Reviews.csv.gz

In [7]:
# List ~/datasets and keep only the lines that mention "Hotel" to check that
# the review archive is present.
! ls -l ~/datasets | grep Hotel

-rw-rw-r--. 1 train train  46401315 Oct  6 12:18 Hotel_Reviews.csv.gz


In [8]:
# Import only the type classes this notebook uses instead of a wildcard.
# ArrayType is not needed for the schema itself; it is used later when the
# Tags column is converted to an array.
from pyspark.sql.types import (
    ArrayType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema for Hotel_Reviews.csv. Every column is declared nullable.
programmatical_schema = StructType([
    StructField("Hotel_Address", StringType(), True),
    StructField("Additional_Number_of_Scoring", IntegerType(), True),
    StructField("Review_Date", StringType(), True),
    StructField("Average_Score", FloatType(), True),
    StructField("Hotel_Name", StringType(), True),
    StructField("Reviewer_Nationality", StringType(), True),
    StructField("Negative_Review", StringType(), True),
    StructField("Review_Total_Negative_Word_Counts", IntegerType(), True),
    StructField("Total_Number_of_Reviews", IntegerType(), True),
    StructField("Positive_Review", StringType(), True),
    StructField("Review_Total_Positive_Word_Counts", IntegerType(), True),
    StructField("Total_Number_of_Reviews_Reviewer_Has_Given", IntegerType(), True),
    StructField("Reviewer_Score", FloatType(), True),
    StructField("Tags", StringType(), True),
    StructField("days_since_review", StringType(), True),
    StructField("lat", FloatType(), True),
    StructField("lng", FloatType(), True),
])

# Tags should really be StructField("Tags", ArrayType(StringType()), True),
# but CSV cannot store an array type, so it has to be read as StringType.
#
# Review_Date is intentionally left as StringType for the moment although it
# should be DateType; it is converted once the schema has been applied to
# the data.

In [9]:
# Read the gzip-compressed CSV from the local filesystem. The file has a
# header row, and column types come from programmatical_schema.
df = (
    spark.read
    .option("header", True)
    .schema(programmatical_schema)
    .option("compression", "gzip")
    .csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")
)

In [10]:
# Fix the two columns that were loaded as plain strings:
#   * Tags        -> array<string>, produced by splitting the text on commas
#   * Review_Date -> date, parsed with the pattern M/d/yyyy
# NOTE(review): splitting on "," alone does not remove the list brackets or
# the quotes present in the raw Tags text, so they stay inside the array
# elements. Strip them first if clean tag values are needed.
df2 = (
    df
    .withColumn(
        "Tags",
        F.split(F.col("Tags"), ",").cast(ArrayType(StringType())),
    )
    .withColumn(
        "Review_Date",
        F.to_date(F.col("Review_Date"), "M/d/yyyy"),
    )
)

In [11]:
# Preview the first two rows as a pandas DataFrame.
df2.limit(2).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968


In [12]:
# Print the schema to check the Tags and Review_Date types after conversion.
df2.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: date (nullable = true)
 |-- Average_Score: float (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: float (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lng: float (nullable = true)



# COUNT, AVG, MIN, MAX

In [13]:
# Find the earliest and latest Review_Date.
# Both calls use the column's exact name so the query does not rely on
# Spark's case-insensitive column resolution.
df2.select(F.min("Review_Date"), F.max("Review_Date")).show(3)

+----------------+----------------+
|min(Review_date)|max(Review_Date)|
+----------------+----------------+
|      2015-08-04|      2017-08-03|
+----------------+----------------+



In [14]:
# The same min/max query written as SQL expressions with explicit aliases.
df2.selectExpr(
    "MIN(Review_Date) AS Earliest_RD",
    "MAX(Review_Date) AS Latest_RD",
).show(3)

+-----------+----------+
|Earliest_RD| Latest_RD|
+-----------+----------+
| 2015-08-04|2017-08-03|
+-----------+----------+



In [15]:
# Average word count of the negative and of the positive review texts.
df2.select(
    F.avg("Review_Total_Negative_Word_Counts").alias("AvgTNWC"),
    F.avg("Review_Total_Positive_Word_Counts").alias("AvgTPWC"),
).show(3)

+------------------+-----------------+
|           AvgTNWC|          AvgTPWC|
+------------------+-----------------+
|18.539450263505888|17.77645820164502|
+------------------+-----------------+



# GROUP BY

In [17]:
# Sum of Average_Score grouped by the year of Review_Date.
# (This is a sum of scores, not a count of reviews.)
(
    df2
    .select("Review_Date", "Average_Score")
    .withColumn("Year", F.year("Review_Date"))
    .groupBy("Year")
    .agg(F.sum("Average_Score").alias("Total_Average_Score"))
    .show()
)

+----+-------------------+
|Year|Total_Average_Score|
+----+-------------------+
|2015|  791701.2014770508|
|2016| 2221477.5038752556|
|2017| 1317724.4031033516|
+----+-------------------+



In [None]:
# Number of reviews per year of Review_Date.
(
    df2
    .select("Review_Date")
    .withColumn("Year", F.year("Review_Date"))
    .groupBy("Year")
    .agg(F.count("*").alias("Total_Reviews"))
    .show()
)

# ORDER BY

In [18]:
# Total reviews per year, sorted by the count in ascending order.
(
    df2
    .select("Review_Date")
    .withColumn("Year", F.year("Review_Date"))
    .groupBy("Year")
    .agg(F.count("*").alias("Total_Reviews"))
    .orderBy("Total_Reviews")
    .show()
)

+----+-------------+
|Year|Total_Reviews|
+----+-------------+
|2015|        94527|
|2017|       156808|
|2016|       264403|
+----+-------------+



In [19]:
# Same per-year review count, sorted with the largest count first.
(
    df2
    .select("Review_Date")
    .withColumn("Year", F.year("Review_Date"))
    .groupBy("Year")
    .agg(F.count("*").alias("Total_Reviews"))
    .orderBy(F.desc("Total_Reviews"))
    .show()
)

+----+-------------+
|Year|Total_Reviews|
+----+-------------+
|2016|       264403|
|2017|       156808|
|2015|        94527|
+----+-------------+



## multiple aggregation functions


In [20]:
# Two aggregations in one pass: number of reviews and average Reviewer_Score
# per year, sorted with the largest review count first.
(
    df2
    .select("Review_Date", "Reviewer_Score")
    .withColumn("Year", F.year("Review_Date"))
    .groupBy("Year")
    .agg(
        F.count("*").alias("Total_Reviews"),
        F.avg("Reviewer_Score").alias("AVG_Reviewer_Score"),
    )
    .orderBy(F.desc("Total_Reviews"))
    .show()
)

+----+-------------+------------------+
|Year|Total_Reviews|AVG_Reviewer_Score|
+----+-------------+------------------+
|2016|       264403| 8.424700240521899|
|2017|       156808|  8.39047886745722|
|2015|        94527| 8.319843073781177|
+----+-------------+------------------+



In [21]:
# Review count and average Reviewer_Score per day of the week, sorted by the
# average score in descending order.
# F.dayofweek numbers the days from 1 (Sunday) to 7 (Saturday).
(
    df2
    .select("Review_Date", "Reviewer_Score")
    .withColumn("Day", F.dayofweek("Review_Date"))
    .groupBy("Day")
    .agg(
        F.count("*").alias("Total_Reviews"),
        F.avg("Reviewer_Score").alias("AVG_Reviewer_Score"),
    )
    .orderBy(F.desc("AVG_Reviewer_Score"))
    .show()
)

+---+-------------+------------------+
|Day|Total_Reviews|AVG_Reviewer_Score|
+---+-------------+------------------+
|  3|       120948| 8.444183518088053|
|  4|        58591| 8.405661343067354|
|  2|        81145| 8.392337241050269|
|  1|        83981| 8.390862285826119|
|  6|        44732| 8.373535788875781|
|  7|        51833| 8.371462261789713|
|  5|        74508| 8.344132239579093|
+---+-------------+------------------+



In [22]:
# What if we want day names instead of day numbers?
# Formatting the date with the pattern 'E' gives the abbreviated day name,
# which is then used as the grouping key.
(
    df2
    .select(
        "Review_Date",
        F.date_format("Review_Date", "E").alias("Day_of_Review"),
        "Reviewer_Score",
    )
    .groupBy("Day_of_Review")
    .agg(
        F.count("*").alias("Total_Reviews"),
        F.avg("Reviewer_Score").alias("AVG_Reviewer_Score"),
    )
    .orderBy(F.desc("AVG_Reviewer_Score"))
    .show()
)

+-------------+-------------+------------------+
|Day_of_Review|Total_Reviews|AVG_Reviewer_Score|
+-------------+-------------+------------------+
|          Tue|       120948| 8.444183518088053|
|          Wed|        58591| 8.405661343067354|
|          Mon|        81145| 8.392337241050269|
|          Sun|        83981| 8.390862285826119|
|          Fri|        44732| 8.373535788875781|
|          Sat|        51833| 8.371462261789713|
|          Thu|        74508| 8.344132239579093|
+-------------+-------------+------------------+



In [24]:
# Stop the SparkSession now that the analysis is finished.
spark.stop()