In [1]:
import findspark

In [2]:
# /opt/manual/spark: this is SPARK_HOME path
findspark.init("/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F

In [4]:
spark = SparkSession.builder \
.appName("Filter") \
.master("local[2]") \
.getOrCreate()

In [5]:
# Data source: https://www.kaggle.com/jiashenliu/515k-hotel-reviews-data-in-europe

In [6]:
#! wget -P ~/datasets \
#https://github.com/erkansirin78/datasets/raw/master/Hotel_Reviews.csv.gz

In [7]:
! ls -l ~/datasets | grep Hotel

-rw-rw-r--. 1 train train  46401315 Oct  6 12:18 Hotel_Reviews.csv.gz


In [8]:
from pyspark.sql.types import *

programmatical_schema = StructType([
        StructField("Hotel_Address",StringType(),True),
        StructField("Additional_Number_of_Scoring",IntegerType(),True),
        StructField("Review_Date",StringType(),True),
        StructField("Average_Score",FloatType(),True),
        StructField("Hotel_Name",StringType(),True),
        StructField("Reviewer_Nationality",StringType(),True),
        StructField("Negative_Review",StringType(),True),
        StructField("Review_Total_Negative_Word_Counts",IntegerType(),True),
        StructField("Total_Number_of_Reviews",IntegerType(),True),
        StructField("Positive_Review",StringType(),True),
        StructField("Review_Total_Positive_Word_Counts",IntegerType(),True),
        StructField("Total_Number_of_Reviews_Reviewer_Has_Given",IntegerType(),True),
        StructField("Reviewer_Score",FloatType(),True),
        StructField("Tags",StringType(),True),
        StructField("days_since_review",StringType(),True),
        StructField("lat",FloatType(),True),
        StructField("lng",FloatType(),True)
    ])

# StructField("Tags",ArrayType(StringType()),True)
# Actually Tags should be array but csv cannot store array type.
# So you have to define it as StringType 

# Review_Date is still StringType() and should be DateType() 
# But for the moment we intentioally leave it StringType()
# As soon as we put schema on data we will modify it.

In [9]:
df = spark.read.option("header", True) \
.schema(programmatical_schema) \
.option("compression","gzip") \
.csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")

In [10]:
# Now we can correct Tags datatype
# But we have to do additional preperation before cast with split.
# And cast Review_Date to date
df2 = df.withColumn("Tags", 
                     F.split(F.col("Tags"), ",")
                     .cast(ArrayType(StringType()))) \
.withColumn("Review_Date", F.to_date(F.col("Review_Date"),"M/d/yyyy"))

In [11]:
df2.limit(5).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[[' Leisure trip ', ' Family with young child...",3 days,52.360577,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[[' Leisure trip ', ' Solo traveler ', ' Dup...",3 days,52.360577,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-24,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[[' Leisure trip ', ' Couple ', ' Suite ', ...",10 days,52.360577,4.915968


In [12]:
df2.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: date (nullable = true)
 |-- Average_Score: float (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: float (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lng: float (nullable = true)



In [13]:
# where() is an alias for filter() and equivalent of SQL WHERE clause

In [13]:
# Filter Average_Score lt 6.0
df2.filter("Average_Score < 6.0 ").limit(5).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,Viale Bligny 56 Milan City Center 20136 Milan ...,26,2017-06-11,5.2,Hotel Liberty,United Kingdom,The room is really dirty dirt and dust are ev...,41,465,No Positive,0,2,2.5,"[[' Leisure trip ', ' Solo traveler ', ' Sin...",53 days,45.450867,9.194866
1,Viale Bligny 56 Milan City Center 20136 Milan ...,26,2017-06-04,5.2,Hotel Liberty,China,There is no wi fi in the rooms and no air con...,26,465,Good location quiet if rooms are on the inter...,17,1,4.6,"[[' Business trip ', ' Solo traveler ', ' Si...",60 days,45.450867,9.194866
2,Viale Bligny 56 Milan City Center 20136 Milan ...,26,2017-05-30,5.2,Hotel Liberty,United Kingdom,The hotel was advertised with air conditionin...,64,465,Great location with public transport links on...,24,1,7.5,"[[' Leisure trip ', ' Couple ', ' Double Roo...",65 days,45.450867,9.194866
3,Viale Bligny 56 Milan City Center 20136 Milan ...,26,2017-05-29,5.2,Hotel Liberty,Ireland,There was no hot water in my room and staff w...,87,465,Tram lines,3,2,3.3,"[[' Leisure trip ', ' Group ', ' Double Room...",66 days,45.450867,9.194866
4,Viale Bligny 56 Milan City Center 20136 Milan ...,26,2017-05-25,5.2,Hotel Liberty,South Africa,NO wifi in room You have to go to portal for ...,74,465,Breakfast was good,5,4,6.3,"[[' Leisure trip ', ' Couple ', ' Double Roo...",70 days,45.450867,9.194866


In [14]:
df2.filter(" Reviewer_Nationality == 'United Kingdom' ").limit(5).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng


In [15]:
df2.filter(" Reviewer_Nationality LIKE '%United Kingdom%' ").limit(5).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[[' Leisure trip ', ' Solo traveler ', ' Dup...",3 days,52.360577,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-17,7.7,Hotel Arena,United Kingdom,Cleaner did not change our sheet and duvet ev...,33,1403,The room is spacious and bright The hotel is ...,18,6,4.6,"[[' Leisure trip ', ' Group ', ' Duplex Twin...",17 days,52.360577,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-17,7.7,Hotel Arena,United Kingdom,Apart from the price for the brekfast Everyth...,11,1403,Good location Set in a lovely park friendly s...,19,1,10.0,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",17 days,52.360577,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-07,7.7,Hotel Arena,United Kingdom,Nothing all great,5,1403,Rooms were stunningly decorated and really sp...,101,2,10.0,"[[' Leisure trip ', ' Group ', ' Duplex Doub...",27 days,52.360577,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-06,7.7,Hotel Arena,United Kingdom,The floor in my room was filfy dirty Very bas...,28,1403,Comfy bed good location,6,7,4.6,"[[' Leisure trip ', ' Solo traveler ', ' Dup...",28 days,52.360577,4.915968


<h1 style="color:red;">Tip!!!!</h1>
<h3 style="color:blue;"> If you are sure filtered row are in data but filter doesn't show anything and columns to filter 
string first thing to check is spaces in string columns. </h3>

In [17]:
# Filter Average_Score lt 6.0 and Reviewer_Nationality China
df2.selectExpr("Average_Score","TRIM(Reviewer_Nationality) AS Reviewer_Nationality") \
.filter(" Average_Score < 6.0 AND Reviewer_Nationality == 'United Kingdom' ").limit(5).toPandas()

Unnamed: 0,Average_Score,Reviewer_Nationality
0,5.2,United Kingdom
1,5.2,United Kingdom
2,5.2,United Kingdom
3,5.2,United Kingdom
4,5.2,United Kingdom


In [18]:
# Filter records which includes Milan in Hotel_Address

df2.filter(F.col("Hotel_Address").rlike("Milan")).limit(5).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,6 Via Aristotile Fioravanti Garibaldi Station ...,30,2017-08-01,8.9,Hotel VIU Milan,Netherlands,There is still some construction taking place...,50,219,Very friendly staff great service from everyo...,34,21,9.6,"[[' Leisure trip ', ' Couple ', ' Superior D...",2 days,45.488766,9.183057
1,6 Via Aristotile Fioravanti Garibaldi Station ...,30,2017-07-14,8.9,Hotel VIU Milan,Russia,The hotel blocked double 1 night payment on m...,149,219,No Positive,0,5,5.4,"[[' Leisure trip ', ' Couple ', ' Superior D...",20 days,45.488766,9.183057
2,6 Via Aristotile Fioravanti Garibaldi Station ...,30,2017-07-12,8.9,Hotel VIU Milan,Belgium,Not the best neighborhood in Milan right next...,20,219,Brand new and beautifully designed hotel bar ...,13,25,8.8,"[[' Business trip ', ' Solo traveler ', ' Su...",22 days,45.488766,9.183057
3,6 Via Aristotile Fioravanti Garibaldi Station ...,30,2017-07-10,8.9,Hotel VIU Milan,Qatar,The hotel still new but the stuff is not well...,113,219,Good location near the metro station Nice des...,17,12,5.0,"[[' Leisure trip ', ' Group ', ' Superior Do...",24 days,45.488766,9.183057
4,6 Via Aristotile Fioravanti Garibaldi Station ...,30,2017-07-05,8.9,Hotel VIU Milan,United Kingdom,Having to leave,5,219,Staff were very very good Pool on the roof wa...,31,3,8.8,"[[' Leisure trip ', ' Couple ', ' Superior D...",29 days,45.488766,9.183057


In [19]:
# Filter records which includes Istanbul in Negative_Review

df2.select("Negative_Review").filter(F.col("Negative_Review").rlike("Istanbul")).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [16]:
spark.stop()