In [1]:
import findspark

In [2]:
# /opt/manual/spark: this is SPARK_HOME path
findspark.init("/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F

In [4]:
spark = SparkSession.builder \
.appName("Columns and Expressions") \
.master("local[2]") \
.getOrCreate()

In [5]:
# Data source: https://www.kaggle.com/jiashenliu/515k-hotel-reviews-data-in-europe

In [6]:
#! wget -P ~/datasets \
#https://github.com/erkansirin78/datasets/raw/master/Hotel_Reviews.csv.gz

In [7]:
! ls -l ~/datasets | grep Hotel

-rw-rw-r--. 1 train train  46401315 Oct  6 12:18 Hotel_Reviews.csv.gz


In [8]:
from pyspark.sql.types import *

programmatical_schema = StructType([
        StructField("Hotel_Address",StringType(),True),
        StructField("Additional_Number_of_Scoring",IntegerType(),True),
        StructField("Review_Date",StringType(),True),
        StructField("Average_Score",FloatType(),True),
        StructField("Hotel_Name",StringType(),True),
        StructField("Reviewer_Nationality",StringType(),True),
        StructField("Negative_Review",StringType(),True),
        StructField("Review_Total_Negative_Word_Counts",IntegerType(),True),
        StructField("Total_Number_of_Reviews",IntegerType(),True),
        StructField("Positive_Review",StringType(),True),
        StructField("Review_Total_Positive_Word_Counts",IntegerType(),True),
        StructField("Total_Number_of_Reviews_Reviewer_Has_Given",IntegerType(),True),
        StructField("Reviewer_Score",FloatType(),True),
        StructField("Tags",StringType(),True),
        StructField("days_since_review",StringType(),True),
        StructField("lat",FloatType(),True),
        StructField("lng",FloatType(),True)
    ])

# StructField("Tags",ArrayType(StringType()),True)
# Actually Tags should be array but csv cannot store array type.
# So you have to define it as StringType 

# Review_Date is still StringType() and should be DateType() 
# But for the moment we intentioally leave it StringType()
# As soon as we put schema on data we will modify it.

In [9]:
df = spark.read.option("header", True) \
.schema(programmatical_schema) \
.option("compression","gzip") \
.csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")

In [10]:
# Now we can correct Tags datatype
# But we have to do additional preperation before cast with split.
df2 = df.withColumn("Tags", 
                     F.split(F.col("Tags"), ",")
                     .cast(ArrayType(StringType()))) \
.withColumn("Review_Date", F.to_date(F.col("Review_Date"),"M/d/yyyy"))

In [11]:
df2.limit(5).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[[' Leisure trip ', ' Family with young child...",3 days,52.360577,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-31,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[[' Leisure trip ', ' Solo traveler ', ' Dup...",3 days,52.360577,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-07-24,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[[' Leisure trip ', ' Couple ', ' Suite ', ...",10 days,52.360577,4.915968


In [12]:
df2.columns

['Hotel_Address',
 'Additional_Number_of_Scoring',
 'Review_Date',
 'Average_Score',
 'Hotel_Name',
 'Reviewer_Nationality',
 'Negative_Review',
 'Review_Total_Negative_Word_Counts',
 'Total_Number_of_Reviews',
 'Positive_Review',
 'Review_Total_Positive_Word_Counts',
 'Total_Number_of_Reviews_Reviewer_Has_Given',
 'Reviewer_Score',
 'Tags',
 'days_since_review',
 'lat',
 'lng']

## Logical and math expressions on column

In [13]:
df2.select(F.expr(" Reviewer_Score * 10 ")).show(5)

+---------------------+
|(Reviewer_Score * 10)|
+---------------------+
|                 29.0|
|                 75.0|
|                 71.0|
|                 38.0|
|                 67.0|
+---------------------+
only showing top 5 rows



In [14]:
# We don't want space and special characters in column name
# With alias modify auto generated column name

df2.select(F.expr(" Reviewer_Score * 10 ").alias("Reviewer_Score10")).show(5)

+----------------+
|Reviewer_Score10|
+----------------+
|            29.0|
|            75.0|
|            71.0|
|            38.0|
|            67.0|
+----------------+
only showing top 5 rows



In [15]:
df2.select( (F.col("Reviewer_Score") * 12).alias("Reviewer_Score12") ).show(5)

+----------------+
|Reviewer_Score12|
+----------------+
|       34.800003|
|            90.0|
|            85.2|
|            45.6|
|       80.399994|
+----------------+
only showing top 5 rows



In [16]:
df2.select( (F.col("Reviewer_Score") > 5.0 ).alias("Reviewer_Score_GT5") ).show(5)

+------------------+
|Reviewer_Score_GT5|
+------------------+
|             false|
|              true|
|              true|
|             false|
|              true|
+------------------+
only showing top 5 rows



In [17]:
# Add a new column named Is_Positive if review is gt 5 true else false

df2.withColumn("Is_Positive", (F.expr( "Reviewer_Score > 5.0 "))) \
.select("Reviewer_Score","Is_Positive").show(5)

+--------------+-----------+
|Reviewer_Score|Is_Positive|
+--------------+-----------+
|           2.9|      false|
|           7.5|       true|
|           7.1|       true|
|           3.8|      false|
|           6.7|       true|
+--------------+-----------+
only showing top 5 rows



# Different ways to reach a column

In [14]:
df2.select(F.col("Reviewer_Nationality")).show(3)

+--------------------+
|Reviewer_Nationality|
+--------------------+
|             Russia |
|            Ireland |
|          Australia |
+--------------------+
only showing top 3 rows



In [19]:
df2.select("Reviewer_Nationality").show(3)

+--------------------+
|Reviewer_Nationality|
+--------------------+
|             Russia |
|            Ireland |
|          Australia |
+--------------------+
only showing top 3 rows



In [20]:
df2.select(df2['Reviewer_Nationality']).show(3)

+--------------------+
|Reviewer_Nationality|
+--------------------+
|             Russia |
|            Ireland |
|          Australia |
+--------------------+
only showing top 3 rows



In [21]:
df2.select(df2.Reviewer_Nationality).show(3)

+--------------------+
|Reviewer_Nationality|
+--------------------+
|             Russia |
|            Ireland |
|          Australia |
+--------------------+
only showing top 3 rows



In [15]:
spark.stop()