# Start Hadoop-YARN

In [1]:
# Hadoop (HDFS) and YARN have to be running if we want Spark to use YARN as its cluster manager

In [2]:
# ! start-all.sh

In [3]:
# Point findspark at the local Spark installation directory.
# This runs before the pyspark import in the following cell.
import findspark
findspark.init("/opt/manual/spark/")

In [4]:
from pyspark.sql import SparkSession, functions as F

In [5]:
# Note the two settings that matter here: the master is "yarn" and
# Hive support is switched on for the session.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("Higher Order Functions")
    .enableHiveSupport()
    .getOrCreate()
)

# Read from HDFS

In [6]:
# ! hdfs dfs -ls /user/train/datasets

In [7]:
from pyspark.sql.types import *

# Explicit schema for the hotel reviews CSV: one nullable field per column,
# listed in file order as (column name, Spark data type) pairs.
programmatical_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("Hotel_Address", StringType()),
        ("Additional_Number_of_Scoring", IntegerType()),
        ("Review_Date", StringType()),
        ("Average_Score", FloatType()),
        ("Hotel_Name", StringType()),
        ("Reviewer_Nationality", StringType()),
        ("Negative_Review", StringType()),
        ("Review_Total_Negative_Word_Counts", IntegerType()),
        ("Total_Number_of_Reviews", IntegerType()),
        ("Positive_Review", StringType()),
        ("Review_Total_Positive_Word_Counts", IntegerType()),
        ("Total_Number_of_Reviews_Reviewer_Has_Given", IntegerType()),
        ("Reviewer_Score", FloatType()),
        ("Tags", StringType()),
        ("days_since_review", StringType()),
        ("lat", FloatType()),
        ("lng", FloatType()),
    ]
])

# StructField("Tags",ArrayType(StringType()),True)
# Actually Tags should be array but csv cannot store array type.
# So you have to define it as StringType 

# Review_Date is still StringType() although it should be DateType().
# For the moment we intentionally leave it as StringType();
# once the schema has been applied to the data we will cast it to a date.

In [8]:
# ! hdfs dfs -put /home/train/datasets/Hotel_Reviews.csv.gz /user/train/datasets

In [9]:
# Load the gzip-compressed, comma-separated reviews file from HDFS,
# applying the explicit schema and treating the first line as a header.
df = (
    spark.read
    .format("csv")
    .schema(programmatical_schema)
    .options(header=True, sep=",", compression="gzip")
    .load("hdfs://localhost:9000/user/train/datasets/Hotel_Reviews.csv.gz")
)

In [10]:
# Fix the two columns that were deliberately read as plain strings:
#   * Tags        -> split on commas (the preparation step), then cast to array<string>
#   * Review_Date -> parsed into a date using the M/d/yyyy pattern
df2 = (
    df
    .withColumn("Tags", F.split("Tags", ",").cast(ArrayType(StringType())))
    .withColumn("Review_Date", F.to_date("Review_Date", "M/d/yyyy"))
)

In [11]:
# Preview the first two rows, converted to pandas for a readable notebook table.
df2.limit(2).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.360577,4.915968


In [12]:
# Check the resulting column types after the Tags and Review_Date conversions.
df2.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: date (nullable = true)
 |-- Average_Score: float (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: float (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lng: float (nullable = true)



# How to access and process array datatypes?

## Method-1: explode()

In [13]:
# Row count of df2, i.e. before the Tags array is exploded.
df2.count()

515738

In [14]:
# explode() emits one output row per element of the Tags array.
# Each element still carries list syntax left over from the original string,
# so the regex removes a leading "[' ", any single quote, and a closing "]";
# trim() then drops the surrounding spaces.
# The pattern is a raw string so that "\[" reaches the regex engine unchanged
# instead of being an invalid Python escape sequence.
df3 = (
    df2
    .withColumn("exp_tags", F.explode("Tags"))
    .withColumn(
        "clean_tags",
        F.trim(F.regexp_replace("exp_tags", r"\[' |'|]", "")),
    )
)

In [15]:
# Row count of df3, i.e. after exploding Tags into one row per array element.
df3.count()

2356592

In [16]:
# Compare the original array, the exploded element and its cleaned form side by side.
df3.select("Tags","exp_tags","clean_tags").limit(10).toPandas()

Unnamed: 0,Tags,exp_tags,clean_tags
0,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",[' Leisure trip ',Leisure trip
1,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",' Couple ',Couple
2,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",' Duplex Double Room ',Duplex Double Room
3,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",' Stayed 6 nights '],Stayed 6 nights
4,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",[' Leisure trip ',Leisure trip
5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",' Couple ',Couple
6,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",' Duplex Double Room ',Duplex Double Room
7,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",' Stayed 4 nights '],Stayed 4 nights
8,"[[' Leisure trip ', ' Family with young child...",[' Leisure trip ',Leisure trip
9,"[[' Leisure trip ', ' Family with young child...",' Family with young children ',Family with young children


# Access inside of array type column

In [19]:
# getItem(2) picks the element at index 2 of each Tags array; show the first four rows.
df2.select(F.col("Tags").getItem(2)).show(n=4, truncate=100)

+-----------------------+
|                Tags[2]|
+-----------------------+
| ' Duplex Double Room '|
| ' Duplex Double Room '|
| ' Duplex Double Room '|
| ' Duplex Double Room '|
+-----------------------+
only showing top 4 rows



In [20]:
# Small in-memory example: one row per person with an array of e-mail addresses.
# NOTE: this rebinds `df`, which until now held the hotel reviews DataFrame.
df = spark.createDataFrame(
    data=[
        ("Erkan", ["kdskjd@kjdsjk.com", "mymail@yourmail.com"]),
        ("Mehmet", ["hismail@hermail.com", "sugar@candy.com"]),
    ],
    schema=["Name", "Mails"],
)

In [21]:
# Display the example rows without truncating the array column.
df.show(truncate=False)

+------+----------------------------------------+
|Name  |Mails                                   |
+------+----------------------------------------+
|Erkan |[kdskjd@kjdsjk.com, mymail@yourmail.com]|
|Mehmet|[hismail@hermail.com, sugar@candy.com]  |
+------+----------------------------------------+



In [22]:
# Print the schema of the example DataFrame to check the column types.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Mails: array (nullable = true)
 |    |-- element: string (containsNull = true)



# filter() rows with array_contains()

In [23]:
# Add a boolean column saying whether each Mails array contains the given address.
(
    df
    .withColumn("SugarCandy", F.array_contains("Mails", "sugar@candy.com"))
    .show(truncate=False)
)

+------+----------------------------------------+----------+
|Name  |Mails                                   |SugarCandy|
+------+----------------------------------------+----------+
|Erkan |[kdskjd@kjdsjk.com, mymail@yourmail.com]|false     |
|Mehmet|[hismail@hermail.com, sugar@candy.com]  |true      |
+------+----------------------------------------+----------+



In [24]:
# Keep only the rows whose Mails array contains the given address.
(
    df
    .filter(F.array_contains("Mails", "sugar@candy.com"))
    .show(truncate=False)
)

+------+--------------------------------------+
|Name  |Mails                                 |
+------+--------------------------------------+
|Mehmet|[hismail@hermail.com, sugar@candy.com]|
+------+--------------------------------------+



In [25]:
# Shut down the SparkSession now that the examples are finished.
spark.stop()

# Stop Hadoop YARN

In [23]:
# ! stop-all.sh