# Data Processing using Pyspark

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("DataProcessingWithPySpark-EMR") \
    .getOrCreate()

spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1763434461835_0003,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7fe0525a4c50>

In [2]:
path = "s3://notebooksnath/data/airlines.csv"

# Leer CSV
df = spark.read.csv(path, header=True, inferSchema=True)

df.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+---------------+---------+--------+------+--------+-----+-----------+--------------------+
|   id|        airline|     date|location|rating|   cabin|value|recommended|              review|
+-----+---------------+---------+--------+------+--------+-----+-----------+--------------------+
|10001|Delta Air Lines|21-Jun-14|Thailand|     7| Economy|    4|        YES|Flew Mar 30 NRT t...|
|10002|Delta Air Lines|19-Jun-14|     USA|     0| Economy|    2|         NO|Flight 2463 leavi...|
|10003|Delta Air Lines|18-Jun-14|     USA|     0| Economy|    1|         NO|Delta Website fro...|
|10004|Delta Air Lines|17-Jun-14|     USA|     9|Business|    4|        YES|"I just returned ...|
|10005|Delta Air Lines|17-Jun-14| Ecuador|     7| Economy|    3|        YES|"Round-trip fligh...|
+-----+---------------+---------+--------+------+--------+-----+-----------+--------------------+
only showing top 5 rows

In [3]:
df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- id: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- date: string (nullable = true)
 |-- location: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- cabin: string (nullable = true)
 |-- value: integer (nullable = true)
 |-- recommended: string (nullable = true)
 |-- review: string (nullable = true)

In [4]:
df.select("id", "airline", "rating").show(5)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+---------------+------+
|   id|        airline|rating|
+-----+---------------+------+
|10001|Delta Air Lines|     7|
|10002|Delta Air Lines|     0|
|10003|Delta Air Lines|     0|
|10004|Delta Air Lines|     9|
|10005|Delta Air Lines|     7|
+-----+---------------+------+
only showing top 5 rows

In [5]:
df.filter(df.recommended == "YES").show(5)



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+---------------+---------+--------+------+--------+-----+-----------+--------------------+
|   id|        airline|     date|location|rating|   cabin|value|recommended|              review|
+-----+---------------+---------+--------+------+--------+-----+-----------+--------------------+
|10001|Delta Air Lines|21-Jun-14|Thailand|     7| Economy|    4|        YES|Flew Mar 30 NRT t...|
|10004|Delta Air Lines|17-Jun-14|     USA|     9|Business|    4|        YES|"I just returned ...|
|10005|Delta Air Lines|17-Jun-14| Ecuador|     7| Economy|    3|        YES|"Round-trip fligh...|
|10006|Delta Air Lines|17-Jun-14|     USA|     9|Business|    5|        YES|Narita - Bangkok ...|
|10010|Delta Air Lines|13-Jun-14|      UK|     9| Economy|    3|        YES|"I flew from Heat...|
+-----+---------------+---------+--------+------+--------+-----+-----------+--------------------+
only showing top 5 rows

In [6]:
df.filter(df.cabin == "Economy").show(5)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+---------------+---------+--------+------+-------+-----+-----------+--------------------+
|   id|        airline|     date|location|rating|  cabin|value|recommended|              review|
+-----+---------------+---------+--------+------+-------+-----+-----------+--------------------+
|10001|Delta Air Lines|21-Jun-14|Thailand|     7|Economy|    4|        YES|Flew Mar 30 NRT t...|
|10002|Delta Air Lines|19-Jun-14|     USA|     0|Economy|    2|         NO|Flight 2463 leavi...|
|10003|Delta Air Lines|18-Jun-14|     USA|     0|Economy|    1|         NO|Delta Website fro...|
|10005|Delta Air Lines|17-Jun-14| Ecuador|     7|Economy|    3|        YES|"Round-trip fligh...|
|10007|Delta Air Lines|14-Jun-14|      UK|     0|Economy|    1|         NO|Flight from NY La...|
+-----+---------------+---------+--------+------+-------+-----+-----------+--------------------+
only showing top 5 rows

In [7]:
df.filter((df.recommended == "YES") & (df.rating >= 7)).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+---------------+---------+-----------+------+-----------+-----+-----------+--------------------+
|   id|        airline|     date|   location|rating|      cabin|value|recommended|              review|
+-----+---------------+---------+-----------+------+-----------+-----+-----------+--------------------+
|10001|Delta Air Lines|21-Jun-14|   Thailand|     7|    Economy|    4|        YES|Flew Mar 30 NRT t...|
|10004|Delta Air Lines|17-Jun-14|        USA|     9|   Business|    4|        YES|"I just returned ...|
|10005|Delta Air Lines|17-Jun-14|    Ecuador|     7|    Economy|    3|        YES|"Round-trip fligh...|
|10006|Delta Air Lines|17-Jun-14|        USA|     9|   Business|    5|        YES|Narita - Bangkok ...|
|10010|Delta Air Lines|13-Jun-14|         UK|     9|    Economy|    3|        YES|"I flew from Heat...|
|10011|Delta Air Lines|11-Jun-14|        USA|    10|    Economy|    4|        YES|I was a bit stubb...|
|10012|Delta Air Lines|10-Jun-14|  Australia|    10|    Economy|

In [8]:
df.filter((df.location == "USA") & (df.rating <= 3)).show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+---------------+---------+--------+------+-----------+-----+-----------+--------------------+
|   id|        airline|     date|location|rating|      cabin|value|recommended|              review|
+-----+---------------+---------+--------+------+-----------+-----+-----------+--------------------+
|10002|Delta Air Lines|19-Jun-14|     USA|     0|    Economy|    2|         NO|Flight 2463 leavi...|
|10003|Delta Air Lines|18-Jun-14|     USA|     0|    Economy|    1|         NO|Delta Website fro...|
|10008|Delta Air Lines|14-Jun-14|     USA|     0|    Economy|    1|         NO|Originally I had ...|
|10013|Delta Air Lines| 9-Jun-14|     USA|     0|    Economy|    1|         NO|My wife and I fly...|
|10015|Delta Air Lines| 6-Jun-14|     USA|     0|    Economy|    2|         NO|Our flight from F...|
|10016|Delta Air Lines| 5-Jun-14|     USA|     0|    Economy|    1|         NO|On May 22 after a...|
|10024|Delta Air Lines|20-May-14|     USA|     2|    Economy|    1|         NO|"Boy are we 

In [9]:
from pyspark.sql.functions import when

df = df.withColumn(
    "recommended_flag",
    when(df.recommended == "YES", 1).otherwise(0)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
df = df.withColumn(
    "rating_category",
    when(df.rating >= 8, "Excellent")
    .when(df.rating >= 5, "Good")
    .when(df.rating >= 3, "Poor")
    .otherwise("Bad")
)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
from pyspark.sql.functions import length

df = df.withColumn("review_length", length(df.review))


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
df.groupBy("airline").avg("rating").show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+-----------------+
|           airline|      avg(rating)|
+------------------+-----------------+
|   United Airlines|3.520361990950226|
|   Delta Air Lines|5.161943319838056|
| American Airlines|6.581081081081081|
|        US Airways|3.152263374485597|
|Southwest Airlines| 5.93719806763285|
+------------------+-----------------+

In [13]:
df.orderBy(df.review_length.desc()).show(10, truncate=False)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+------------------+---------+---------+------+--------+-----+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
df.filter(df.rating <= 3).select("airline", "location", "rating", "review").show(10, truncate=False)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+--------+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

negative_words = ["delay", "late", "worst", "bad", "terrible", "cancel"]

def detect_sentiment_py(text):
    if text is None:
        return "Neutral/Positive"
    text_low = text.lower()
    return "Negative" if any(w in text_low for w in negative_words) else "Neutral/Positive"

detect_sentiment = udf(detect_sentiment_py, StringType())


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
df = df.withColumn("sentiment", detect_sentiment(df["review"]))



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
df.coalesce(1).write.csv("s3://notebooksnath/data/final_result_single",
                         header=True,
                         mode="overwrite")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…