# BIG DATA PROJECT - HADOOP HEROES

In [1]:
from pyspark.sql import SparkSession
import pandas as pd

In [2]:

spark = SparkSession.builder \
        .master("local[*]") \
        .appName("ISM6562 Spark Project") \
        .enableHiveSupport() \
        .getOrCreate()

# Let's get the SparkContext object. It's the entry point to the Spark API. It's created when you create a sparksession
sc = spark.sparkContext  
sc.setLogLevel("ERROR") # only display errors (not warnings)

# note: If you have multiple spark sessions running (like from a previous notebook you've run), 
# this spark session webUI will be on a different port than the default (4040). One way to 
# identify this part is with the following line. If there was only one spark session running, 
# this will be 4040. If it's higher, it means there are still other spark sesssions still running.
spark_session_port = spark.sparkContext.uiWebUrl.split(":")[-1]
print("Spark Session WebUI Port: " + spark_session_port)

# It's best if you find that the port number displayed below is not 4040, then you should shut down all other spark sessions and 
# run this code again. If you don't, you may have trouble accessing the data in the spark-warehouse directory.

23/11/05 20:26:46 WARN Utils: Your hostname, localhost.localdomain resolves to a loopback address: 127.0.0.1; using 10.21.10.196 instead (on interface eth0)
23/11/05 20:26:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/05 20:26:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session WebUI Port: 4040


In [3]:
spark

In [4]:
spark.catalog.listTables()

[]

In [5]:
df=spark.sql("show databases")
df.show()

+---------+
|namespace|
+---------+
|  default|
|   w10_db|
+---------+



In [6]:
tables = spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [7]:
#Load data to warehouse

In [8]:
trip = spark.read.csv('data/yellow_tripdata_2022-02.csv', header=True, inferSchema=True);

# display the first 5 rows of the dataframe
trip.show(5);

                                                                                

+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|extra|tip_amount|total_amount|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+
|       1|    02-01-2022 00:06|     02-01-2022 00:19|              1|          5.4|         138|         252|       17.0| 1.75|       3.9|       23.45|       1.25|
|       1|    02-01-2022 00:38|     02-01-2022 00:55|              1|          6.4|         138|          41|       21.0| 1.75|       0.0|        30.1|       1.25|
|       1|    02-01-2022 00:03|     02-01-2022 00:26|              1|         12.5|         138|         200|       35.5| 1.75|       0.0|        44.6|       1.25|
|       2|    02

In [9]:
from pyspark.sql import functions as F

# Assuming your DataFrame is named 'df', and you want to create a new column 'TIP_STATUS'
trip = trip.withColumn("TIP_STATUS", F.when(trip["tip_amount"] > 0, 1).otherwise(0))


In [10]:
tables = spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [11]:
type(trip)

pyspark.sql.dataframe.DataFrame

In [12]:
#Save table in spark data warehouse

In [13]:
spark.sql("CREATE DATABASE IF NOT EXISTS w10_db;")

DataFrame[]

In [14]:
trip.write.mode("overwrite").saveAsTable("w10_db.trip")

                                                                                

In [15]:
spark.catalog.listTables('w10_db')

[Table(name='boston', catalog='spark_catalog', namespace=['w10_db'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='trip', catalog='spark_catalog', namespace=['w10_db'], description=None, tableType='MANAGED', isTemporary=False)]

In [16]:
df = spark.sql("SELECT * FROM w10_db.trip")
df.show()

+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|extra|tip_amount|total_amount|airport_fee|TIP_STATUS|
+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+
|       2|    02-01-2022 20:44|     02-01-2022 21:08|              1|         5.74|         107|           7|       20.5|  0.5|      4.86|       29.16|        0.0|         1|
|       1|    02-01-2022 20:35|     02-01-2022 20:44|              1|          1.3|         230|         229|        7.0|  3.0|       0.0|        10.8|        0.0|         0|
|       2|    02-01-2022 20:11|     02-01-2022 20:33|              1|         4.37|          79|         236|       18.0|  0.

In [17]:
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- TIP_STATUS: integer (nullable = true)



# Model Training

In [18]:
train_data,test_data=df.randomSplit([0.7,0.3])

In [19]:
from pyspark.ml.feature import StringIndexer
# Use StringIndexer to convert the categorical columns to hold numerical data
 
tpep_pickup_datetime_indexer = StringIndexer(inputCol='tpep_pickup_datetime',outputCol='tpep_pickup_datetime_index',handleInvalid='keep')
tpep_dropoff_datetime_indexer = StringIndexer(inputCol='tpep_dropoff_datetime',outputCol='tpep_dropoff_datetime_index',handleInvalid='keep')


In [20]:
df.columns

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'PULocationID',
 'DOLocationID',
 'fare_amount',
 'extra',
 'tip_amount',
 'total_amount',
 'airport_fee',
 'TIP_STATUS']

In [21]:
from pyspark.ml.feature import VectorAssembler
# Vector assembler is used to create a vector of input features
 
assembler = VectorAssembler(
    inputCols=[
        'passenger_count',
        'trip_distance',
        'airport_fee',
        'PULocationID',
        'DOLocationID',
        'tpep_dropoff_datetime_index',
        'tpep_pickup_datetime_index'
    ],
    outputCol="features"
)

In [22]:
from pyspark.ml import Pipeline

# Pipeline is used to pass the data through indexer and assembler simultaneously. Also, it helps to pre-rocess the test data
# in the same way as that of the train data
# https://spark.apache.org/docs/latest/ml-pipeline.html
 
pipe = Pipeline(stages=[
    tpep_dropoff_datetime_indexer,
    tpep_pickup_datetime_indexer,
    assembler
    ]
)

In [23]:
fitted_pipe=pipe.fit(train_data)

                                                                                

In [24]:
train_data=fitted_pipe.transform(train_data)
train_data.show(5)

+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|extra|tip_amount|total_amount|airport_fee|TIP_STATUS|tpep_dropoff_datetime_index|tpep_pickup_datetime_index|            features|
+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+
|       1|    02-01-2022 20:00|     02-01-2022 20:03|              1|          0.9|         236|          43|        5.0|  3.5|       0.0|         9.3|        0.0|         0|                     4356.0|                    7784.0|[1.0,0.9,0.0,236..

In [25]:
test_data=fitted_pipe.transform(test_data)
test_data.show(5)

+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|extra|tip_amount|total_amount|airport_fee|TIP_STATUS|tpep_dropoff_datetime_index|tpep_pickup_datetime_index|            features|
+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+
|       1|    02-01-2022 20:00|     02-01-2022 20:03|              1|          0.7|         142|          48|        5.0|  3.5|       0.0|         9.3|        0.0|         0|                     4356.0|                    7784.0|[1.0,0.7,0.0,142..

In [26]:
# For those interested in utilizing the ML/AI power of Tensorflow with Spark....
# https://github.com/tensorflow/ecosystem/tree/master/spark/spark-tensorflow-distributor

# In this course, we'll use the SparkML (admitedely, it's not as powerful as Tensorflow, but 
# it's easy to use and demonstrate ML on a Spark Cluster)

from pyspark.ml.regression import LinearRegression

lr_model = LinearRegression(labelCol='fare_amount')
fit_model = lr_model.fit(train_data.select(['features','fare_amount']))


                                                                                

In [27]:
results = fit_model.transform(test_data)
results.show()

+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|extra|tip_amount|total_amount|airport_fee|TIP_STATUS|tpep_dropoff_datetime_index|tpep_pickup_datetime_index|            features|        prediction|
+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+------------------+
|       1|    02-01-2022 20:00|     02-01-2022 20:03|              1|          0.7|         142|          48|        5.0|  3.5|       0.0|         9.3|        0.0|         0|                

In [28]:
results.select(['fare_amount','prediction']).show()

+-----------+------------------+
|fare_amount|        prediction|
+-----------+------------------+
|        5.0| 6.829871118952641|
|        5.0| 7.239666603704055|
|        6.0| 7.260228843901214|
|        7.0| 7.688220328587425|
|        8.0|  8.82577461961405|
|        7.0| 7.609477342529199|
|       10.0|10.132867685776771|
|       17.0|14.337578003269218|
|        6.5| 8.624097531893051|
|       14.5| 15.50217781123574|
|       31.0| 32.07035874312297|
|       28.5|19.215606033657373|
|        6.0| 7.547949024736084|
|        5.5| 7.598793767185676|
|        8.0| 9.530166713230312|
|        3.5| 6.336422743804173|
|        4.0| 6.258239945050619|
|        6.5| 8.365000493612953|
|        7.5| 8.119363197177716|
|       16.0| 18.97216147409055|
+-----------+------------------+
only showing top 20 rows



In [29]:
#Evaluate performance

In [30]:
test_results = fit_model.evaluate(test_data)

                                                                                

In [31]:
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| -1.8298711189526413|
| -2.2396666037040553|
| -1.2602288439012144|
| -0.6882203285874251|
| -0.8257746196140499|
| -0.6094773425291988|
|-0.13286768577677144|
|  2.6624219967307816|
| -2.1240975318930513|
| -1.0021778112357396|
| -1.0703587431229735|
|   9.284393966342627|
| -1.5479490247360843|
| -2.0987937671856756|
| -1.5301667132303116|
| -2.8364227438041727|
| -2.2582399450506188|
| -1.8650004936129534|
| -0.6193631971777158|
| -2.9721614740905515|
+--------------------+
only showing top 20 rows



In [32]:
print(f"{'RMSE:':7s} {test_results.rootMeanSquaredError:>7.3f}")
print(f"{'Ex Var:':7s} {test_results.explainedVariance:>7.3f}")
print(f"{'MAE:':7s} {test_results.meanAbsoluteError:>7.3f}")
print(f"{'MSE:':7s} {test_results.meanSquaredError:>7.3f}")
print(f"{'RMSE:':7s} {test_results.rootMeanSquaredError:>7.3f}")
print(f"{'R2:':7s} {test_results.r2:>7.3f}")

RMSE:     6.091
Ex Var:  98.249
MAE:      2.116
MSE:     37.100
RMSE:     6.091
R2:       0.729


# Logistic Regression

Whether a taxi trip results in a tip or not. Here's a modified version of your code for logistic regression:

# DECISION TREE

In [33]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml import Pipeline


In [34]:
# Use StringIndexer to convert the categorical columns to hold numerical data
VendorID_indexer=StringIndexer(inputCol='VendorID', outputCol='VendorID_index',handleInvalid='keep')
PULocationID_indexer=StringIndexer(inputCol='PULocationID', outputCol='PULocationID_index',handleInvalid='keep')
DOLocationID_indexer=StringIndexer(inputCol='DOLocationID', outputCol='DOLocationID_index',handleInvalid='keep')

In [35]:
assembler = VectorAssembler(
    inputCols=[
        'VendorID_index',
        'PULocationID_index',
        'DOLocationID_index',
        'passenger_count', 'trip_distance', 'extra', 'airport_fee'
    ],
    outputCol="features_dtree"
)

In [36]:
dt_model = DecisionTreeClassifier(labelCol='TIP_STATUS',maxBins=16000)

In [37]:
pipe = Pipeline(
    stages=[
        VendorID_indexer,
        PULocationID_indexer,
        DOLocationID_indexer,
        assembler,
        dt_model
    ]
)

In [38]:
fit_model=pipe.fit(train_data)

                                                                                

In [39]:
results = fit_model.transform(test_data)

In [40]:
results.select(['TIP_STATUS','prediction']).show()

+----------+----------+
|TIP_STATUS|prediction|
+----------+----------+
|         0|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         0|       1.0|
|         1|       1.0|
|         1|       1.0|
|         0|       1.0|
|         1|       1.0|
|         0|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
+----------+----------+
only showing top 20 rows



In [41]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [42]:
ACC_evaluator = MulticlassClassificationEvaluator(
    labelCol="TIP_STATUS", predictionCol="prediction", metricName="accuracy")

accuracy = ACC_evaluator.evaluate(results)

print(f"The accuracy of the decision tree classifier is {accuracy}")



The accuracy of the decision tree classifier is 0.7700212943245995


                                                                                