# BIG DATA PROJECT - HADOOP HEROES

In [1]:
from pyspark.sql import SparkSession
import pandas as pd

In [2]:

spark = SparkSession.builder \
        .master("local[*]") \
        .appName("ISM6562 Spark Project") \
        .enableHiveSupport() \
        .getOrCreate()

# Let's get the SparkContext object. It's the entry point to the Spark API. It's created when you create a sparksession
sc = spark.sparkContext  
sc.setLogLevel("ERROR") # only display errors (not warnings)

# note: If you have multiple spark sessions running (like from a previous notebook you've run), 
# this spark session webUI will be on a different port than the default (4040). One way to 
# identify this part is with the following line. If there was only one spark session running, 
# this will be 4040. If it's higher, it means there are still other spark sesssions still running.
spark_session_port = spark.sparkContext.uiWebUrl.split(":")[-1]
print("Spark Session WebUI Port: " + spark_session_port)

# It's best if you find that the port number displayed below is not 4040, then you should shut down all other spark sessions and 
# run this code again. If you don't, you may have trouble accessing the data in the spark-warehouse directory.

23/11/05 19:03:11 WARN Utils: Your hostname, localhost.localdomain resolves to a loopback address: 127.0.0.1; using 10.21.10.196 instead (on interface eth0)
23/11/05 19:03:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/05 19:03:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session WebUI Port: 4040


In [3]:
spark

In [4]:
spark.catalog.listTables()

[]

In [19]:
df=spark.sql("show databases")
df.show()

+---------+
|namespace|
+---------+
|  default|
|   w10_db|
+---------+



In [20]:
tables = spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [21]:
#Load data to warehouse

In [22]:
trip = spark.read.csv('data/yellow_tripdata_2022-02.csv', header=True, inferSchema=True);

# display the first 5 rows of the dataframe
trip.show(5);

+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|extra|tip_amount|total_amount|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+
|       1|    02-01-2022 00:06|     02-01-2022 00:19|              1|          5.4|         138|         252|       17.0| 1.75|       3.9|       23.45|       1.25|
|       1|    02-01-2022 00:38|     02-01-2022 00:55|              1|          6.4|         138|          41|       21.0| 1.75|       0.0|        30.1|       1.25|
|       1|    02-01-2022 00:03|     02-01-2022 00:26|              1|         12.5|         138|         200|       35.5| 1.75|       0.0|        44.6|       1.25|
|       2|    02

In [24]:
from pyspark.sql import functions as F

# Assuming your DataFrame is named 'df', and you want to create a new column 'TIP_STATUS'
trip = trip.withColumn("TIP_STATUS", F.when(trip["tip_amount"] > 0, 1).otherwise(0))


In [25]:
tables = spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [26]:
type(trip)

pyspark.sql.dataframe.DataFrame

In [27]:
#Save table in spark data warehouse

In [28]:
spark.sql("CREATE DATABASE IF NOT EXISTS w10_db;")

DataFrame[]

In [29]:
trip.write.mode("overwrite").saveAsTable("w10_db.trip")

                                                                                

In [30]:
spark.catalog.listTables('w10_db')

[Table(name='boston', catalog='spark_catalog', namespace=['w10_db'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='trip', catalog='spark_catalog', namespace=['w10_db'], description=None, tableType='MANAGED', isTemporary=False)]

In [31]:
df = spark.sql("SELECT * FROM w10_db.trip")
df.show()

+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|extra|tip_amount|total_amount|airport_fee|TIP_STATUS|
+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+
|       2|    02-01-2022 20:44|     02-01-2022 21:08|              1|         5.74|         107|           7|       20.5|  0.5|      4.86|       29.16|        0.0|         1|
|       1|    02-01-2022 20:35|     02-01-2022 20:44|              1|          1.3|         230|         229|        7.0|  3.0|       0.0|        10.8|        0.0|         0|
|       2|    02-01-2022 20:11|     02-01-2022 20:33|              1|         4.37|          79|         236|       18.0|  0.

In [32]:
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- TIP_STATUS: integer (nullable = true)



# Model Training

In [33]:
train_data,test_data=df.randomSplit([0.7,0.3])

In [34]:
from pyspark.ml.feature import StringIndexer
# Use StringIndexer to convert the categorical columns to hold numerical data
 
tpep_pickup_datetime_indexer = StringIndexer(inputCol='tpep_pickup_datetime',outputCol='tpep_pickup_datetime_index',handleInvalid='keep')
tpep_dropoff_datetime_indexer = StringIndexer(inputCol='tpep_dropoff_datetime',outputCol='tpep_dropoff_datetime_index',handleInvalid='keep')


In [35]:
df.columns

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'PULocationID',
 'DOLocationID',
 'fare_amount',
 'extra',
 'tip_amount',
 'total_amount',
 'airport_fee',
 'TIP_STATUS']

In [36]:
from pyspark.ml.feature import VectorAssembler
# Vector assembler is used to create a vector of input features
 
assembler = VectorAssembler(
    inputCols=[
        'passenger_count',
        'trip_distance',
        'airport_fee',
        'PULocationID',
        'DOLocationID',
        'tpep_dropoff_datetime_index',
        'tpep_pickup_datetime_index'
    ],
    outputCol="features"
)

In [37]:
from pyspark.ml import Pipeline

# Pipeline is used to pass the data through indexer and assembler simultaneously. Also, it helps to pre-rocess the test data
# in the same way as that of the train data
# https://spark.apache.org/docs/latest/ml-pipeline.html
 
pipe = Pipeline(stages=[
    tpep_dropoff_datetime_indexer,
    tpep_pickup_datetime_indexer,
    assembler
    ]
)

In [38]:
fitted_pipe=pipe.fit(train_data)

                                                                                

In [39]:
train_data=fitted_pipe.transform(train_data)
train_data.show(5)

+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|extra|tip_amount|total_amount|airport_fee|TIP_STATUS|tpep_dropoff_datetime_index|tpep_pickup_datetime_index|            features|
+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+
|       1|    02-01-2022 20:00|     02-01-2022 20:03|              1|          0.7|         142|          48|        5.0|  3.5|       0.0|         9.3|        0.0|         0|                     7009.0|                    5668.0|[1.0,0.7,0.0,142..

In [40]:
test_data=fitted_pipe.transform(test_data)
test_data.show(5)

+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|extra|tip_amount|total_amount|airport_fee|TIP_STATUS|tpep_dropoff_datetime_index|tpep_pickup_datetime_index|            features|
+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+
|       1|    02-01-2022 20:00|     02-01-2022 20:05|              1|          0.9|         162|         137|        6.0|  3.0|      1.95|       11.75|        0.0|         1|                     3451.0|                    5668.0|[1.0,0.9,0.0,162..

In [41]:
# For those interested in utilizing the ML/AI power of Tensorflow with Spark....
# https://github.com/tensorflow/ecosystem/tree/master/spark/spark-tensorflow-distributor

# In this course, we'll use the SparkML (admitedely, it's not as powerful as Tensorflow, but 
# it's easy to use and demonstrate ML on a Spark Cluster)

from pyspark.ml.regression import LinearRegression

lr_model = LinearRegression(labelCol='fare_amount')
fit_model = lr_model.fit(train_data.select(['features','fare_amount']))


                                                                                

In [42]:
results = fit_model.transform(test_data)
results.show()

+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|extra|tip_amount|total_amount|airport_fee|TIP_STATUS|tpep_dropoff_datetime_index|tpep_pickup_datetime_index|            features|        prediction|
+--------+--------------------+---------------------+---------------+-------------+------------+------------+-----------+-----+----------+------------+-----------+----------+---------------------------+--------------------------+--------------------+------------------+
|       1|    02-01-2022 20:00|     02-01-2022 20:05|              1|          0.9|         162|         137|        6.0|  3.0|      1.95|       11.75|        0.0|         1|                

In [43]:
results.select(['fare_amount','prediction']).show()

+-----------+------------------+
|fare_amount|        prediction|
+-----------+------------------+
|        6.0| 7.399546488862595|
|        6.5| 7.787183912796198|
|        7.0| 7.724357224644789|
|        8.0| 8.834481439749123|
|        7.0|  7.62814232598533|
|        7.5| 8.190035101192478|
|        9.0| 10.38961308849689|
|       10.0|10.792362030688146|
|       18.5| 18.43530535371559|
|       17.0|14.270334539871596|
|        5.5|  6.72933649995036|
|        7.0| 7.588554305811612|
|       10.5|11.782528472527638|
|        8.0|  7.85291917340197|
|       10.5|  12.2712340695602|
|       10.0|10.351415122760704|
|       13.0|13.016233178473795|
|       14.5|15.280469486605552|
|       52.0| 56.07280295696022|
|       17.5|15.997323740409612|
+-----------+------------------+
only showing top 20 rows



In [44]:
#Evaluate performance

In [45]:
test_results = fit_model.evaluate(test_data)

                                                                                

In [46]:
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| -1.3995464888625948|
| -1.2871839127961984|
| -0.7243572246447894|
| -0.8344814397491227|
| -0.6281423259853298|
| -0.6900351011924784|
| -1.3896130884968905|
|  -0.792362030688146|
| 0.06469464628440846|
|  2.7296654601284036|
| -1.2293364999503602|
| -0.5885543058116118|
| -1.2825284725276376|
|  0.1470808265980299|
| -1.7712340695601991|
| -0.3514151227607041|
|-0.01623317847379...|
| -0.7804694866055524|
|  -4.072802956960217|
|  1.5026762595903875|
+--------------------+
only showing top 20 rows



In [47]:
print(f"{'RMSE:':7s} {test_results.rootMeanSquaredError:>7.3f}")
print(f"{'Ex Var:':7s} {test_results.explainedVariance:>7.3f}")
print(f"{'MAE:':7s} {test_results.meanAbsoluteError:>7.3f}")
print(f"{'MSE:':7s} {test_results.meanSquaredError:>7.3f}")
print(f"{'RMSE:':7s} {test_results.rootMeanSquaredError:>7.3f}")
print(f"{'R2:':7s} {test_results.r2:>7.3f}")

RMSE:     6.074
Ex Var:  98.854
MAE:      2.119
MSE:     36.899
RMSE:     6.074
R2:       0.729


# Logistic Regression

Whether a taxi trip results in a tip or not. Here's a modified version of your code for logistic regression:

In [48]:
from pyspark.ml.classification import LogisticRegression

In [49]:
trip.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- TIP_STATUS: integer (nullable = false)



In [61]:
# Define categorical and numeric columns
categorical_columns = ['VendorID', 'PULocationID', 'DOLocationID']
numeric_columns = ['passenger_count', 'trip_distance', 'extra', 'airport_fee']

In [62]:
# Use StringIndexer to convert the categorical columns to hold numerical data
VendorID_indexer=StringIndexer(inputCol='VendorID', outputCol='VendorID_index',handleInvalid='keep')
PULocationID_indexer=StringIndexer(inputCol='PULocationID', outputCol='PULocationID_index',handleInvalid='keep')
DOLocationID_indexer=StringIndexer(inputCol='DOLocationID', outputCol='DOLocationID_index',handleInvalid='keep')

In [63]:
from pyspark.ml.feature import OneHotEncoder

In [64]:
data_encoder = OneHotEncoder(
    inputCols=[
        'VendorID_index',
        'PULocationID_index',
        'DOLocationID_index'
    ], 
    outputCols= [
        'VendorID_vec',
        'PULocationID_vec',
        'DOLocationID_vec'],
    handleInvalid='keep'
)

In [65]:
assembler = VectorAssembler(
    inputCols=[
        'VendorID_vec',
        'PULocationID_vec',
        'DOLocationID_vec'
        ],
    outputCol="features_log"
)

In [70]:
lr_model=LogisticRegression(labelCol='TIP_STATUS')

In [71]:
pipe = Pipeline(
    stages=[
        VendorID_indexer,
    PULocationID_indexer,
    DOLocationID_indexer,
        data_encoder,
        assembler,
        lr_model
    ]
)
  

In [72]:
# run the pipeline
fit_model=pipe.fit(train_data)

# Store the results in a dataframe
results_log = fit_model.transform(test_data)

                                                                                

In [97]:
results_log.select(['TIP_STATUS','prediction']).show()

+----------+----------+
|TIP_STATUS|prediction|
+----------+----------+
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         0|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
+----------+----------+
only showing top 20 rows



# Model Evaluation

In [75]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

AUC_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='TIP_STATUS',metricName='areaUnderROC')

AUC = AUC_evaluator.evaluate(results)

                                                                                

In [76]:
print("The area under the curve is {}".format(AUC))

The area under the curve is 0.5357832354964459


In [77]:
PR_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='TIP_STATUS',metricName='areaUnderPR')
PR = PR_evaluator.evaluate(results)

                                                                                

In [90]:
print("The area under the PR curve is {}".format(PR))

The area under the PR curve is 0.7740905851574635


## Accuracy

In [99]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

ACC_evaluator = MulticlassClassificationEvaluator(  #  Multiclass or Binary, the accuracy is calculated in the same way.
    labelCol="TIP_STATUS", predictionCol="prediction", metricName="accuracy")

accuracy = ACC_evaluator.evaluate(results)

In [100]:
print("The accuracy of the model is {}".format(accuracy))

The accuracy of the model is 0.0


In [93]:

from sklearn.metrics import confusion_matrix

In [94]:
y_true = results.select("TIP_STATUS")
y_true = y_true.toPandas()
 
y_pred = results.select("prediction")
y_pred = y_pred.toPandas()
 
cnf_matrix = confusion_matrix(y_true, y_pred)

ValueError: Classification metrics can't handle a mix of binary and continuous targets