In [48]:
from pyspark.sql import SparkSession

# Build the Spark session used by every cell below; getOrCreate() hands back
# the already-running session when there is one instead of starting another
spark = (
    SparkSession.builder
    .appName("SparkApp")
    .getOrCreate()
)

In [49]:
# Task 1

# Location of the input CSV in Cloud Storage
csv_path = "gs://dataproc-staging-us-central1-591640780369-xk68flkr/data/2019-01-h1_cut (1).csv"

# Read the CSV file into a Spark DataFrame: the first row is used as the
# header, and column types are inferred from the data
df = spark.read.csv(
    csv_path,
    header=True,
    inferSchema=True,
)

# Keep only the three feature columns and the label column used by the later tasks
columns_df = df.select(
    "passenger_count",
    "pulocationid",
    "dolocationid",
    "total_amount",
)

# Display the first 10 rows
columns_df.show(n=10)


+---------------+------------+------------+------------+
|passenger_count|pulocationid|dolocationid|total_amount|
+---------------+------------+------------+------------+
|            1.0|       151.0|       239.0|        9.95|
|            1.0|       239.0|       246.0|        16.3|
|            3.0|       236.0|       236.0|         5.8|
|            5.0|       193.0|       193.0|        7.55|
|            5.0|       193.0|       193.0|       55.55|
|            5.0|       193.0|       193.0|       13.31|
|            5.0|       193.0|       193.0|       55.55|
|            1.0|       163.0|       229.0|        9.05|
|            1.0|       229.0|         7.0|        18.5|
|            2.0|       141.0|       234.0|        13.0|
+---------------+------------+------------+------------+
only showing top 10 rows



In [50]:
# Task 2

# Split the dataset for training and testing (80% / 20%, seeded so the split is repeatable).
# Both splits are cached: without this, every action on a split (the two counts
# below and any later use of trainDF / testDF) re-reads the CSV and re-runs the
# sampling from scratch.
trainDF, testDF = (
    split.cache() for split in columns_df.randomSplit([.8, .2], seed=42)
)

# Print the size of each split; count() is also the action that fills the cache
print(f"""There are {trainDF.count()} rows in the training set, and {testDF.count()} in the test set""")



There are 268536 rows in the training set, and 67314 in the test set


                                                                                

In [51]:
# Task 3

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# Combine the three input columns into the single vector column "features"
# NOTE(review): pulocationid / dolocationid look like identifiers rather than
# quantities, yet they are passed on as plain numeric features - confirm this is intended.
assembler = VectorAssembler(
    inputCols=["passenger_count", "pulocationid", "dolocationid"],
    outputCol="features",
)

# Decision tree regressor that reads "features" and learns to predict "total_amount"
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="total_amount",
)

In [52]:
# Task 4

from pyspark.ml import Pipeline

# Chain the two stages so they are fitted and applied together, in this order
pipeline = Pipeline(
    stages=[
        assembler,  # builds the "features" vector column
        dt,         # decision tree regressor with label "total_amount"
    ]
)

In [53]:
# Task 5

# Fit every pipeline stage on the training split; the result is the fitted
# pipeline model used to make predictions
model = pipeline.fit(dataset=trainDF)

                                                                                

In [46]:
# Task 6

# Apply the fitted pipeline to the held-out test split; the "prediction"
# column selected below comes from this step
predictions = model.transform(dataset=testDF)

# Show the first 10 rows: inputs, actual total_amount, and predicted value
predictions.select(
    "passenger_count",
    "pulocationid",
    "dolocationid",
    "total_amount",
    "prediction",
).show(n=10)

+---------------+------------+------------+------------+------------------+
|passenger_count|pulocationid|dolocationid|total_amount|        prediction|
+---------------+------------+------------+------------+------------------+
|            0.0|         4.0|         4.0|        5.75|21.582862669245657|
|            0.0|         4.0|        90.0|        10.8| 18.70489820600742|
|            0.0|         4.0|        90.0|        15.8| 18.70489820600742|
|            0.0|         7.0|         7.0|         3.3|21.582862669245657|
|            0.0|         7.0|        48.0|       21.95| 18.70489820600742|
|            0.0|         7.0|       161.0|        15.8| 18.70489820600742|
|            0.0|         7.0|       260.0|        10.8| 18.70489820600742|
|            0.0|        13.0|        13.0|         5.8|21.582862669245657|
|            0.0|        13.0|        90.0|        16.3| 18.70489820600742|
|            0.0|        13.0|       100.0|       19.55| 18.70489820600742|
+-----------

In [47]:
# Task 7

from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the "prediction" column against the "total_amount" label,
# configured to report the root-mean-square error
regressionEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="total_amount",
    metricName="rmse",
)

# Compute the RMSE over the test-set predictions
rmse = regressionEvaluator.evaluate(dataset=predictions)

# Report the result
print(f"RMSE value = {rmse}")




RMSE value = 67.05009053259988


                                                                                