In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os 

# Set your working directory to a folder in your Google Drive. This way, if your notebook times out,
# your files will be saved in your Google Drive!

# the base Google Drive directory
root_dir = "/content/gdrive/MyDrive/"


# choose where you want your project files to be saved
project_folder = '2021 - MDSI/BDE'

In [None]:
def create_and_set_working_directory(project_folder):
  # check if your project folder exists. if not, it will be created.
  if os.path.isdir(root_dir + project_folder) == False:
    os.mkdir(root_dir + project_folder)
    print(root_dir + project_folder + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(root_dir + project_folder)

  # create a test file to make sure it shows up in the right place
  !touch 'new_file_in_working_directory.txt'
  print('\nYour working directory was changed to ' + root_dir + project_folder + \
        "\n\nAn empty text file was created there. You can also run !pwd to confirm the current working directory." )

create_and_set_working_directory(project_folder)


Your working directory was changed to /content/gdrive/MyDrive/2021 - MDSI/BDE

An empty text file was created there. You can also run !pwd to confirm the current working directory.


In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
!wget -q https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz     


In [None]:
!tar xf spark-2.4.5-bin-hadoop2.7.tgz   

In [None]:
!pip install -q findspark

In [None]:
## Set the environment variable
import os 
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/gdrive/MyDrive/2021 - MDSI/BDE/spark-2.4.5-bin-hadoop2.7"

In [None]:
import findspark
findspark.init() 

In [None]:
from pyspark.sql import SparkSession

from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType 
from pyspark.sql.types import FloatType
from pyspark.sql.types import TimestampType 
from pyspark.sql.types import DoubleType

import pyspark.sql.functions as F

In [None]:
spark = SparkSession.builder.appName('new_york_taxi_fare_prediction').getOrCreate()

# Start Building Pipeline Here


In [92]:
df_cleaned_super = spark.read.parquet("/content/gdrive/MyDrive/2021 - MDSI/BDE/data/df_transformed_2yr_final.parquet")

In [93]:
# Extract Year and 
df_cleaned_super = df_cleaned_super.withColumn('year', F.year(F.col('pickup_datetime'))).\
                                    withColumn('month', F.month(F.col('pickup_datetime')))

In [94]:
df_cleaned_super.show(2)

+--------+-------------------+-------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------------------+------------+------------+------------+-----------+------------+------------+-----------------+-----------+------------------------+----+-----+
|vendorid|    pickup_datetime|   dropoff_datetime|store_and_fwd_flag|ratecodeid|pulocationid|dolocationid|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|payment_type|taxi_campany|pickup_hour|week_day_num|week_day_abb|trip_duration_sec|speed_km_hr|trip_duration_range_mins|year|month|
+--------+-------------------+-------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------------------+------------+------------+------------+-----------+------------+------------+-----------

In [95]:
# Select the features interested: for SQL + Model
model_cols_list = ['year', 'month','passenger_count', 'trip_distance', 'total_amount', 'pickup_hour', 'week_day_num',
             'ratecodeid', 'pulocationid', 'taxi_campany', 'trip_duration_range_mins', 'store_and_fwd_flag', 'tolls_amount' ]
df_cleaned_model = df_cleaned_super.select(model_cols_list)

In [96]:
df_cleaned_model.show(5)

+----+-----+---------------+-------------+------------+-----------+------------+----------+------------+------------+------------------------+------------------+------------+
|year|month|passenger_count|trip_distance|total_amount|pickup_hour|week_day_num|ratecodeid|pulocationid|taxi_campany|trip_duration_range_mins|store_and_fwd_flag|tolls_amount|
+----+-----+---------------+-------------+------------+-----------+------------+----------+------------+------------+------------------------+------------------+------------+
|2017|    9|              1|         1.15|        8.76|         15|           6|         1|         234|       Green|               5-10 mins|                 N|         0.0|
|2017|    9|              1|          4.0|        21.8|         15|           6|         1|         233|       Green|              20-30 mins|                 N|         0.0|
|2017|    9|              1|         6.83|       27.88|         15|           6|         1|         230|       Green|        

In [97]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoderEstimator
from pyspark.ml import Pipeline

stages = []

## Encode the Cartegorical Features
cat_cols = ['ratecodeid', 'taxi_campany', 'trip_duration_range_mins', 'store_and_fwd_flag', 'pulocationid', 'week_day_num','year', 'month']
for cat_col in cat_cols:
    col_indexer = StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}_ind")
    col_encoder = OneHotEncoderEstimator(inputCols=[f"{cat_col}_ind"], outputCols=[f"{cat_col}_ohe"])
    stages += [col_indexer, col_encoder]
cat_cols_ohe = [f"{cat_col}_ohe" for cat_col in cat_cols]

In [98]:
## Vectorize the Numerical Features 

num_cols = ['passenger_count', 'trip_distance', 'pickup_hour', 'total_amount']

assembler = VectorAssembler(inputCols=cat_cols_ohe + num_cols, outputCol="features")
assembler.setHandleInvalid("keep")

VectorAssembler_73efc3742f4a

In [99]:
stages += [assembler]

pipeline = Pipeline(stages=stages)

In [100]:
### Supply the training Data to pipeline
pipeline_model = pipeline.fit(df_cleaned_model)

In [101]:
pipelined_cleaned_df = pipeline_model.transform(df_cleaned_model)

In [102]:
pipelined_cleaned_df.show(2)

+----+-----+---------------+-------------+------------+-----------+------------+----------+------------+------------+------------------------+------------------+------------+--------------+--------------+----------------+----------------+----------------------------+----------------------------+----------------------+----------------------+----------------+----------------+----------------+----------------+--------+---------+---------+--------------+--------------------+
|year|month|passenger_count|trip_distance|total_amount|pickup_hour|week_day_num|ratecodeid|pulocationid|taxi_campany|trip_duration_range_mins|store_and_fwd_flag|tolls_amount|ratecodeid_ind|ratecodeid_ohe|taxi_campany_ind|taxi_campany_ohe|trip_duration_range_mins_ind|trip_duration_range_mins_ohe|store_and_fwd_flag_ind|store_and_fwd_flag_ohe|pulocationid_ind|pulocationid_ohe|week_day_num_ind|week_day_num_ohe|year_ind| year_ohe|month_ind|     month_ohe|            features|
+----+-----+---------------+-------------+------

In [103]:
pipelined_cleaned_df.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- week_day_num: integer (nullable = true)
 |-- ratecodeid: string (nullable = true)
 |-- pulocationid: string (nullable = true)
 |-- taxi_campany: string (nullable = true)
 |-- trip_duration_range_mins: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- ratecodeid_ind: double (nullable = false)
 |-- ratecodeid_ohe: vector (nullable = true)
 |-- taxi_campany_ind: double (nullable = false)
 |-- taxi_campany_ohe: vector (nullable = true)
 |-- trip_duration_range_mins_ind: double (nullable = false)
 |-- trip_duration_range_mins_ohe: vector (nullable = true)
 |-- store_and_fwd_flag_ind: double (nullable = false)
 |-- store_and_fwd_flag_ohe: vector (nullable = tru

In [107]:
pipelined_cleaned_2yr_final_df = pipelined_cleaned_df.select(['features'] + model_cols_list)

In [108]:
pipelined_cleaned_2yr_final_df.show(2)

+--------------------+----+-----+---------------+-------------+------------+-----------+------------+----------+------------+------------+------------------------+------------------+------------+
|            features|year|month|passenger_count|trip_distance|total_amount|pickup_hour|week_day_num|ratecodeid|pulocationid|taxi_campany|trip_duration_range_mins|store_and_fwd_flag|tolls_amount|
+--------------------+----+-----+---------------+-------------+------------+-----------+------------+----------+------------+------------+------------------------+------------------+------------+
|(298,[0,6,8,11,18...|2017|    9|              1|         1.15|        8.76|         15|           6|         1|         234|       Green|               5-10 mins|                 N|         0.0|
|(298,[0,6,9,11,47...|2017|    9|              1|          4.0|        21.8|         15|           6|         1|         233|       Green|              20-30 mins|                 N|         0.0|
+-------------------

In [109]:
pipelined_cleaned_2yr_final_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- total_amount: float (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- week_day_num: integer (nullable = true)
 |-- ratecodeid: string (nullable = true)
 |-- pulocationid: string (nullable = true)
 |-- taxi_campany: string (nullable = true)
 |-- trip_duration_range_mins: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- tolls_amount: float (nullable = true)



In [None]:
# pipelined_cleaned_2yr_final_df.write.format('parquet').save("/content/gdrive/MyDrive/2021 - MDSI/BDE/data/df_pipelined_2yr_gdrive_final.parquet", mode='append')

In [115]:
pipelined_cleaned_2yr_final_df.show(2)

+--------------------+----+-----+---------------+-------------+------------+-----------+------------+----------+------------+------------+------------------------+------------------+------------+
|            features|year|month|passenger_count|trip_distance|total_amount|pickup_hour|week_day_num|ratecodeid|pulocationid|taxi_campany|trip_duration_range_mins|store_and_fwd_flag|tolls_amount|
+--------------------+----+-----+---------------+-------------+------------+-----------+------------+----------+------------+------------+------------------------+------------------+------------+
|(298,[0,6,8,11,18...|2017|    9|              1|         1.15|        8.76|         15|           6|         1|         234|       Green|               5-10 mins|                 N|         0.0|
|(298,[0,6,9,11,47...|2017|    9|              1|          4.0|        21.8|         15|           6|         1|         233|       Green|              20-30 mins|                 N|         0.0|
+-------------------

# Split the data - first 21 months for training and last 3 months for testing

In [111]:
pipelined_cleaned_3month_df = pipelined_cleaned_2yr_final_df.filter((F.col('year') == 2018) & (((F.col('month') == 10) |(F.col('month') == 11) |(F.col('month') == 12)) ))

In [112]:
pipelined_cleaned_21month_df = pipelined_cleaned_2yr_final_df.filter(((F.col('year') == 2017) & ((F.col('month') >= 1) & (F.col('month') <= 12)))|
                                                            ((F.col('year') == 2018) & ((F.col('month') >= 1) & (F.col('month') <= 9)))
                                                            )

In [113]:
pipelined_cleaned_21month_df.show(2)

+--------------------+----+-----+---------------+-------------+------------+-----------+------------+----------+------------+------------+------------------------+------------------+------------+
|            features|year|month|passenger_count|trip_distance|total_amount|pickup_hour|week_day_num|ratecodeid|pulocationid|taxi_campany|trip_duration_range_mins|store_and_fwd_flag|tolls_amount|
+--------------------+----+-----+---------------+-------------+------------+-----------+------------+----------+------------+------------+------------------------+------------------+------------+
|(298,[0,6,8,11,18...|2017|    9|              1|         1.15|        8.76|         15|           6|         1|         234|       Green|               5-10 mins|                 N|         0.0|
|(298,[0,6,9,11,47...|2017|    9|              1|          4.0|        21.8|         15|           6|         1|         233|       Green|              20-30 mins|                 N|         0.0|
+-------------------

In [114]:
pipelined_cleaned_3month_df.show(2)

+--------------------+----+-----+---------------+-------------+------------+-----------+------------+----------+------------+------------+------------------------+------------------+------------+
|            features|year|month|passenger_count|trip_distance|total_amount|pickup_hour|week_day_num|ratecodeid|pulocationid|taxi_campany|trip_duration_range_mins|store_and_fwd_flag|tolls_amount|
+--------------------+----+-----+---------------+-------------+------------+-----------+------------+----------+------------+------------+------------------------+------------------+------------+
|(298,[0,7,11,77,2...|2018|   11|              6|         2.36|        12.8|          1|           4|         1|         223|      Yellow|              10-20 mins|                 N|         0.0|
|(298,[0,8,11,54,2...|2018|   11|              6|         1.38|         7.8|          4|           4|         1|           7|      Yellow|               5-10 mins|                 N|         0.0|
+-------------------

# Linear Regression 



In [116]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol = 'features', labelCol='total_amount', maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [117]:
lr_model = lr.fit(pipelined_cleaned_21month_df)

In [118]:
lr_predictions_train = lr_model.transform(pipelined_cleaned_21month_df)
lr_predictions_train.select("prediction","total_amount","features").show(20)

+------------------+------------+--------------------+
|        prediction|total_amount|            features|
+------------------+------------+--------------------+
|  9.08329902236744|        8.76|(298,[0,6,8,11,18...|
|21.428128581429874|        21.8|(298,[0,6,9,11,47...|
|27.524592744206984|       27.88|(298,[0,6,9,11,17...|
|17.293518928586572|       17.76|(298,[0,6,9,11,18...|
|15.720854920704756|       15.96|(298,[0,6,7,11,22...|
|15.079499754350337|       15.36|(298,[0,6,7,11,30...|
|18.197390130852703|        18.5|(298,[0,6,7,11,37...|
|  9.62444741352145|         9.3|(298,[0,6,8,11,60...|
| 37.65650017733958|       39.06|(298,[0,6,9,11,24...|
| 9.610622671881897|        9.36|(298,[0,6,8,11,16...|
|15.845376023500343|        15.8|(298,[0,6,7,11,14...|
|13.793631385913939|        13.8|(298,[0,6,7,11,27...|
| 21.51537875119506|       21.96|(298,[0,6,9,11,38...|
|15.230764921294728|       15.36|(298,[0,6,7,11,30...|
|14.614366723847542|       14.65|(298,[0,6,7,11,50...|
|13.830865

In [119]:
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="total_amount", metricName="rmse")
rmse = lr_evaluator.evaluate(lr_predictions_train)
print("Root Mean Squared Error (RMSE) on Train data = %g" % rmse)

Root Mean Squared Error (RMSE) on Train data = 0.461471


In [61]:
lr_predictions_test = lr_model.transform(pipelined_cleaned_3month_df)
lr_predictions_test.select("prediction","total_amount","features").show(20)

+------------------+------------+--------------------+
|        prediction|total_amount|            features|
+------------------+------------+--------------------+
|12.997035248889294|        12.8|(288,[0,7,11,77,2...|
| 8.278218108661902|         7.8|(288,[0,8,11,54,2...|
|15.831465154675325|        15.8|(288,[0,7,11,113,...|
|19.933035724450573|        19.8|(288,[0,7,11,92,2...|
| 44.23234117573518|        44.3|(288,[0,9,11,113,...|
|15.062966643818646|        14.8|(288,[0,7,11,92,2...|
|35.146524721841736|        35.3|(288,[0,10,11,113...|
|12.997035248889294|        12.8|(288,[0,7,11,77,2...|
| 8.278218108661902|         7.8|(288,[0,8,11,54,2...|
|15.831465154675325|        15.8|(288,[0,7,11,113,...|
|19.933035724450573|        19.8|(288,[0,7,11,92,2...|
| 44.23234117573518|        44.3|(288,[0,9,11,113,...|
|15.062966643818646|        14.8|(288,[0,7,11,92,2...|
|35.146524721841736|        35.3|(288,[0,10,11,113...|
|12.997035248889294|        12.8|(288,[0,7,11,77,2...|
| 8.278218

In [62]:
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="total_amount", metricName="rmse")
rmse = lr_evaluator.evaluate(lr_predictions_test)
print("Root Mean Squared Error (RMSE) on Test data = %g" % rmse)

Root Mean Squared Error (RMSE) on Test data = 0.470649
