In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# config memory to the session, incase needs the space for training
spark = SparkSession.builder.appName("NYC Yellow Taxi Analysis")\
                            .config("spark.driver.memory", "12g")\
                            .getOrCreate()

# Read files to data

In [3]:
# Load the dataset in .parquet format
import os

path = 'raw_data'
parquet_file_name = os.listdir(path)

raw_data = []

for file_name in parquet_file_name:
    parquet_path = os.path.join(path, file_name)
    raw_data.append(spark.read.parquet(parquet_path))

zone_table_path = 'taxi_zone_lookup.csv'
zone_df = spark.read.option('header', 'true').csv(zone_table_path)

In [4]:
# change the datatype for January data to make sure the consistent
from pyspark.sql.types import IntegerType, DoubleType, LongType

raw_data[0] = raw_data[0].withColumn('VendorID', raw_data[0]['VendorID'].cast(IntegerType()))\
                        .withColumn('passenger_count', raw_data[0]['passenger_count'].cast(LongType()))\
                        .withColumn('RatecodeID', raw_data[0]['RatecodeID'].cast(LongType()))\
                        .withColumn('PULocationID', raw_data[0]['PULocationID'].cast(IntegerType()))\
                        .withColumn('DOLocationID', raw_data[0]['DOLocationID'].cast(IntegerType()))

# change the zone to integer
zone_df = zone_df.withColumn('LocationID', zone_df['LocationID'].cast(IntegerType()))

In [5]:
# union the raw data into one df
df = raw_data[0]

for i in range(1, len(raw_data)):
    df = df.union(raw_data[i])

# MLLib (WIP)

## predict the total amount

In [6]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [7]:
from pyspark.sql.functions import hour, minute, second

preprocess_time = df.withColumn("PU hour", hour("tpep_pickup_datetime")) \
                    .withColumn("PU minute", minute("tpep_pickup_datetime")) \
                    .withColumn("DO hour", hour("tpep_dropoff_datetime")) \
                    .withColumn("DO minute", minute("tpep_dropoff_datetime"))

In [8]:
# extract the necessary column data
selected_column = preprocess_time.select("PU hour", "PU minute","DO hour", "DO minute",
                            "trip_distance", "passenger_count", "PULocationID",
                            "DOLocationID", "tolls_amount", "total_amount")

# dropna values if any
selected_column = selected_column.dropna()

In [9]:
# create feature vector
feature_columns = ["PU hour", "PU minute","DO hour", "DO minute",
                            "trip_distance", "passenger_count", "PULocationID", 
                            "DOLocationID", "tolls_amount"]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

transform_df = assembler.transform(selected_column)

In [10]:
# split the dataset into training and testing set by ratio
train_df, test_df = transform_df.randomSplit([0.8, 0.2], seed=22)

In [11]:
# create the linear regression model
linear = LinearRegression(featuresCol='features', labelCol='total_amount')

In [12]:
# train the model
linear_model = linear.fit(train_df)

In [13]:
# test prediction
predictions = linear_model.transform(test_df)

In [14]:
predictions.select("PU hour", "PU minute","DO hour", "DO minute",
                            "trip_distance", "passenger_count", "PULocationID", 
                            "DOLocationID", "tolls_amount",
                            "total_amount", "prediction").orderBy(col('trip_distance').desc())
predictions.show()

+-------+---------+-------+---------+-------------+---------------+------------+------------+------------+------------+--------------------+------------------+
|PU hour|PU minute|DO hour|DO minute|trip_distance|passenger_count|PULocationID|DOLocationID|tolls_amount|total_amount|            features|        prediction|
+-------+---------+-------+---------+-------------+---------------+------------+------------+------------+------------+--------------------+------------------+
|      0|        0|      0|        0|          0.0|              1|          97|          97|         0.0|        36.0|(9,[5,6,7],[1.0,9...|24.568276416418758|
|      0|        0|      0|        0|          0.0|              1|         107|         107|         0.0|        16.2|(9,[5,6,7],[1.0,1...|24.247984578267165|
|      0|        0|      0|        0|          0.0|              1|         234|         264|         0.0|        6.88|(9,[5,6,7],[1.0,2...|  19.8341059268232|
|      0|        0|      0|        1|   

In [15]:
training_summary = linear_model.summary
print("RMSE: %f" % training_summary.rootMeanSquaredError)
print("R2: %f" % training_summary.r2)

RMSE: 46.969226
R2: 0.111854
