In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [2]:
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name "NYC_Taxi_Analysis".
spark = (
    SparkSession.builder
    .appName("NYC_Taxi_Analysis")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/04 20:37:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read every parquet file matched by the NYC/ glob into one DataFrame.
data = spark.read.format("parquet").load("NYC/*.parquet")

In [15]:
# Display the column names together with the number of columns.
(data.columns, len(data.columns))

(['VendorID',
  'tpep_pickup_datetime',
  'tpep_dropoff_datetime',
  'passenger_count',
  'trip_distance',
  'RatecodeID',
  'store_and_fwd_flag',
  'PULocationID',
  'DOLocationID',
  'payment_type',
  'fare_amount',
  'extra',
  'mta_tax',
  'tip_amount',
  'tolls_amount',
  'improvement_surcharge',
  'total_amount',
  'congestion_surcharge',
  'airport_fee'],
 19)

# Start Spark session

In [27]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when one exists, so
# re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("NYC_Taxi_Analysis")
    .getOrCreate()
)


# Load Data

In [28]:
# Glob pattern matching every parquet file in the local NYC directory.
parquet_file_path = "./NYC/*.parquet"

# Load all matched files into a single DataFrame.
taxi_df = spark.read.format("parquet").load(parquet_file_path)

# Trip Analysis

## Calculate Duration and Distance:
Create a new `trip_duration` column (in seconds) as the difference between the dropoff and pickup times. Also compute, for each record, the trip distance divided by the passenger count; this per-passenger distance is stored in the `avg_distance` column:

In [29]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Trip duration in seconds: dropoff time minus pickup time, both converted
# to Unix timestamps first.
taxi_df = taxi_df.withColumn(
    "trip_duration",
    F.unix_timestamp(F.col("tpep_dropoff_datetime"))
    - F.unix_timestamp(F.col("tpep_pickup_datetime")),
)

In [30]:
# Distance per passenger for each trip. The column keeps its existing name
# "avg_distance" because the aggregation further down reads it by that name.
#
# The division is only evaluated when passenger_count is non-zero. Rows with a
# zero or NULL passenger_count get NULL, which is what Spark's non-ANSI
# division already returns, and the guard avoids a DIVIDE_BY_ZERO error when
# spark.sql.ansi.enabled is turned on.
taxi_df = taxi_df.withColumn(
    "avg_distance",
    F.when(
        F.col("passenger_count") != 0,
        F.col("trip_distance") / F.col("passenger_count"),
    ),
)

## Extract Time of Day, Day of Week, and Month of Year:
- Extract the hour of day, day of week, and month from the `tpep_pickup_datetime` column

In [31]:
# Hour of day (0-23) at which the trip was picked up.
taxi_df = taxi_df.withColumn(
    "pickup_hour",
    F.hour(F.col("tpep_pickup_datetime")),
)

In [32]:
# Day of week of the pickup; Spark's dayofweek numbers days 1 (Sunday)
# through 7 (Saturday).
taxi_df = taxi_df.withColumn(
    "pickup_day_of_week",
    F.dayofweek(F.col("tpep_pickup_datetime")),
)

In [33]:
# Calendar month (1-12) of the pickup.
taxi_df = taxi_df.withColumn(
    "pickup_month",
    F.month(F.col("tpep_pickup_datetime")),
)

## Group and Aggregate:
Group the data by time of day, day of week, and month of year, and calculate the average duration and distance for each group:

In [34]:
# Mean trip duration and mean "avg_distance" for every
# (hour, day-of-week, month) combination, sorted by those same keys.
agg_df = (
    taxi_df
    .groupBy("pickup_hour", "pickup_day_of_week", "pickup_month")
    .agg(
        F.avg("trip_duration").alias("avg_duration"),
        F.avg("avg_distance").alias("avg_distance"),
    )
    .orderBy("pickup_hour", "pickup_day_of_week", "pickup_month")
)

## Show the Results:
Display the first rows of the aggregated DataFrame (`show()` prints 20 rows by default):

In [35]:
# Print the first 20 rows, truncating long cell values (show()'s defaults,
# spelled out explicitly).
agg_df.show(n=20, truncate=True)



+-----------+------------------+------------+------------------+------------------+
|pickup_hour|pickup_day_of_week|pickup_month|      avg_duration|      avg_distance|
+-----------+------------------+------------+------------------+------------------+
|          0|                 1|           1| 888.9417608770127|3.2521391261294736|
|          0|                 1|           2| 726.9726522187823|  2.89541996285979|
|          0|                 1|           3| 888.4640534063677|2.8471578638497625|
|          0|                 1|           4| 854.6067429406037|2.9782131248083417|
|          0|                 1|           5| 883.9497659700705|2.8323654661521616|
|          0|                 1|           6| 954.6639178045153| 2.721734596752614|
|          0|                 1|           7|1011.6342042755344| 2.835302943232113|
|          0|                 1|           8| 997.7019489609131|2.8669667749237213|
|          0|                 1|           9|1057.5752172184677|2.5259332031

                                                                                

## Group and Count Pickup and Dropoff Locations:

In [37]:
# Number of trips per pickup location ID, most frequent first.
pickup_locations = (
    taxi_df
    .groupBy("PULocationID")
    .count()
    .orderBy(F.desc("count"))
)

# Keep only the ten pickup location IDs with the highest counts.
top_pickup_locations = pickup_locations.limit(10)

In [None]:
# Number of trips per dropoff location ID, most frequent first.
dropoff_locations = (
    taxi_df
    .groupBy("DOLocationID")
    .count()
    .orderBy(F.desc("count"))
)

# Keep only the ten dropoff location IDs with the highest counts.
top_dropoff_locations = dropoff_locations.limit(10)