In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, hour, dayofweek, month
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [10]:
spark = SparkSession.builder.appName("TaxiAnalysis").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/04 23:58:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
def load_data(file_path):
    """
    column names: VendorID, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, RatecodeID, store_and_fwd_flag, PULocationID, DOLocationID, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee
    """
    # Load the NYC taxi data from Parquet files
    data_nyc = spark.read.parquet(file_path)

    return data_nyc

In [3]:
def trip_analysis(data):
    """_summary_
    Function Task: Average duration and distance of rides: Compare these metrics by time of day, day of week, 
    and month of year. This can reveal patterns such as longer trips during rush hours, on 
    weekends, or during holiday seasons.
    """
    # Average duration and distance of rides by time of day, day of week, and month of year
    window = Window.orderBy("pickup_datetime")

    trip = data.withColumn("hour", hour("pickup_datetime")) \
        .withColumn("day_of_week", dayofweek("pickup_datetime")) \
        .withColumn("month", month("pickup_datetime"))

    avg_duration_by_hour = trip.groupBy(
        "hour").avg("trip_duration_minutes")
    avg_distance_by_hour = trip.groupBy("hour").avg("trip_distance")

    avg_duration_by_day = trip.groupBy(
        "day_of_week").avg("trip_duration_minutes")
    avg_distance_by_day = trip.groupBy(
        "day_of_week").avg("trip_distance")

    avg_duration_by_month = trip.groupBy(
        "month").avg("trip_duration_minutes")
    avg_distance_by_month = trip.groupBy("month").avg("trip_distance")

    avg_duration_by_hour.show()
    avg_distance_by_hour.show()
    avg_duration_by_day.show()
    avg_distance_by_day.show()
    avg_duration_by_month.show()
    avg_distance_by_month.show()

In [4]:
def popular_locations(data):
    # Identify the top 10 pickup and dropoff locations
    top_pickup_locations = data.groupBy(
        "PULocationID").count().orderBy(col("count").desc()).limit(10)
    top_dropoff_locations = data.groupBy(
        "DOLocationID").count().orderBy(col("count").desc()).limit(10)

    top_pickup_locations.show()
    top_dropoff_locations.show()

In [5]:
def tip_analysis(data):
    # Tip percentage by trip
    data = data.withColumn("tip_percentage", col(
        "tip_amount") / col("total_amount") * 100)

    # Tips by time: Does the time of day, week, or year affect tipping behavior?
    tip_by_hour = data.groupBy(hour("pickup_datetime")).avg(
        "tip_percentage").orderBy("hour")
    tip_by_day_of_week = data.groupBy(dayofweek("pickup_datetime")).avg(
        "tip_percentage").orderBy("dayofweek")
    tip_by_month = data.groupBy(month("pickup_datetime")).avg(
        "tip_percentage").orderBy("month")

    tip_by_hour.show()
    tip_by_day_of_week.show()
    tip_by_month.show()

In [6]:
def fare_analysis(data):
    # Can you calculate the average fare by pickup & drop-off location?
    avg_fare_by_pickup_dropoff = data.groupBy(
        "PULocationID", "DOLocationID").avg("fare_amount")
    avg_fare_by_pickup_dropoff.show()

    # Can you calculate the average fare by Passenger count?
    avg_fare_by_passenger_count = data.groupBy(
        "passenger_count").avg("fare_amount")
    avg_fare_by_passenger_count.show()

    # Can you correlate the fare amount and the distance trip?
    data = data.withColumn("fare_per_distance", col(
        "fare_amount") / col("trip_distance"))

    correlation_fare_distance = data.stat.corr(
        "fare_per_distance", "trip_distance")
    print("Correlation between fare and trip distance:", correlation_fare_distance)

In [7]:
def traffic_analysis(data):
    # Calculate the average speed of a trip (average trip speed in miles per hour)
    data = data.withColumn("trip_speed", col(
        "trip_distance") / (col("trip_duration_minutes") / 60))

    # Group the average speed by trip then hour, day, week
    avg_speed_by_hour = data.groupBy(
        hour("pickup_datetime")).avg("trip_speed").orderBy("hour")
    avg_speed_by_day_of_week = data.groupBy(
        dayofweek("pickup_datetime")).avg("trip_speed").orderBy("dayofweek")
    avg_speed_by_month = data.groupBy(
        month("pickup_datetime")).avg("trip_speed").orderBy("month")

    avg_speed_by_hour.show()
    avg_speed_by_day_of_week.show()
    avg_speed_by_month.show()

In [8]:
def demand_prediction(data):
    # Feature engineering: Use the date and time of the pickups to create features for the model
    data = data.withColumn("hour_of_day", hour("pickup_datetime"))
    data = data.withColumn("day_of_week", dayofweek("pickup_datetime"))
    data = data.withColumn("month", month("pickup_datetime"))

    # Regression model: Use linear regression to predict the number of pickups in the next hour
    assembler = VectorAssembler(
        inputCols=["hour_of_day", "day_of_week", "month"], outputCol="features")
    data = assembler.transform(data)

    lr = LinearRegression(featuresCol="features",
                          labelCol="pickups_in_next_hour")
    lr_model = lr.fit(data)

    # Make predictions for the next hour based on the features
    next_hour_data = spark.createDataFrame(
        [(18, 2, 7)], ["hour_of_day", "day_of_week", "month"])
    next_hour_data = assembler.transform(next_hour_data)
    predictions = lr_model.transform(next_hour_data)

    predicted_pickups = predictions.select("prediction").first()[0]
    print("Predicted number of pickups in the next hour:", predicted_pickups)

In [11]:
cwd = os.getcwd()
# change this with your google cloud storage path
path = os.path.join(cwd, 'NYC', '*.parquet')
data_df = load_data(path)

In [12]:
trip_analysis(data_df)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `pickup_datetime` cannot be resolved. Did you mean one of the following? [`tpep_pickup_datetime`, `tpep_dropoff_datetime`, `airport_fee`, `payment_type`, `tip_amount`].;
'Project [VendorID#0L, tpep_pickup_datetime#1, tpep_dropoff_datetime#2, passenger_count#3, trip_distance#4, RatecodeID#5, store_and_fwd_flag#6, PULocationID#7L, DOLocationID#8L, payment_type#9L, fare_amount#10, extra#11, mta_tax#12, tip_amount#13, tolls_amount#14, improvement_surcharge#15, total_amount#16, congestion_surcharge#17, airport_fee#18, hour('pickup_datetime, Some(Europe/Paris)) AS hour#38]
+- Relation [VendorID#0L,tpep_pickup_datetime#1,tpep_dropoff_datetime#2,passenger_count#3,trip_distance#4,RatecodeID#5,store_and_fwd_flag#6,PULocationID#7L,DOLocationID#8L,payment_type#9L,fare_amount#10,extra#11,mta_tax#12,tip_amount#13,tolls_amount#14,improvement_surcharge#15,total_amount#16,congestion_surcharge#17,airport_fee#18] parquet
