# EDA on NYC Taxi Tip Data

In [0]:
# Load data
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create it.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Tip EDA")
    .getOrCreate()
)

# Read the cleaned trip CSV: the first row holds the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/FileStore/tables/NYC_Taxi_Trip_Record_Clean.csv")
)

# Mark the DataFrame for caching; every later cell queries it again.
df.cache()

In [0]:
# 2. Print the DataFrame schema (column names and their inferred types)
df.printSchema()

In [0]:

# 3. Preview the first 5 rows
df.show(5)

In [0]:

# 4. Count the total number of rows
df.count()

In [0]:
# 5. Get summary statistics (count, mean, stddev, min, max) of numeric columns?

from pyspark.sql.types import NumericType

# describe() with no arguments also summarises string columns, so pick the
# numeric columns out of the schema to match what the question asks for.
numeric_columns = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, NumericType)
]

df.select(*numeric_columns).describe().show()

In [0]:
# 6. Calculate average tip amount grouped by passenger count

from pyspark.sql.functions import avg

# Mean tip_amount for each distinct passenger_count value.
avg_tip_by_passenger = (
    df.groupBy("passenger_count")
    .agg(avg("tip_amount").alias("average_tip"))
)

# Print the groups in ascending passenger_count order (sort == orderBy).
avg_tip_by_passenger.sort("passenger_count").show()


In [0]:
# 7. Calculate total tip amount by payment type

# Import the functions module under an alias rather than
# `from pyspark.sql.functions import sum`, which would replace Python's
# built-in sum() in the notebook namespace for every later cell.
from pyspark.sql import functions as F

# Group by payment_type and sum the tip_amount
total_tip_by_payment = (
    df.groupBy("payment_type")
    .agg(F.sum("tip_amount").alias("total_tip_amount"))
)

# Show results, largest total first
total_tip_by_payment.orderBy("total_tip_amount", ascending=False).show()



In [0]:
# 8. Display records where the tip amount is greater than 5

# where() is an alias of filter(); show() prints a preview of the matches.
df.where(df["tip_amount"] > 5).show()



In [0]:
# 9. Identify outliers where tip amount is greater than 50?

# Keep only the trips whose tip exceeds 50.
outliers = df.where(df["tip_amount"] > 50)

# Show just the columns that help judge each outlier.
outliers.select(
    "lpep_pickup_datetime",
    "tip_amount",
    "fare_amount",
    "passenger_count",
    "payment_type",
    "trip_distance",
).show()



In [0]:
# 10. How to calculate the correlation between trip distance and tip amount?

# df.stat.corr is the same computation as df.corr; it returns a Python float.
correlation = df.stat.corr("trip_distance", "tip_amount")
print(f"Correlation between trip_distance and tip_amount: {correlation}")


In [0]:
# 11. Get average tip amount by day of the week

from pyspark.sql.functions import avg, date_format, expr

# Steps: label each trip with its pickup weekday name, average the tips per
# weekday, attach a numeric sort key (Sunday=1 ... Saturday=7) so the rows
# print in calendar order instead of alphabetically, then drop the key.
(
    df.withColumn("day_of_week", date_format("lpep_pickup_datetime", "EEEE"))
    .groupBy("day_of_week")
    .agg(avg("tip_amount").alias("average_tip"))
    .withColumn("day_order", expr("""
      CASE day_of_week
          WHEN 'Sunday' THEN 1
          WHEN 'Monday' THEN 2
          WHEN 'Tuesday' THEN 3
          WHEN 'Wednesday' THEN 4
          WHEN 'Thursday' THEN 5
          WHEN 'Friday' THEN 6
          WHEN 'Saturday' THEN 7
      END
  """))
    .orderBy("day_order")
    .select("day_of_week", "average_tip")
    .show()
)


In [0]:
# 12. Get average tip amount by hour of the day

from pyspark.sql.functions import hour, avg

# Bucket trips by the hour of their pickup timestamp, average the tips in
# each bucket, and print the buckets in ascending hour order.
(
    df.withColumn("pickup_hour", hour("lpep_pickup_datetime"))
    .groupBy("pickup_hour")
    .agg(avg("tip_amount").alias("average_tip"))
    .sort("pickup_hour")
    .show()
)


In [0]:
# 13. Calculate tip amount per mile and describe its statistics

from pyspark.sql.functions import col, when

# Calculate tip per mile, actually avoiding division by zero: the division is
# only evaluated when trip_distance is non-zero, and the column is NULL
# otherwise. Without the guard the division runs on every row before the
# filter below, which raises a divide-by-zero error when ANSI mode is enabled.
df_with_tip_per_mile = df.withColumn(
    "tip_per_mile",
    when(col("trip_distance") != 0, col("tip_amount") / col("trip_distance")),
)

# Keep only trips with a positive distance and a non-negative tip
df_filtered = df_with_tip_per_mile.filter(
    (col("trip_distance") > 0) & (col("tip_amount") >= 0)
)

# Describe statistics of tip per mile
df_filtered.select("tip_per_mile").describe().show()


In [0]:
# 14. Get records with invalid fare or tip amounts. Then remove these invalid records from dataframe (make it clean)

from pyspark.sql.functions import col

fare_col = col("fare_amount")
tip_col = col("tip_amount")

# A record is invalid when either amount is negative or missing (NULL).
invalid_records = df.where(
    (fare_col < 0) | (tip_col < 0) | fare_col.isNull() | tip_col.isNull()
)

# Show invalid records
invalid_records.show()

# A record is kept only when both amounts are present and non-negative.
# This builds a new DataFrame; `df` itself is left unchanged.
clean_df = df.where(
    (fare_col >= 0) & (tip_col >= 0) & fare_col.isNotNull() & tip_col.isNotNull()
)

# Show cleaned DataFrame (optional: to verify)
clean_df.show()


In [0]:
# 15. Calculate average tip by pickup zone (PUZone)
# NOTE(review): the original prompt asked for vendor ID and was later reworded
# to trip time ("changed ques"), but this cell groups by PUZone — confirm
# which grouping is actually intended.
from pyspark.sql.functions import avg

# Mean tip_amount for each distinct PUZone value.
avg_tip_by_pu_zone = (
    df.groupBy("PUZone")
    .agg(avg("tip_amount").alias("average_tip"))
)

# Show the result
avg_tip_by_pu_zone.show()



In [0]:
# 16. Get average tip amount per hour. Show graph as visualization

from pyspark.sql.functions import hour, avg

# Average tip for each pickup hour, ordered by hour so the chart's x-axis
# runs in ascending order.
avg_tip_per_hour = (
    df.withColumn("pickup_hour", hour("lpep_pickup_datetime"))
    .groupBy("pickup_hour")
    .agg(avg("tip_amount").alias("average_tip"))
    .sort("pickup_hour")
)

# display() is not imported in this notebook; it is presumably supplied by the
# notebook runtime, where the chart type is chosen in the output pane.
display(avg_tip_per_hour)



In [0]:
# 17. Get average tip by passenger count and display as bar chart

from pyspark.sql.functions import avg

# Group by passenger count and calculate the average tip amount. The result
# is ordered by passenger_count so the bars appear in a stable, ascending
# order; a bare groupBy gives no ordering guarantee.
avg_tip_by_passenger_count = (
    df.groupBy("passenger_count")
    .agg(avg("tip_amount").alias("average_tip"))
    .orderBy("passenger_count")
)

# Display the result as a bar chart
display(avg_tip_by_passenger_count)


In [0]:
# 18. Get fare vs tip values and display scatter plot style visualization

# Keep only the two columns that are plotted against each other.
fare_vs_tip = df.select(["fare_amount", "tip_amount"])

# Pass the (fare, tip) pairs to display() for plotting as a scatter chart.
display(fare_vs_tip)
