In [1]:
# Step 1: Install PySpark
!pip install pyspark
# Step 2: Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max, count




In [2]:
# Step 3: Create Spark Session
# getOrCreate() hands back an already-running session if there is one,
# otherwise it starts a new session with the application name given below.
spark = (
    SparkSession.builder
    .appName("Big Data Analysis - NYC Taxi")
    .getOrCreate()
)



In [3]:
# Step 4: Upload CSV File
# Only open the upload prompt when the dataset is not already present in the
# working directory, so re-running the notebook does not block on the file
# picker or upload a second copy of the same file.
import os

from google.colab import files

if os.path.exists("nyc_taxi_sample.csv"):
    # Nothing was uploaded in this run; the existing file is reused as-is.
    uploaded = {}
else:
    # files.upload() returns a dict keyed by the uploaded file names.
    uploaded = files.upload()


Saving nyc_taxi_sample.csv to nyc_taxi_sample.csv


In [6]:
# Step 5: Load CSV Data
# The file is referenced by its bare name because uploaded files typically land
# in the current working directory. The first row is used as the header and
# Spark infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("nyc_taxi_sample.csv")
)
# Preview the first five rows.
df.show(5)

+-------------------+-------------------+---------------+-------------+------------+
|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|total_amount|
+-------------------+-------------------+---------------+-------------+------------+
|2019-01-01 00:00:00|2019-01-01 00:10:00|              4|         0.04|       13.05|
|2019-01-01 00:01:00|2019-01-01 00:11:00|              5|         0.51|       14.54|
|2019-01-01 00:02:00|2019-01-01 00:12:00|              3|         2.62|        5.95|
|2019-01-01 00:03:00|2019-01-01 00:13:00|              5|         0.26|        4.73|
|2019-01-01 00:04:00|2019-01-01 00:14:00|              5|         0.41|        9.94|
+-------------------+-------------------+---------------+-------------+------------+
only showing top 5 rows



In [7]:
# Step 6: Explore Dataset
# Show the column names and types, then report how many rows were loaded.
df.printSchema()
record_total = df.count()
print("Total records:", record_total)


root
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- total_amount: double (nullable = true)

Total records: 100000


In [8]:
# Step 7: Average Trip Distance
# Aggregate over the whole DataFrame (no grouping) and display the mean of the
# trip_distance column as a one-row table.
df.agg(avg("trip_distance")).show()


+------------------+
|avg(trip_distance)|
+------------------+
| 2.000701299999989|
+------------------+



In [9]:
# Step 8: Maximum Fare Amount
# Display the largest value found in the total_amount column. `max` here is the
# Spark aggregate imported from pyspark.sql.functions, not Python's builtin.
df.agg(max("total_amount")).show()


+-----------------+
|max(total_amount)|
+-----------------+
|           159.72|
+-----------------+



In [10]:
# Step 9: Group by Passenger Count
# Count the rows for each passenger_count value and list them in ascending
# order of passenger_count.
(
    df.groupBy("passenger_count")
    .count()
    .orderBy("passenger_count")
    .show()
)


+---------------+-----+
|passenger_count|count|
+---------------+-----+
|              1|20018|
|              2|20082|
|              3|19732|
|              4|19981|
|              5|20187|
+---------------+-----+



In [11]:
# Step 10: Filter Long & Expensive Trips
# Keep only rows where trip_distance is above 10 AND total_amount is above 50,
# then preview five of them.
long_trip = col("trip_distance") > 10
expensive_trip = col("total_amount") > 50
df.where(long_trip & expensive_trip).show(5)


+-------------------+-------------------+---------------+-------------+------------+
|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|total_amount|
+-------------------+-------------------+---------------+-------------+------------+
|2019-01-01 07:48:00|2019-01-01 07:58:00|              1|         11.5|       52.08|
|2019-01-02 12:06:00|2019-01-02 12:16:00|              3|         10.9|       54.65|
|2019-01-02 16:44:00|2019-01-02 16:54:00|              4|        16.25|        75.8|
|2019-01-09 08:42:00|2019-01-09 08:52:00|              2|        10.25|       70.97|
|2019-01-19 12:17:00|2019-01-19 12:27:00|              1|        13.46|       77.65|
+-------------------+-------------------+---------------+-------------+------------+
only showing top 5 rows



In [12]:
# Step 11: Stop Spark Session
# Shut down the session created in Step 3. Cells that use `spark` or `df`
# are expected to fail after this point until the session is created again.
spark.stop()
