## Exploratory Data Analysis with PySpark
---

In [1]:
# Install the notebook-scoped libraries this analysis needs.
# pandas and numpy are pinned to the versions the recorded run resolved
# (pandas-2.2.3 / numpy-2.0.2) so a re-run does not silently pick up a
# different release.
# TODO: pin seaborn, matplotlib and boto3 too once their installed versions
# are confirmed (e.g. with sc.list_packages()).
PYPI_PACKAGES = [
    "pandas==2.2.3",
    "numpy==2.0.2",
    "seaborn",
    "matplotlib",
    "boto3",
]
for package_spec in PYPI_PACKAGES:
    sc.install_pypi_package(package_spec)

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4,application_1741295189031_0005,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting pandas
  Downloading pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Collecting tzdata>=2022.7
  Downloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Collecting python-dateutil>=2.8.2
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
Collecting numpy>=1.22.4
  Downloading numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
Installing collected packages: tzdata, python-dateutil, numpy, pandas
  Attempting uninstall: python-dateutil
    Found existing installation: python-dateutil 2.8.1
    Not uninstalling python-dateutil at /usr/lib/python3.9/site-packages, outside environment /mnt/yarn/usercache/livy/appcache/application_1741295189031_0005/container_1741295189031_0005_01_000001/tmp/spark-05792363-6c8e-4aea-8337-885fae8083eb
    Can't uninstall 'python-dateutil'. No files were found to uninstall.
Successfully installed numpy-2.0.2 pandas-2.2.3 python-dateutil-2.9.0.post0 tzdata-2025.1


Colle

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import boto3
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, substring

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Get the Spark session for this analysis; getOrCreate() hands back an
# existing session when one is already running.
spark = (
    SparkSession.builder
    .appName("FinalProject - Exploratory Data Analysis")
    .getOrCreate()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Read the 2024_chicago_taxi_and_weather Parquet data from its S3 location
taxi_and_weather_path = "s3://csc555-jaewon/final_project/2024_chicago_taxi_and_weather/20250305_222658_00067_pntyd_bf5908a3-a2ab-433b-830a-6f02200e2504"
taxi_and_weather_df = spark.read.format("parquet").load(taxi_and_weather_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Preview the first five rows to sanity-check the schema and values
taxi_and_weather_df.show(n=5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+-----+--------------+--------------------+-------------+-------------+--------------------+-------------+-----------+
|    hour_slot|count|temperature_2m|relative_humidity_2m|precipitation|windspeed_10m|apparent_temperature|windgusts_10m|weathercode|
+-------------+-----+--------------+--------------------+-------------+-------------+--------------------+-------------+-----------+
|01/01/2024 00|  462|           0.0|                85.0|          0.1|         30.4|                -6.9|         44.3|         71|
|01/01/2024 01|  522|           0.1|                86.0|          0.1|         30.1|                -6.7|         44.6|         71|
|01/01/2024 02|  490|           0.4|                85.0|          0.1|         27.8|                -6.1|         43.6|         71|
|01/01/2024 03|  269|           0.2|                85.0|          0.0|         25.4|                -6.0|         40.0|          3|
|01/01/2024 04|  150|          -0.3|                85.0|          0.

---
### 1. EDA - Hourly Taxi Usage Heatmap

In [6]:
# Derive the hour of day from the last two characters of hour_slot
# (values look like "01/01/2024 00"), then sum the `count` column per hour.
# The column is referenced by its actual lower-case name so the lookup does
# not depend on Spark resolving column names case-insensitively.
# The aggregated column produced by sum("count") is named "sum(count)".
taxi_usage_df = (
    taxi_and_weather_df
    .withColumn("hour", substring(col("hour_slot"), -2, 2))
    .groupBy("hour")
    .sum("count")
    .orderBy("hour")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Collect the per-hour aggregate into pandas, cast the two-character hour
# strings to integers, and order the rows numerically by hour.
taxi_usage_pd = (
    taxi_usage_df.toPandas()
    .assign(hour=lambda frame: frame["hour"].astype(int))
    .sort_values("hour")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Index by hour so the heatmap rows are labelled with the hour of day
taxi_usage_heatmap = taxi_usage_pd.set_index(keys="hour")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Draw the per-hour totals as an annotated heatmap on an explicit figure/axes
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(taxi_usage_heatmap, cmap="coolwarm", annot=True, fmt=".0f", ax=ax)
ax.set_xlabel("Taxi Usage")
ax.set_ylabel("Hour of Day")
ax.set_title("Hourly Taxi Usage Heatmap")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Text(0.5, 1.0, 'Hourly Taxi Usage Heatmap')

In [10]:
# Write the current matplotlib figure to a temporary local PNG
local_path = '/tmp/hourly_taxi_usage_heatmap.png'
plt.gcf().savefig(local_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Upload the heatmap image to an S3 bucket.
# upload_file does not infer a MIME type from the file name, so the content
# type is set explicitly; this lets the object be served as a PNG image
# rather than as a generic binary download.
s3 = boto3.client('s3')
bucket_name = 'csc555-jaewon'
s3_path = 'final_project/2024_chicago_taxi_and_weather/hourly_taxi_usage_heatmap.png'
s3.upload_file(
    Filename=local_path,
    Bucket=bucket_name,
    Key=s3_path,
    ExtraArgs={"ContentType": "image/png"},
)
print(f"Heatmap successfully uploaded to s3://{bucket_name}/{s3_path}")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Heatmap successfully uploaded to s3://csc555-jaewon/final_project/2024_chicago_taxi_and_weather/hourly_taxi_usage_heatmap.png

In [12]:
# Delete the temporary PNG now that it has been uploaded
os.unlink(local_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

---
### 2. EDA - Correlation Heatmap between Taxi Count and Weather Data

In [13]:
# Keep only the taxi count and the numeric weather measurements that feed
# the correlation analysis.
correlation_columns = [
    'count',
    'temperature_2m',
    'relative_humidity_2m',
    'precipitation',
    'windspeed_10m',
    'apparent_temperature',
    'windgusts_10m',
]
weather_df = taxi_and_weather_df.select(*correlation_columns)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Convert the PySpark DataFrame to a Pandas DataFrame.
# toPandas() collects every selected row to the driver, so this relies on the
# selected data fitting in driver memory.
weather_pd = weather_df.toPandas()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Calculate the pairwise Pearson correlation matrix across all columns of
# weather_pd (the taxi count plus the selected weather measurements).
corr_matrix = weather_pd.corr(method = "pearson")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Mask the upper triangle (diagonal included) so each variable pair is drawn
# only once, then render the annotated correlation heatmap.
upper_triangle_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
    mask=upper_triangle_mask,
    ax=ax,
)
ax.set_title("Correlation Heatmap")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Text(0.5, 1.0, 'Correlation Heatmap')

In [17]:
# Write the current matplotlib figure to a temporary local PNG
local_path = '/tmp/correlation_heatmap.png'
plt.gcf().savefig(local_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Upload the correlation heatmap image to S3.
# upload_file does not infer a MIME type from the file name, so the content
# type is set explicitly; this lets the object be served as a PNG image
# rather than as a generic binary download.
s3 = boto3.client('s3')
bucket_name = 'csc555-jaewon'
s3_path = 'final_project/2024_chicago_taxi_and_weather/correlation_heatmap.png'
s3.upload_file(
    Filename=local_path,
    Bucket=bucket_name,
    Key=s3_path,
    ExtraArgs={"ContentType": "image/png"},
)
print(f"Heatmap successfully uploaded to s3://{bucket_name}/{s3_path}")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Heatmap successfully uploaded to s3://csc555-jaewon/final_project/2024_chicago_taxi_and_weather/correlation_heatmap.png

In [19]:
# Delete the temporary PNG now that it has been uploaded
os.unlink(local_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…