In [2]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium 

In [3]:
from pyspark.sql import SparkSession

# Create a spark session with increased memory allocation
spark = (
    SparkSession.builder.appName("ADS Project1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "8g")  # Set the driver memory to 8GB
    .config("spark.executor.memory", "8g")  # Set the executor memory to 8GB
    .getOrCreate()
)

24/08/30 01:23:53 WARN Utils: Your hostname, Hanshis-Laptop.local resolves to a loopback address: 127.0.0.1; using 100.94.176.147 instead (on interface en0)
24/08/30 01:23:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/30 01:23:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Datasets 

In [4]:
# read the data from preprocessed tlc data
tdf = spark.read.parquet("../data/raw/tlc_df.parquet") 

In [5]:
# read the preprocessed weather data
wdf = spark.read.csv("../data/raw/NYC_weather_raw.csv", header=True, inferSchema=True)

In [6]:
# read the preprocessed event data
edf = spark.read.parquet("../data/raw/NYC_Permitted_Event_Information_Historical.parquet") 

In [7]:
# Load the zones DataFrame
zones = spark.read.csv("../data/landing/external/taxi_zones.csv", header=True, inferSchema=True)

In [8]:
# Aggregate the number of events per date, hour, and borough
edf_daily = edf.groupBy('Start Date', 'Event Borough').agg(count('Event ID').alias('Number of Events'))

In [9]:
# Join the tdf DataFrame with zones to get the borough for PULocationID
tdf = tdf.join(zones.select('LocationID', 'borough').withColumnRenamed('borough', 'PUBorough'),
               tdf['PULocationID'] == zones['LocationID'], 'left').drop('LocationID')

# Join the tdf DataFrame with zones to get the borough for DOLocationID
tdf = tdf.join(zones.select('LocationID', 'borough').withColumnRenamed('borough', 'DOBorough'),
               tdf['DOLocationID'] == zones['LocationID'], 'left').drop('LocationID')

In [10]:
# Calculate correlations for hourly trip count
temp_hourly_corr = df['TMP'].corr(df['hourly_trip_count'])
vis_hourly_corr = df['VIS'].corr(df['hourly_trip_count'])
events_hourly_corr = np.log1p(df['Number of Events']).corr(df['hourly_trip_count'])

# Calculate correlations for daily trip count
temp_daily_corr = df['TMP'].corr(df['daily_trip_count'])
vis_daily_corr = df['VIS'].corr(df['daily_trip_count'])
events_daily_corr = df['Number of Events'].corr(df['daily_trip_count'])

# Create a DataFrame to hold the correlation results
correlation_matrix = pd.DataFrame({
    'Correlation': ['temp_hourly_corr', 'vis_hourly_corr', 'events_hourly_corr', 
                    'temp_daily_corr', 'vis_daily_corr', 'events_daily_corr'],
    'Value': [temp_hourly_corr, vis_hourly_corr, events_hourly_corr, 
              temp_daily_corr, vis_daily_corr, events_daily_corr]
})

# Display the correlation matrix
print(correlation_matrix)


NameError: name 'df' is not defined

### Daily demand

In [None]:
# Group by 'pickup_date' and 'PUBorough' to get the sum of trips and average weather data
daily_demand = clean_df.groupby(['pickup_date', 'PUBorough']).agg({
    'daily_trip_count': 'sum',
    'TMP': 'mean',
    'VIS': 'mean',
    'Number of Events': 'sum'
}).reset_index()

# Rename the columns to match the desired output
daily_demand.rename(columns={
    'daily_trip_count': 'total_daily_trips',
    'TMP': 'avg_temperature',
    'VIS': 'avg_visibility',
    'Number of Events': 'total_events'
}, inplace=True)

# Round the average temperature and visibility to 1 decimal place
daily_demand['avg_temperature'] = daily_demand['avg_temperature'].round(1)
daily_demand['avg_visibility'] = daily_demand['avg_visibility'].round(1)

# Show the first 5 rows of the aggregated result
print(daily_demand.head(5))


### Daily Trips by borough

In [None]:
# Aggregate the number of daily trips by PUBorough
agg_data_daily = clean_df.groupby('PUBorough').agg({
    'daily_trip_count': 'sum'
}).reset_index()

# Rename the columns to match the expected output
agg_data_daily.rename(columns={'PUBorough': 'borough', 'daily_trip_count': 'total_daily_trips'}, inplace=True)

# Display the first 5 rows of the aggregated data
print(agg_data_daily.head())

# Create a bar chart with a log scale
fig, ax = plt.subplots(figsize=(10, 6))

# Create the bar plot using Seaborn
sbs.barplot(x='borough', y='total_daily_trips', data=agg_data_daily, ax=ax)

# Set the y-axis to a logarithmic scale
ax.set_yscale('log')

# Set the title of the plot
ax.set_title('Total Daily Trips by Borough')

# Rotate the x-axis labels for better readability
plt.xticks(rotation=45)

# Show the plot
plt.show()


In [None]:
# save the plot
fig.savefig("../plots/total_daily_trips_by_borough.png")

### Pickups for each day of week

In [None]:
# Plot number of pick ups per day of the week
# Group by 'Day of Week' and count the number of trips
trips_per_day_df = clean_df.groupby('Day of Week').agg({'daily_trip_count': 'sum'}).reset_index()

In [None]:
# Plot the number of trips per day of the week
plt.figure(figsize=(12, 6))
sbs.barplot(x='Day of Week', y='daily_trip_count', data=trips_per_day_df, order=days_order)
plt.title('Number of Trips Per Day of the Week')
plt.xlabel('Day of Week')
plt.ylabel('Number of Trips')
plt.grid(True)
plt.tight_layout()

In [None]:
# save the plot
plt.savefig('../plots/number_of_trips_per_day_of_week.png')