In [2]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# --- Inputs -----------------------------------------------------------------
# Raw hourly ridership export that the sampling step reads in chunks.
input_csv_path = 'data/NY_HOURLY_RIDERSHIP_2022.csv'
# US holidays table used to build the is_holiday flag.
holidays_csv_path = 'data/Holidays Dataset US.csv'

# --- Output -----------------------------------------------------------------
# Destination of the date-filtered sample written by the sampling step.
output_csv_path = 'data/NY_SAMPLED_RIDERSHIP_2022.csv'

# Sampling the dataset

In [19]:
# Number of CSV rows pulled into memory per chunk while sampling.
chunk_size = 10_000

In [32]:
# Bounds of the sampling window, as ISO date strings compared against
# transit_timestamp when filtering chunks.
# NOTE(review): the input file name says 2022 while this window is January
# 2023 -- confirm the export actually covers this range.
sample_start_date, sample_end_date = '2023-01-01', '2023-01-31'

In [21]:
# Stream the raw CSV chunk by chunk so the whole file never has to be held in
# memory, keeping only the rows whose timestamp falls in the sampling window.

# Layout of transit_timestamp in the raw export, e.g. "01/31/2023 05:00:00 PM".
timestamp_format = '%m/%d/%Y %I:%M:%S %p'

# Half-open window [start, end + 1 day). Comparing with `<= sample_end_date`
# would compare against midnight and silently drop every hour of the final day
# after 00:00, so the upper bound is pushed to the start of the following day.
window_start = pd.Timestamp(sample_start_date)
window_end = pd.Timestamp(sample_end_date).normalize() + pd.Timedelta(days=1)

sampled_chunks = []

# The reader is used as a context manager so the underlying file handle is
# closed as soon as iteration finishes (or fails).
with pd.read_csv(input_csv_path, chunksize=chunk_size) as chunk_iter:
    for chunk in chunk_iter:
        # Vectorized parse with an explicit format; this replaces the
        # deprecated `date_parser=` hook and its per-value strptime calls.
        chunk['transit_timestamp'] = pd.to_datetime(
            chunk['transit_timestamp'], format=timestamp_format
        )

        in_window = (
            (chunk['transit_timestamp'] >= window_start)
            & (chunk['transit_timestamp'] < window_end)
        )
        filtered_chunk = chunk[in_window]

        # Skip chunks that contributed nothing so the list stays small.
        if not filtered_chunk.empty:
            sampled_chunks.append(filtered_chunk)


In [None]:
# Combine the per-chunk slices into one frame and persist it, so later cells
# can start from the sampled file instead of re-scanning the raw export.
if not sampled_chunks:
    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # raise the same exception type with a message that points at the cause.
    raise ValueError(
        f'No rows in {input_csv_path} fall between {sample_start_date} '
        f'and {sample_end_date}; nothing to sample.'
    )

sampled_df = pd.concat(sampled_chunks)
sampled_df.to_csv(output_csv_path, index=False)

# Preprocessing

In [None]:
# Reload the sampled extract from disk; CSV stores the timestamps as text, so
# the column is parsed back into datetimes on read.
sampled_df = pd.read_csv(
    filepath_or_buffer=output_csv_path,
    parse_dates=['transit_timestamp'],
)

In [33]:
# Remove the four unused columns in a single pass.
sampled_df = sampled_df.drop(
    columns=[
        'payment_method',
        'fare_class_category',
        'Georeference',
        'transit_mode',
    ]
)

In [34]:
# Calendar date (time of day stripped) of each observation; the holiday
# lookup keys on this column.
sampled_df = sampled_df.assign(date=sampled_df['transit_timestamp'].dt.date)

In [55]:
# Add an `is_holiday` column flagging rows that fall on a national holiday.

holidays_dataset = pd.read_csv(holidays_csv_path)

# Keep only rows whose Type is exactly the stringified list below. `.copy()`
# detaches the result from the original frame so the column assignment that
# follows does not trigger SettingWithCopyWarning.
holidays_dataset = holidays_dataset[holidays_dataset['Type'] == "['National holiday']"].copy()

# Unparseable dates become NaT (errors='coerce'); they are dropped before
# building the lookup set so NaT can never count as a holiday.
holidays_dataset['Date'] = pd.to_datetime(holidays_dataset['Date'], errors='coerce')
holiday_dates = set(holidays_dataset['Date'].dropna().dt.date)

# Vectorized membership test instead of a per-row Python lambda.
sampled_df['is_holiday'] = sampled_df['date'].isin(holiday_dates)

# Data analysis

In [None]:
# Derive the weekday name and hour of day used to slice and index the plots.
sampled_df['day_of_week'] = sampled_df['transit_timestamp'].dt.day_name()
sampled_df['hour'] = sampled_df['transit_timestamp'].dt.hour

days_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

# One figure per weekday: a thin blue line per station complex (ridership
# summed per hour) with the cross-station mean overlaid in red.
for day in days_order:
    day_df = sampled_df.loc[sampled_df['day_of_week'].eq(day)]

    # Rows: hour of day; columns: station complex; cells: summed ridership.
    pivot_df = pd.pivot_table(
        day_df,
        index='hour',
        columns='station_complex',
        values='ridership',
        aggfunc='sum',
    )

    # Mean across stations for each hour, stored alongside the station columns.
    pivot_df['Average'] = pivot_df.mean(axis=1)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(
        data=pivot_df.drop(columns=['Average']),
        palette=['blue'] * (len(pivot_df.columns) - 1),  # one entry per station line
        legend=False,
        linewidth=1,
        ax=ax,
    )
    sns.lineplot(data=pivot_df['Average'], color='red', linewidth=2, ax=ax)
    ax.set_title(f'Ridership by Hour for {day}')
    ax.set_xlabel('Hour of the Day')
    ax.set_ylabel('Ridership')
    ax.set_xticks(range(24))
    plt.show()