In [51]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from math import radians, cos, sin, asin, sqrt
import numpy as np

In [52]:
# Raw hourly ridership export; it is streamed in chunks by pd.read_csv further down.
input_csv_path = 'data/NY_HOURLY_RIDERSHIP_2022.csv'
# Destination of the aggregated (period, weekday, station) totals written at the end.
output_csv_path = 'data/NY_AGGREGATED_RIDERSHIP_2022.csv'

In [53]:
# Number of CSV rows loaded per chunk (passed to pd.read_csv as `chunksize`).
chunk_size = 10000
# Sampling window used by preprocess_chunk. Both bounds are compared inclusively
# (>= start, <= end) against `transit_timestamp`.
# NOTE(review): the window covers 2023 while the file names say 2022 -- confirm the
# input file really contains 2023 rows, otherwise every chunk is filtered out.
# NOTE(review): a bare date as the inclusive end bound presumably only admits the
# midnight timestamp of 2023-07-01 -- confirm that is the intended cut-off.
sample_start_date = '2023-01-01'
sample_end_date = '2023-07-01'

In [54]:
def map_hour_to_period(hour):
    """Map an hour of the day to one of four six-hour periods.

    Period 1 covers hours 1-6, period 2 covers 7-12, period 3 covers 13-18
    and period 4 covers 19-24 together with hour 0 (midnight).

    Returns:
        The period number (1-4), or None when ``hour`` falls in none of the
        ranges above.
    """
    # Midnight belongs to the last period of the day rather than the first.
    if hour == 0:
        return 4

    period_bounds = ((1, 6), (7, 12), (13, 18), (19, 24))
    for period, (first_hour, last_hour) in enumerate(period_bounds, start=1):
        if first_hour <= hour <= last_hour:
            return period

    return None

In [64]:
def preprocess_chunk(chunk_to_process):
    """Restrict a raw chunk to the sampling window and derive time features.

    Rows are kept when ``transit_timestamp`` lies between the module-level
    ``sample_start_date`` and ``sample_end_date`` (both bounds inclusive).
    Three columns are then added:

    * ``hour``    -- hour of day taken from ``transit_timestamp``
    * ``weekday`` -- ``transit_timestamp.dt.weekday`` (pandas: Monday=0 ... Sunday=6)
    * ``period``  -- result of ``map_hour_to_period`` applied to ``hour``

    Args:
        chunk_to_process: DataFrame with a datetime-typed ``transit_timestamp``
            column (the ``.dt`` accessor is used on it). It is not modified.

    Returns:
        A new DataFrame holding the surviving rows plus the derived columns.
    """
    timestamps = chunk_to_process['transit_timestamp']
    in_window = (timestamps >= sample_start_date) & (timestamps <= sample_end_date)

    # Filter first and copy afterwards: only the surviving rows are duplicated
    # (instead of the whole chunk), and the explicit copy makes the result an
    # independent frame that is safe to add columns to.
    sampled = chunk_to_process.loc[in_window].copy()

    sampled['hour'] = sampled['transit_timestamp'].dt.hour
    sampled['weekday'] = sampled['transit_timestamp'].dt.weekday
    sampled['period'] = sampled['hour'].apply(map_hour_to_period)

    return sampled

In [65]:
def aggregate_chunk(chunk_to_aggregate):
    """Total the ridership per (period, weekday, station_complex_id).

    For every key combination the ``ridership`` values are summed and the
    first ``station_complex`` value of the group is kept. Because the result
    has the same columns as the function needs on input, it can be applied
    again to concatenated partial results to merge them (a sum of sums).

    Args:
        chunk_to_aggregate: DataFrame with the columns ``period``, ``weekday``,
            ``station_complex_id``, ``station_complex`` and ``ridership``.
            It is not modified.

    Returns:
        A new DataFrame with one row per key combination and the columns
        ``period``, ``weekday``, ``station_complex_id``, ``station_complex``
        and ``ridership``.
    """
    group_keys = ['period', 'weekday', 'station_complex_id']

    # groupby/agg builds a brand-new frame and never mutates its input, so the
    # input does not need a defensive deep copy first.
    return (
        chunk_to_aggregate
        .groupby(group_keys)
        .agg({
            'station_complex': 'first',
            'ridership': 'sum',
        })
        .reset_index()
    )

In [75]:
sampled_chunks = []

# Stream the CSV in chunks of `chunk_size` rows so the full file never has to
# fit in memory. The reader is used as a context manager so the underlying
# file handle is closed even when processing a chunk raises.
with pd.read_csv(
    input_csv_path,
    chunksize=chunk_size,
    parse_dates=['transit_timestamp'],
    date_format='%m/%d/%Y %I:%M:%S %p',
) as chunk_iter:
    for chunk in chunk_iter:
        preprocessed_chunk = preprocess_chunk(chunk)
        grouped_chunk = aggregate_chunk(preprocessed_chunk)

        # Chunks that lie entirely outside the sampling window aggregate to an
        # empty frame; keep only the partial results that carry data.
        if not grouped_chunk.empty:
            sampled_chunks.append(grouped_chunk)

In [78]:
# pd.concat fails with an opaque "No objects to concatenate" when no chunk
# produced any rows; report the actual cause instead (same exception type).
if not sampled_chunks:
    raise ValueError(
        f"No rows between {sample_start_date} and {sample_end_date} "
        f"were found in {input_csv_path}"
    )

# Every chunk was aggregated on its own, so the same (period, weekday, station)
# key can occur in several partial results; aggregate once more to merge them.
partial_totals = pd.concat(sampled_chunks, ignore_index=True)
df = aggregate_chunk(partial_totals)

df.to_csv(output_csv_path, index=False)