In [15]:
import pandas as pd
from pycaret.regression import *
from datetime import datetime, timedelta

In [16]:
input_csv_path = 'data/NY_HOURLY_RIDERSHIP_2022.csv'
# output_csv_path = 'data/NY_AGGREGATED_RIDERSHIP_2022.csv'

In [17]:
chunk_size = 50000
base_date = datetime.strptime('2023-01-01', '%Y-%m-%d')
intervals = {
    # 'one_day': (base_date, base_date + timedelta(days=1)),
    # 'one_week': (base_date, base_date + timedelta(weeks=1)),
    # 'one_month': (base_date, base_date + timedelta(days=30)),
    # 'two_months': (base_date, base_date + timedelta(days=30*2)),
    # 'three_months': (base_date, base_date + timedelta(days=30*3)),
    'five_months': (base_date, base_date + timedelta(days=30*5))
}

In [18]:
def map_hour_to_period(hour):
    if 1 <= hour <= 6:
        return 1
    elif 7 <= hour <= 12:
        return 2
    elif 13 <= hour <= 18:
        return 3
    elif 19 <= hour <= 24 or hour == 0:
        return 4

In [19]:
def process_interval(start_date, end_date):
    iterator = pd.read_csv(input_csv_path, chunksize=chunk_size, parse_dates=['transit_timestamp'], date_format='%Y-%m-%d %H:%M:%S')
    
    chunks_to_save = []

    for current_chunk in iterator:
        current_chunk['transit_timestamp'] = pd.to_datetime(current_chunk['transit_timestamp'], format='%m/%d/%Y %I:%M:%S %p', errors='raise')
        
        chunk_being_processed = current_chunk[(current_chunk['transit_timestamp'] >= start_date) & (current_chunk['transit_timestamp'] < end_date)]
        chunk_being_processed['date'] = chunk_being_processed['transit_timestamp'].dt.date
        chunk_being_processed['hour'] = chunk_being_processed['transit_timestamp'].dt.hour
        chunk_being_processed['weekday'] = chunk_being_processed['transit_timestamp'].dt.weekday
        chunk_being_processed['period'] = chunk_being_processed['hour'].apply(map_hour_to_period)
        
        chunk_being_grouped = chunk_being_processed.groupby(['period', 'weekday', 'station_complex_id']).agg({
            'ridership': 'mean',
            'latitude': 'first',
            'longitude': 'first'
        }).reset_index()

        if not chunk_being_grouped.empty:
            chunks_to_save.append(chunk_being_grouped)

    dataframe = pd.concat(chunks_to_save)
    dataframe = dataframe.groupby(['period', 'weekday', 'station_complex_id']).agg({
        'ridership': 'mean',
        'latitude': 'first',
        'longitude': 'first'
    }).reset_index()
    
    return dataframe

In [20]:
all_intervals_df = []
for label, (start, end) in intervals.items():
    print(f"Initiating processing of interval {start} to {end}")
    
    interval_df = process_interval(start, end)
    
    all_intervals_df.append(interval_df)

final_df = pd.concat(all_intervals_df)

Initiating processing of interval 2023-01-01 00:00:00 to 2023-05-31 00:00:00


In [21]:
final_df = final_df.reset_index()

final_df['station_complex_id'] = final_df['station_complex_id'].astype(str)
final_df = final_df.drop('index', axis=1)

In [22]:
setup(data=final_df, target='ridership', session_id=123, train_size=0.8, categorical_features=['station_complex_id'], normalize=True)

best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,ridership
2,Target type,Regression
3,Original data shape,"(12043, 6)"
4,Transformed data shape,"(12043, 6)"
5,Transformed train set shape,"(9634, 6)"
6,Transformed test set shape,"(2409, 6)"
7,Numeric features,4
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,5.0442,202.7457,13.9343,0.9481,0.205,0.1616,0.913
rf,Random Forest Regressor,5.0777,212.1217,14.2631,0.946,0.1935,0.1521,1.232
lightgbm,Light Gradient Boosting Machine,7.0674,303.3005,17.122,0.9227,0.2803,0.2552,0.223
dt,Decision Tree Regressor,6.1744,346.182,17.8994,0.9101,0.2391,0.1773,0.049
gbr,Gradient Boosting Regressor,10.4501,477.5274,21.6953,0.8743,0.4563,0.4586,0.359
knn,K Neighbors Regressor,9.74,536.7134,22.8059,0.8628,0.3487,0.3321,0.055
lar,Least Angle Regression,22.2638,1709.0173,41.1415,0.5467,0.8565,1.321,0.039
br,Bayesian Ridge,22.2622,1709.0289,41.1415,0.5467,0.8564,1.3209,0.037
ridge,Ridge Regression,22.2633,1709.0206,41.1415,0.5467,0.8565,1.3209,0.038
lr,Linear Regression,22.2638,1709.0173,41.1415,0.5467,0.8565,1.321,0.755


In [23]:
# tuned_model = tune_model(best_model)

In [24]:
final_model = finalize_model(best_model)
predictions = predict_model(final_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,5.8135,279.7224,16.7249,0.9473,0.2312,0.1938


In [25]:
evaluate_model(final_model)

metrics = pull()
print(metrics)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

                   Model     MAE       MSE     RMSE      R2   RMSLE    MAPE
0  Extra Trees Regressor  5.8135  279.7224  16.7249  0.9473  0.2312  0.1938


In [26]:
test_base_date = datetime.strptime('2023-06-01', '%Y-%m-%d')
test_intervals = {
    # 'one_day': (test_base_date, test_base_date + timedelta(days=1)),
    # 'one_week': (test_base_date, test_base_date + timedelta(weeks=1)),
    # 'one_month': (test_base_date, test_base_date + timedelta(days=30)),
    # 'two_months': (test_base_date, test_base_date + timedelta(days=30*2)),
    # 'three_months': (test_base_date, test_base_date + timedelta(days=30*3)),
    'five_months': (test_base_date, test_base_date + timedelta(days=30*5))
}

In [28]:
all_intervals_df = []
for label, (start, end) in test_intervals.items():
    print(f"Initiating processing of interval {start} to {end}")
    
    interval_df = process_interval(start, end)
    
    all_intervals_df.append(interval_df)

test_df = pd.concat(all_intervals_df)

Initiating processing of interval 2023-06-01 00:00:00 to 2023-10-29 00:00:00


In [29]:
test_df = test_df.reset_index()

test_df['station_complex_id'] = test_df['station_complex_id'].astype(str)
test_df = test_df.drop('index', axis=1)

In [30]:
new_predictions = predict_model(final_model, data=test_df)

print(new_predictions.tail())

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,10.5438,634.861,25.1964,0.8404,0.379,0.3655


       period  weekday station_complex_id   latitude  longitude  ridership  \
23905       4        6                 97  40.697208 -73.935654   8.279368   
23906       4        6                 98  40.700260 -73.941124   8.192062   
23907       4        6                 99  40.703869 -73.947411   6.384952   
23908       4        6              TRAM1  40.761337 -73.964157  20.083488   
23909       4        6              TRAM2  40.757339 -73.954117  15.183368   

       prediction_label  
23905          9.470105  
23906         21.131268  
23907          7.489213  
23908         23.852886  
23909         18.837566  


In [32]:
print(intervals)

{'five_months': (datetime.datetime(2023, 1, 1, 0, 0), datetime.datetime(2023, 5, 31, 0, 0))}
