In [83]:
import os
import re
import pandas as pd
import pickle
from data_parser import CTATrainDataParser
from datetime import timedelta


In [108]:
data_parser = CTATrainDataParser()

# check if pickle file already exists
# Read all files inside sample_data directory
files = os.listdir('sample_data')
days = {}
for file in files:
    print("Parsing file:", file)
    if file.endswith('.csv'):
        file_path = os.path.join('sample_data', file)
        this_day = data_parser.parse_file(file_path)
        # Store the data for each day
        days[file] = this_day



Parsing file: output_2025-04-12.csv
Parsing file: output_2025-04-09.csv
Parsing file: output_2025-04-06.csv
Parsing file: output_2025-04-05.csv
Parsing file: output_2025-04-02.csv
Parsing file: output_2025-04-08.csv
Parsing file: output_2025-04-13.csv
Parsing file: cta_train_data.pkl
Parsing file: cta_train_noyes.pkl
Parsing file: output_2025-04-01.csv
Parsing file: output_2025-04-11.csv
Parsing file: output_2024-06-03.csv
Parsing file: output_2025-04-07.csv
Parsing file: output_2025-04-04.csv
Parsing file: output_2025-04-10.csv


In [96]:
rows = []
for day, data in days.items():
    for rec in data:
        snap_ts = rec["timestamp"].replace(tzinfo=None)
        for eta in rec["eta_entries"]:
            rows.append({
                "station_id":      eta["station_id"],
                "station_name":    eta["station_name"],
                "stop_description":eta["stop_description"],     # stpDe
                "destination_name":eta["destination_name"],     # destNm
                "is_scheduled":    eta["is_scheduled"],
                "pred_arr":        eta["arrival_time"],
                "run_number":     eta["run_number"],          # run
                "snap_ts":         snap_ts
            })
df = pd.DataFrame(rows).sort_values(["station_id", "pred_arr", "snap_ts"])

In [105]:
noyes_rows = []
for day, data in days.items():
    for rec in data:
        snap_ts = rec["timestamp"].replace(tzinfo=None)

        if rec["station_id"] != "40400":
            continue

        noyes_rows.append((rec["eta_entries"], snap_ts))

# sort the noyes_rows by snap_ts
noyes_rows.sort(key=lambda x: x[1])
print("Number of noyes rows:", len(noyes_rows))

# Save the DataFrame to a pickle file
pickle_file = 'sample_data/cta_train_data.pkl'
with open(pickle_file, 'wb') as f:
    pickle.dump(df, f)
print(f"Data saved to {pickle_file}")
# Save the noyes_rows to a pickle file
noyes_pickle_file = 'sample_data/cta_train_noyes.pkl'
with open(noyes_pickle_file, 'wb') as f:
    pickle.dump(noyes_rows, f)
print(f"Noyes data saved to {noyes_pickle_file}")

Number of noyes rows: 1799
Data saved to sample_data/cta_train_data.pkl
Noyes data saved to sample_data/cta_train_noyes.pkl
