In [6]:
import pandas as pd
import os

# Standard output schema: the columns (and their order) that each cleaned
# DataFrame below is reduced to before it is written to disk.
FINAL_COLUMNS = [
    'Date',
    'City',
    'Route',
    'Mode',
    'DayType',
    'Ridership',
]

# ---------------------------------------------------------
# STORY 2.3.1 — Philadelphia By Mode
# ---------------------------------------------------------
print("Processing Philadelphia Mode data...")
septa_mode_raw = pd.read_csv(R"../data/raw/Average_Daily_Ridership_By_Mode - City of Philadelphia.csv")

# 1. Add City identity
septa_mode_raw['City'] = 'Philadelphia'

# 2. Add 'Total' to the Route column (since this is by mode, not specific route)
septa_mode_raw['Route'] = 'Total'

# 3. Create a clean Date column (YYYY-MM-01)
#    Built with vectorized string operations instead of a row-wise apply():
#    same text for every row, but faster, and it still yields a proper
#    (empty) column when the input has a header and no data rows.
septa_mode_raw['Date'] = (
    septa_mode_raw['Calendar_Year'].astype(int).astype(str)
    + '-'
    + septa_mode_raw['Calendar_Month'].astype(int).astype(str).str.zfill(2)
    + '-01'
)

# 4. Standardize DayType and rename Ridership
# NOTE(review): every row is labelled 'W' unconditionally; this assumes the
# source file only contains weekday averages -- confirm against the raw data.
septa_mode_raw['DayType'] = 'W' # Standard Weekday Average
septa_mode_raw = septa_mode_raw.rename(columns={'Average_Daily_Ridership': 'Ridership'})

# 5. Filter for final schema
#    Selecting by FINAL_COLUMNS raises KeyError if the raw file lacks any of
#    the expected columns (e.g. 'Mode'), which surfaces schema drift early.
septa_mode_clean = septa_mode_raw[FINAL_COLUMNS]

# Make sure the output directory exists so to_csv() does not fail on a fresh checkout.
os.makedirs("../data/processed", exist_ok=True)
septa_mode_clean.to_csv(R"../data/processed/septa_ridership_mode_clean.csv", index=False)


# ---------------------------------------------------------
# STORY 2.3.2 — Philadelphia By Route
# ---------------------------------------------------------
print("Processing Philadelphia Route data...")
septa_route_raw = pd.read_csv(R"../data/raw/Average_Daily_Ridership_By_Route - City of Philadelphia.csv")

# 1. Add City and Mode identity
septa_route_raw['City'] = 'Philadelphia'
# NOTE(review): 'Bus' is assigned to every row, so any non-bus route present
# in the raw file would be mislabelled -- confirm this is acceptable.
septa_route_raw['Mode'] = 'Bus'  # Numbered routes in this dataset are predominantly Bus

# 2. Create clean Date (YYYY-MM-01)
#    Built with vectorized string operations instead of a row-wise apply():
#    same text for every row, but faster, and it still yields a proper
#    (empty) column when the input has a header and no data rows.
septa_route_raw['Date'] = (
    septa_route_raw['Calendar_Year'].astype(int).astype(str)
    + '-'
    + septa_route_raw['Calendar_Month'].astype(int).astype(str).str.zfill(2)
    + '-01'
)

# 3. Standardize DayType and rename Ridership
# NOTE(review): every row is labelled 'W' unconditionally; this assumes the
# source file only contains weekday averages -- confirm against the raw data.
septa_route_raw['DayType'] = 'W'
septa_route_raw = septa_route_raw.rename(columns={'Average_Daily_Ridership': 'Ridership'})

# 4. Filter for final schema
#    Selecting by FINAL_COLUMNS raises KeyError if the raw file lacks any of
#    the expected columns (e.g. 'Route'), which surfaces schema drift early.
septa_route_clean = septa_route_raw[FINAL_COLUMNS]

# Make sure the output directory exists so to_csv() does not fail on a fresh checkout.
os.makedirs("../data/processed", exist_ok=True)
septa_route_clean.to_csv(R"../data/processed/septa_ridership_route_clean.csv", index=False)

print("\n✅ Success! Two cleaned files created:")
print("- septa_ridership_mode_clean.csv")
print("- septa_ridership_route_clean.csv")

Processing Philadelphia Mode data...
Processing Philadelphia Route data...

✅ Success! Two cleaned files created:
- septa_ridership_mode_clean.csv
- septa_ridership_route_clean.csv
