# Transportation Data Cleaning Notebook
This notebook cleans the ridership data for Philadelphia and Chicago as part of the urban transportation agency project.

In [161]:
import pandas as pd
from rdflib import Graph
import os

## 1. Cleaning Philadelphia Ridership by Mode

In [197]:
# Load the raw Philadelphia ridership-by-mode export.
# The path has its own name so `df_philly_mode` only ever refers to a DataFrame.
philly_mode_path = R"../data/raw/Average_Daily_Ridership_By_Mode - City of Philadelphia.csv"
df_philly_mode = pd.read_csv(philly_mode_path)

# Inspection
print("--- Philly Mode Info ---")
# info() prints its own report and returns None, so it is not wrapped in print()
df_philly_mode.info()
display(df_philly_mode.head())

# Cleaning: drop exact duplicate rows, then forward-fill any missing values.
# DataFrame.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
df_philly_mode = df_philly_mode.drop_duplicates().ffill()

--- Philly Mode Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Calendar_Year            492 non-null    int64 
 1   Calendar_Month           492 non-null    int64 
 2   Mode                     492 non-null    object
 3   Average_Daily_Ridership  492 non-null    int64 
 4   Source                   492 non-null    object
 5   ObjectId                 492 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 23.2+ KB
None


Unnamed: 0,Calendar_Year,Calendar_Month,Mode,Average_Daily_Ridership,Source,ObjectId
0,2019,1,Bus,459160,APC,1
1,2019,1,CCT,4294,Revenue,2
2,2019,1,Heavy Rail,296709,Revenue,3
3,2019,1,Regional Rail,122856,Revenue,4
4,2019,1,Trackless Trolley,16853,APC,5


  df_philly_mode = df_philly_mode.fillna(method='ffill')  # Forward fill any missing values


In [198]:
# Tag every row with its city, drop the bookkeeping columns, and rename the
# remaining columns to the shared schema. The chain builds a new frame instead
# of mutating in place, and errors='ignore' keeps the cell re-runnable once
# 'Source' / 'ObjectId' have already been removed.
# NOTE(review): 'average_monthly_ridership' holds the source's
# Average_Daily_Ridership value for each month; the name is kept unchanged so
# the saved CSV schema stays the same.
df_philly_mode = (
    df_philly_mode
    .assign(city='Philadelphia')
    .drop(columns=['Source', 'ObjectId'], errors='ignore')
    .rename(columns={
        'Calendar_Year': 'year',
        'Calendar_Month': 'month',
        'Average_Daily_Ridership': 'average_monthly_ridership',
        'Mode': 'mode',
    })
)
df_philly_mode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   year                       492 non-null    int64 
 1   month                      492 non-null    int64 
 2   mode                       492 non-null    object
 3   average_monthly_ridership  492 non-null    int64 
 4   city                       492 non-null    object
dtypes: int64(3), object(2)
memory usage: 19.3+ KB


In [199]:
# Preview the first rows of the Philadelphia mode table
df_philly_mode.head()

Unnamed: 0,year,month,mode,average_monthly_ridership,city
0,2019,1,Bus,459160,Philadelphia
1,2019,1,CCT,4294,Philadelphia
2,2019,1,Heavy Rail,296709,Philadelphia
3,2019,1,Regional Rail,122856,Philadelphia
4,2019,1,Trackless Trolley,16853,Philadelphia


In [201]:
# Map the Philadelphia mode labels onto the labels used for the Chicago data
# ('bus' and 'rail_boardings'). Both rail services share one label.
df_philly_mode = df_philly_mode.assign(
    mode=df_philly_mode['mode'].replace({
        'Heavy Rail': 'rail_boardings',
        'Regional Rail': 'rail_boardings',
        'Bus': 'bus'
    })
)

# Keep only the two modes that have a Chicago counterpart
df_philly_mode = df_philly_mode[df_philly_mode["mode"].isin(["bus", "rail_boardings"])]

# 'Heavy Rail' and 'Regional Rail' now carry the same label, so each month has
# two 'rail_boardings' rows. Sum them so there is exactly one row per
# year / month / mode / city, then restore the original column order.
df_philly_mode = (
    df_philly_mode
    .groupby(['year', 'month', 'mode', 'city'], as_index=False)
    .agg(average_monthly_ridership=('average_monthly_ridership', 'sum'))
    [['year', 'month', 'mode', 'average_monthly_ridership', 'city']]
)

In [202]:
# Write the cleaned Philadelphia mode table to the processed-data folder,
# leaving the DataFrame index out of the file.
philly_mode_out = R"../data/processed/Cleaned_Philly_Ridership_Mode.csv"
df_philly_mode.to_csv(philly_mode_out, index=False)
print("Saved: Cleaned_Philly_Ridership_Mode.csv")

Saved: Cleaned_Philly_Ridership_Mode.csv


## 2. Cleaning Philadelphia Ridership by Route

In [164]:
# Load the raw Philadelphia ridership-by-route export.
# The path has its own name so `df_philly_route` only ever refers to a DataFrame.
philly_route_path = R"../data/raw/Average_Daily_Ridership_By_Route - City of Philadelphia.csv"
df_philly_route = pd.read_csv(philly_route_path)

# Inspection
print("--- Philly Route Info ---")
# info() prints its own report and returns None, so it is not wrapped in print()
df_philly_route.info()
display(df_philly_route.head())

# Cleaning: drop exact duplicate rows, then forward-fill any missing values.
# DataFrame.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
df_philly_route = df_philly_route.drop_duplicates().ffill()

--- Philly Route Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10994 entries, 0 to 10993
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Calendar_Year            10994 non-null  int64 
 1   Calendar_Month           10994 non-null  int64 
 2   Route                    10994 non-null  object
 3   Average_Daily_Ridership  10994 non-null  int64 
 4   Source                   10994 non-null  object
 5   ObjectId                 10994 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 515.5+ KB
None


Unnamed: 0,Calendar_Year,Calendar_Month,Route,Average_Daily_Ridership,Source,ObjectId
0,2019,8,59,3811,APC,2
1,2019,8,60,8550,APC,3
2,2019,8,61,3563,APC,4
3,2019,8,62,226,APC,5
4,2019,8,64,4335,APC,6


  df_philly_route = df_philly_route.fillna(method='ffill')


In [165]:
# Tag every row with its city, rename the columns to the shared schema, and
# drop the bookkeeping columns. The chain builds a new frame instead of
# mutating in place, and errors='ignore' keeps the cell re-runnable once
# 'Source' / 'ObjectId' have already been removed.
df_philly_route = (
    df_philly_route
    .assign(city='Philadelphia')
    .rename(columns={
        'Calendar_Year': 'year',
        'Calendar_Month': 'month',
        'Average_Daily_Ridership': 'riders',
        'Route': 'route',
    })
    .drop(columns=['Source', 'ObjectId'], errors='ignore')
)

df_philly_route.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10994 entries, 0 to 10993
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   year    10994 non-null  int64 
 1   month   10994 non-null  int64 
 2   route   10994 non-null  object
 3   riders  10994 non-null  int64 
 4   city    10994 non-null  object
dtypes: int64(3), object(2)
memory usage: 429.6+ KB


In [166]:
# Write the cleaned Philadelphia route table to the processed-data folder,
# leaving the DataFrame index out of the file.
philly_route_out = R"../data/processed/Cleaned_Philly_Ridership_Route.csv"
df_philly_route.to_csv(philly_route_out, index=False)
print("Saved: Cleaned_Philly_Ridership_Route.csv")

Saved: Cleaned_Philly_Ridership_Route.csv


## 3. Processing Chicago Ridership Data
Parsing 'service_date' as a date and deriving `year` and `month` columns from it.

In [180]:
# Load the Chicago Excel file
# Note: Ensure the file extension matches your actual file (e.g., .xlsx)
# The path has its own name so `df_chicago_mode` only ever refers to a DataFrame.
chicago_mode_path = R"../data/raw/cta-ridership-daily-boarding-totals-20260203-69820a3f9df63091665572.xlsx"
df_chicago_mode = pd.read_excel(chicago_mode_path)

# Inspection
print("--- Chicago Data Info ---")
# info() prints its own report and returns None, so it is not wrapped in print()
df_chicago_mode.info()
display(df_chicago_mode.head())

# Transformation: parse service_date once and strip any time-of-day part.
# The column stays datetime64 (rather than object-dtype Python dates) so the
# later .dt-based filtering works on it directly.
service_dates = pd.to_datetime(df_chicago_mode['service_date']).dt.normalize()

# Arrange columns; assign() builds a new frame, so nothing is written onto a
# column-subset view of the loaded data.
modify_columns = ['service_date', 'year', 'month', 'day_type', 'bus', 'rail_boardings', 'total_rides']
df_chicago_mode = df_chicago_mode.assign(
    service_date=service_dates,
    year=service_dates.dt.year,
    month=service_dates.dt.month,
)[modify_columns]

--- Chicago Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9100 entries, 0 to 9099
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   service_date    9100 non-null   datetime64[ns]
 1   day_type        9100 non-null   object        
 2   bus             9100 non-null   int64         
 3   rail_boardings  9100 non-null   int64         
 4   total_rides     9100 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 355.6+ KB
None


Unnamed: 0,service_date,day_type,bus,rail_boardings,total_rides
0,2001-01-01,U,297192,126455,423647
1,2001-01-02,W,780827,501952,1282779
2,2001-01-03,W,824923,536432,1361355
3,2001-01-04,W,870021,550011,1420032
4,2001-01-05,W,890426,557917,1448343


In [181]:
# Re-parse service_date (values that cannot be parsed become NaT) and keep only
# rows dated 2019 or later. NaT rows fail the year comparison, so the same mask
# removes them as well.
parsed_service_dates = pd.to_datetime(df_chicago_mode["service_date"], errors="coerce")
df_chicago_mode["service_date"] = parsed_service_dates
is_2019_or_later = parsed_service_dates.dt.year >= 2019
df_chicago_mode = df_chicago_mode[is_2019_or_later]
df_chicago_mode.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2526 entries, 6574 to 9099
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   service_date    2526 non-null   datetime64[ns]
 1   year            2526 non-null   int32         
 2   month           2526 non-null   int32         
 3   day_type        2526 non-null   object        
 4   bus             2526 non-null   int64         
 5   rail_boardings  2526 non-null   int64         
 6   total_rides     2526 non-null   int64         
dtypes: datetime64[ns](1), int32(2), int64(3), object(1)
memory usage: 138.1+ KB


In [182]:
# Clean column names (removes hidden spaces)
df_chicago_mode.columns = df_chicago_mode.columns.str.strip()

# Ensure service_date is datetime
df_chicago_mode["service_date"] = pd.to_datetime(df_chicago_mode["service_date"], errors="coerce")

cols = set(df_chicago_mode.columns)

# CASE 1: Data is still WIDE (has bus + rail_boardings) -> melt it into one
# row per date and mode.
if {"bus", "rail_boardings"}.issubset(cols):

    # Pick a value column name that doesn't already exist
    ridership_col = "ridership" if "ridership" not in cols else "ridership_value"

    df_chicago_mode = pd.melt(
        df_chicago_mode,
        id_vars=["service_date", "day_type", "year", "month"],
        value_vars=["bus", "rail_boardings"],
        var_name="Mode",
        value_name=ridership_col
    )

# CASE 2: Data is already LONG (already has Mode + ridership) -> no melt needed
elif {"Mode", "ridership"}.issubset(cols):
    ridership_col = "ridership"

# Otherwise: unexpected schema
else:
    raise ValueError(f"Unexpected columns: {df_chicago_mode.columns.tolist()}")

# Mean of the daily values within each year/month, per mode, then tag the city
# and lower-case the mode column name for consistency. The chain builds a new
# frame, so no in-place rename is needed.
df_chicago_mode = (
    df_chicago_mode
    .groupby(["year", "month", "Mode"], as_index=False)
    .agg(average_monthly_ridership=(ridership_col, "mean"))
    .assign(city='Chicago')
    .rename(columns={'Mode': 'mode'})
)

display(df_chicago_mode.head())
# info() prints its own report and returns None, so it is not wrapped in print()
df_chicago_mode.info()


   year  month            mode  average_monthly_ridership     city
0  2019      1             bus              563028.000000  Chicago
1  2019      1  rail_boardings              518349.838710  Chicago
2  2019      2             bus              652809.857143  Chicago
3  2019      2  rail_boardings              584176.464286  Chicago
4  2019      3             bus              653778.935484  Chicago
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166 entries, 0 to 165
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   year                       166 non-null    int32  
 1   month                      166 non-null    int32  
 2   mode                       166 non-null    object 
 3   average_monthly_ridership  166 non-null    float64
 4   city                       166 non-null    object 
dtypes: float64(1), int32(2), object(2)
memory usage: 5.3+ KB
None


In [183]:
# Save the cleaned and transformed data (DataFrame index left out of the file)
df_chicago_mode.to_csv(R"../data/processed/Cleaned_Chicago_Ridership_Mode.csv", index=False)
# Confirmation message, matching the other save cells in this notebook
print("Saved: Cleaned_Chicago_Ridership_Mode.csv")

## 4. CTA RDF → CSV (Chicago)

In [171]:
# daily RDF to DataFrame
def rdf_daily_to_df(file_path):
    """Parse one RDF/XML file into a DataFrame with one row per RDF subject.

    Every triple is grouped by its subject. The last '/'-separated segment of
    the predicate selects the output column: 'route', 'date', 'daytype', and
    'rides' (stored as 'ridership'). Triples with any other predicate are
    ignored.

    Parameters
    ----------
    file_path : str
        Path of the RDF file, parsed with format="xml".

    Returns
    -------
    pandas.DataFrame
        Columns 'subject', 'city' (always "Chicago"), 'route', 'date',
        'daytype', 'ridership'. 'date' is parsed with pd.to_datetime and
        'ridership' with pd.to_numeric; values that cannot be parsed become
        NaT / NaN. All six columns are present even when the file contains no
        matching triples (or no triples at all).
    """
    graph = Graph()
    graph.parse(file_path, format="xml")

    # Last path segment of the predicate -> output column name
    predicate_to_column = {
        "route": "route",
        "date": "date",
        "daytype": "daytype",
        "rides": "ridership",
    }

    rows = {}
    for subj, pred, obj in graph:
        subject = str(subj)
        # The first triple seen for a subject creates its record
        record = rows.setdefault(subject, {"subject": subject, "city": "Chicago"})

        column = predicate_to_column.get(str(pred).split("/")[-1])
        if column is not None:
            record[column] = str(obj)

    df = pd.DataFrame(list(rows.values()))

    # Guarantee the full schema so the conversions below (and callers that
    # select these columns) do not hit a KeyError on sparse or empty files.
    for column in ("subject", "city", "route", "date", "daytype", "ridership"):
        if column not in df.columns:
            df[column] = None

    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["ridership"] = pd.to_numeric(df["ridership"], errors="coerce")

    return df

# Process all RDF files in the directory.
# os.listdir() returns names in arbitrary order, so sort them to make the row
# order of the combined frame reproducible.
rdf_dir = R"../data/raw/RDF_CTA/"
rdf_files = sorted(name for name in os.listdir(rdf_dir) if name.endswith(".rdf"))

# Fail with a clear message instead of pandas' "No objects to concatenate"
if not rdf_files:
    raise FileNotFoundError(f"No .rdf files found in {rdf_dir}")

dfs = []
for file in rdf_files:
    print(f"Reading: {file}")
    dfs.append(rdf_daily_to_df(os.path.join(rdf_dir, file)))

# Combine all daily DataFrames into one
df_chicago_route = pd.concat(dfs, ignore_index=True)

Reading: rdf_CTA__Ridership__Daily_by_Route_routes_1.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_10.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_11.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_12.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_13.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_14.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_15.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_16.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_17.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_18.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_19.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_2.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_3.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_4.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_5.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_6.rdf
Reading: rdf_CTA__Ridership__Daily_by_Route_routes_7.rdf
Reading: rdf_CTA__Rid

In [172]:
# Transformation: make sure 'date' is datetime (unparseable values become NaT)
route_dates = pd.to_datetime(df_chicago_route['date'], errors="coerce")

# Keep only data from 2019 onwards. NaT rows fail the comparison, so invalid
# dates are dropped by the same mask. .copy() makes the filtered frame
# independent, so the column assignments below do not write onto a slice.
df_chicago_route = (
    df_chicago_route
    .assign(date=route_dates)
    .loc[route_dates.dt.year >= 2019]
    .copy()
)

# Derive year/month after filtering: with no NaT left they stay integer columns
df_chicago_route['year'] = df_chicago_route['date'].dt.year
df_chicago_route['month'] = df_chicago_route['date'].dt.month

# Arrange columns
modify_columns = ['subject', 'city', 'date', 'year', 'month', 'daytype', 'route', 'ridership']
df_chicago_route = df_chicago_route[modify_columns]

In [173]:
# Preview the first rows of the Chicago route table
df_chicago_route.head()

Unnamed: 0,subject,city,date,year,month,daytype,route,ridership
0,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,2020-09-27,2020,9,U,103,400
7,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,2023-04-26,2023,4,W,1001,21
16,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,2019-02-16,2019,2,A,1001,1057
24,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,2023-03-03,2023,3,W,100,293
30,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,2022-08-01,2022,8,W,100,409


In [174]:
# Print the column dtypes and non-null counts of the Chicago route table
df_chicago_route.info()

<class 'pandas.core.frame.DataFrame'>
Index: 288929 entries, 0 to 1092467
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   subject    288929 non-null  object        
 1   city       288929 non-null  object        
 2   date       288929 non-null  datetime64[ns]
 3   year       288929 non-null  int32         
 4   month      288929 non-null  int32         
 5   daytype    288929 non-null  object        
 6   route      288929 non-null  object        
 7   ridership  288929 non-null  int64         
dtypes: datetime64[ns](1), int32(2), int64(1), object(4)
memory usage: 17.6+ MB


In [175]:
# Rides daily to Rides monthly: mean of the daily ridership values per route
# within each year/month. Named aggregation writes the result straight into
# 'riders' (the column name used for the Philadelphia route table), so no
# in-place rename is needed. The city column is re-added because groupby keeps
# only the key and aggregated columns.
df_chicago_route = (
    df_chicago_route
    .groupby(['year', 'month', 'route'], as_index=False)
    .agg(riders=('ridership', 'mean'))
    .assign(city='Chicago')
)

# Only a cell's last expression is displayed automatically, so the preview
# needs an explicit display() call to show up.
display(df_chicago_route.head(15))
df_chicago_route.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10465 entries, 0 to 10464
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    10465 non-null  int32  
 1   month   10465 non-null  int32  
 2   route   10465 non-null  object 
 3   riders  10465 non-null  float64
 4   city    10465 non-null  object 
dtypes: float64(1), int32(2), object(2)
memory usage: 327.2+ KB


In [176]:
# Write the cleaned Chicago route table to the processed-data folder,
# leaving the DataFrame index out of the file.
chicago_route_out = R"../data/processed/Cleaned_Chicago_Ridership_Route.csv"
df_chicago_route.to_csv(chicago_route_out, index=False)
print("Saved: Cleaned_Chicago_Ridership_Route.csv")

Saved: Cleaned_Chicago_Ridership_Route.csv
