## Step 2: Feature matrix construction

**Project**: Predicting Bus Ridership \
**Team**: T20 - Eurus Li, Garrett Kent, Kate Munkacsy

### A. Initial setup

In [143]:
## UPDATE BEFORE RUNNING CODE
# Root folder of the local repository; later cells append '/Data/<file>.csv' to it.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of the notebook.
dir = "path/to/repo/"

In [144]:
import pandas as pd
# Allow up to 100 rows in displayed output before pandas truncates it
pd.set_option('display.max_rows', 100)

### B. Construct new features for bus ridership data

In [146]:
# Load the ridership CSV and preview the first 10 rows
data = pd.read_csv(dir + '/Data/ridership.csv')
data.head(10)

Unnamed: 0,_id,route,ridership_route_code,route_full_name,current_garage,mode,month_start,year_month,day_type,avg_riders,day_count,total_precip,avg_temp
0,1,1,1,1 - FREEPORT ROAD,Ross,Bus,2017-01-01,201701,SAT.,969.5,4,3.43,33.6
1,2,4,4,4 - TROY HILL,Ross,Bus,2017-01-01,201701,SAT.,218.25,4,3.43,33.6
2,3,6,6,6 - SPRING HILL,Ross,Bus,2017-01-01,201701,SAT.,495.5,4,3.43,33.6
3,4,8,8,8 - PERRYSVILLE,Ross,Bus,2017-01-01,201701,SAT.,1480.0,4,3.43,33.6
4,5,11,11,11 - FINEVIEW,Ross,Bus,2017-01-01,201701,SAT.,208.0,4,3.43,33.6
5,6,12,12,12 - MCKNIGHT,Ross,Bus,2017-01-01,201701,SAT.,1069.75,4,3.43,33.6
6,7,13,13,13 - BELLEVUE,Ross,Bus,2017-01-01,201701,SAT.,1252.75,4,3.43,33.6
7,8,14,14,14 - OHIO VALLEY,Ross,Bus,2017-01-01,201701,SAT.,478.25,4,3.43,33.6
8,9,15,15,15 - CHARLES,Ross,Bus,2017-01-01,201701,SAT.,672.0,4,3.43,33.6
9,10,16,16,16 - BRIGHTON,Ross,Bus,2017-01-01,201701,SAT.,2070.0,4,3.43,33.6


In [147]:
# Row/column counts, then the number of distinct values in each column
print(data.shape)
print(data.nunique())

(22317, 13)
_id                     22317
route                     108
ridership_route_code      121
route_full_name           104
current_garage              6
mode                        4
month_start                94
year_month                 94
day_type                    3
avg_riders              17356
day_count                   9
total_precip               85
avg_temp                   86
dtype: int64


In [148]:
## Drop columns that vary little or duplicate information held elsewhere
redundant_columns = [
    '_id',
    'ridership_route_code',
    'route_full_name',
    'mode',
    'month_start',
    'day_count',
]
data = data.drop(columns=redundant_columns)
data.head()

Unnamed: 0,route,current_garage,year_month,day_type,avg_riders,total_precip,avg_temp
0,1,Ross,201701,SAT.,969.5,3.43,33.6
1,4,Ross,201701,SAT.,218.25,3.43,33.6
2,6,Ross,201701,SAT.,495.5,3.43,33.6
3,8,Ross,201701,SAT.,1480.0,3.43,33.6
4,11,Ross,201701,SAT.,208.0,3.43,33.6


In [149]:
# Check column dtypes before deriving the date features
data.dtypes

route              object
current_garage     object
year_month          int64
day_type           object
avg_riders        float64
total_precip      float64
avg_temp          float64
dtype: object

In [150]:
## Split the YYYYMM code into numeric year and month columns
year_month_text = data['year_month'].astype(str)
data = data.assign(
    year_month=year_month_text,
    year=year_month_text.str.slice(0, 4).astype(int),
    month=year_month_text.str.slice(4).astype(int),
)

In [162]:
# Sanity check: list each (month, year_month) pairing with its row count
data[['month', 'year_month']].value_counts().sort_index()

month  year_month
1      201701        233
       201801        232
       201901        232
       202001        231
       202101        241
       202201        242
       202301        242
       202401        242
2      201702        234
       201802        232
       201902        235
       202002        232
       202102        241
       202202        242
       202302        242
       202402        243
3      201703        233
       201803        232
       201903        235
       202003        232
       202103        241
       202203        242
       202303        245
       202403        244
4      201704        233
       201804        232
       201904        235
       202004        229
       202104        242
       202204        242
       202304        242
       202404        243
5      201705        233
       201805        232
       201905        234
       202005        229
       202105        241
       202205        242
       202305        242
       

In [164]:
# Sanity check: list each (year, year_month) pairing with its row count
data[['year', 'year_month']].value_counts().sort_index()

year  year_month
2017  201701        233
      201702        234
      201703        233
      201704        233
      201705        233
      201706        233
      201707        234
      201708        233
      201709        233
      201710        232
      201711        232
      201712        232
2018  201801        232
      201802        232
      201803        232
      201804        232
      201805        232
      201806        233
      201807        233
      201808        232
      201809        232
      201810        232
      201811        232
      201812        232
2019  201901        232
      201902        235
      201903        235
      201904        235
      201905        234
      201906        231
      201907        232
      201908        232
      201909        232
      201910        233
      201911        231
      201912        231
2020  202001        231
      202002        232
      202003        232
      202004        229
      202005        229

In [166]:
# The combined year-month code is now redundant with year and month; remove it and preview
data = data.drop(columns='year_month')
data.head()

Unnamed: 0,route,current_garage,day_type,avg_riders,total_precip,avg_temp,year,month
0,1,Ross,SAT.,969.5,3.43,33.6,2017,1
1,4,Ross,SAT.,218.25,3.43,33.6,2017,1
2,6,Ross,SAT.,495.5,3.43,33.6,2017,1
3,8,Ross,SAT.,1480.0,3.43,33.6,2017,1
4,11,Ross,SAT.,208.0,3.43,33.6,2017,1


**Create lagged ridership feature**

In [168]:
# Step 1: Build a running month number (year * 12 + month) so that the
# calendar month immediately before a row is always "month number - 1"
data['month_number'] = data['year'] * 12 + data['month']

# Step 2: Order every route / day_type series chronologically
data = data.sort_values(['route', 'day_type', 'month_number'])

# Step 3: Look up the previous row within each route + day_type group ...
series_groups = data.groupby(['route', 'day_type'])
previous_riders = series_groups['avg_riders'].shift(1)
previous_month_number = series_groups['month_number'].shift(1)

# ... and keep it only when that row really is the preceding calendar month.
# A plain shift(1) lags by row position, so any gap in a series (months with
# no record for that route / day_type) would carry older ridership forward
# and label it as a one-month lag. Those rows are left as NaN instead.
is_consecutive_month = previous_month_number == data['month_number'] - 1
data['lagged_avg_riders'] = previous_riders.where(is_consecutive_month)

# Remove the helper column
data = data.drop(columns='month_number')

# Preview updates
data.head()

Unnamed: 0,route,current_garage,day_type,avg_riders,total_precip,avg_temp,year,month,lagged_avg_riders
0,1,Ross,SAT.,969.5,3.43,33.6,2017,1,
227,1,Ross,SAT.,1238.75,3.54,34.6,2017,2,969.5
454,1,Ross,SAT.,1178.25,1.46,40.6,2017,3,1238.75
681,1,Ross,SAT.,1285.2,5.02,39.9,2017,4,1178.25
908,1,Ross,SAT.,1235.5,3.54,57.3,2017,5,1285.2


In [170]:
# Count missing values per column now that the lagged feature exists
data.isna().sum()

route                  1
current_garage       189
day_type               0
avg_riders             0
total_precip           0
avg_temp               0
year                   0
month                  0
lagged_avg_riders    267
dtype: int64

In [174]:
# Inspect the row(s) with no route value before deciding how to handle them
data[data['route'].isna()]

Unnamed: 0,route,current_garage,day_type,avg_riders,total_precip,avg_temp,year,month,lagged_avg_riders
9695,,,WEEKDAY,232.0,2.19,58.2,2020,6,


In [176]:
## Discard the one observation that has no route value
data = data.dropna(subset=['route'])
data.head()

Unnamed: 0,route,current_garage,day_type,avg_riders,total_precip,avg_temp,year,month,lagged_avg_riders
0,1,Ross,SAT.,969.5,3.43,33.6,2017,1,
227,1,Ross,SAT.,1238.75,3.54,34.6,2017,2,969.5
454,1,Ross,SAT.,1178.25,1.46,40.6,2017,3,1238.75
681,1,Ross,SAT.,1285.2,5.02,39.9,2017,4,1178.25
908,1,Ross,SAT.,1235.5,3.54,57.3,2017,5,1285.2


In [178]:
# Re-check missing values after removing the row without a route
data.isna().sum()

route                  0
current_garage       188
day_type               0
avg_riders             0
total_precip           0
avg_temp               0
year                   0
month                  0
lagged_avg_riders    266
dtype: int64

In [180]:
# Construct season feature based on month
# Assuming month is an integer column (1 = Jan, 12 = Dec)
def get_season(month):
    """Return the season label for a calendar month number.

    Parameters
    ----------
    month : int
        Calendar month, 1 (January) through 12 (December).

    Returns
    -------
    str or None
        'Winter' (12, 1, 2), 'Spring' (3-5), 'Summer' (6-8) or 'Fall' (9-11);
        None when ``month`` matches none of these.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # No season matched: same implicit result as falling off the if/elif chain
    return None

# Label every row with the season of its month
data['season'] = data['month'].map(get_season)

data[['month', 'season']].value_counts().sort_index()

month  season
1      Winter    1895
2      Winter    1901
3      Spring    1904
4      Spring    1898
5      Spring    1894
6      Summer    1900
7      Summer    1906
8      Summer    1902
9      Fall      1896
10     Fall      1897
11     Fall      1662
12     Winter    1661
Name: count, dtype: int64

In [182]:
# Preview the frame with the new season column
data.head()

Unnamed: 0,route,current_garage,day_type,avg_riders,total_precip,avg_temp,year,month,lagged_avg_riders,season
0,1,Ross,SAT.,969.5,3.43,33.6,2017,1,,Winter
227,1,Ross,SAT.,1238.75,3.54,34.6,2017,2,969.5,Winter
454,1,Ross,SAT.,1178.25,1.46,40.6,2017,3,1238.75,Spring
681,1,Ross,SAT.,1285.2,5.02,39.9,2017,4,1178.25,Spring
908,1,Ross,SAT.,1235.5,3.54,57.3,2017,5,1285.2,Spring


### C. Incorporate bus stop usage data

In [185]:
# Load the stop usage CSV and preview the first 20 rows
stop_data = pd.read_csv(dir + '/Data/stop_usage.csv')
stop_data.head(20)

Unnamed: 0,clever_id,stop_id,stop_name,direction,routes_ser,latitude,longitude,mode,shelter,stop_type,...,route_name,serviceday,total_ons,total_offs,days,avg_ons,avg_offs,total_precip,avg_temp,ZIP
0,7858,E02110,5TH ST AT CAVIT AVE,Inbound,"69, P69",40.3858,-79.76,Bus,No Shelter,Bus Stop,...,69,Sat,12.0,0.0,4,3.0,0.0,2.45,71.9,15085
1,7858,E02110,5TH ST AT CAVIT AVE,Inbound,"69, P69",40.3858,-79.76,Bus,No Shelter,Bus Stop,...,69,Sun,14.0,0.0,6,2.333333,0.0,2.45,71.9,15085
2,7858,E02110,5TH ST AT CAVIT AVE,Inbound,"69, P69",40.3858,-79.76,Bus,No Shelter,Bus Stop,...,69,Weekday,64.0,1.0,20,3.2,0.05,2.45,71.9,15085
3,7858,E02110,5TH ST AT CAVIT AVE,Inbound,"69, P69",40.3858,-79.76,Bus,No Shelter,Bus Stop,...,P69,Weekday,39.0,0.0,20,1.95,0.0,2.45,71.9,15085
4,7858,E02110,5TH ST AT CAVIT AVE,Inbound,"69, P69",40.3858,-79.76,Bus,No Shelter,Bus Stop,...,69,Sat,11.0,0.0,4,2.75,0.0,3.52,35.4,15085
5,7858,E02110,5TH ST AT CAVIT AVE,Inbound,"69, P69",40.3858,-79.76,Bus,No Shelter,Bus Stop,...,69,Sun,12.0,1.0,5,2.4,0.2,3.52,35.4,15085
6,7858,E02110,5TH ST AT CAVIT AVE,Inbound,"69, P69",40.3858,-79.76,Bus,No Shelter,Bus Stop,...,69,Weekday,69.0,7.0,22,3.136364,0.318182,3.52,35.4,15085
7,7858,E02110,5TH ST AT CAVIT AVE,Inbound,"69, P69",40.3858,-79.76,Bus,No Shelter,Bus Stop,...,P69,Weekday,37.0,0.0,22,1.681818,0.0,3.52,35.4,15085
8,7858,E02110,5TH ST AT CAVIT AVE,Inbound,"69, P69",40.3858,-79.76,Bus,No Shelter,Bus Stop,...,69,Sat,7.0,0.0,4,1.75,0.0,5.57,73.2,15085
9,7858,E02110,5TH ST AT CAVIT AVE,Inbound,"69, P69",40.3858,-79.76,Bus,No Shelter,Bus Stop,...,69,Sun,3.0,0.0,5,0.6,0.0,5.57,73.2,15085


**Construct measure of the number of unique stops for each route**

In [190]:
# Turn the comma-separated route string on each stop record into a list
stop_data['routes_list'] = stop_data['routes_ser'].str.split(',')

# One row per (stop record, route), with whitespace trimmed from the route label
exploded = stop_data.explode('routes_list')
exploded = exploded.assign(routes_list=exploded['routes_list'].str.strip())

# Distinct route-stop combinations; stop ids are then normalised to trimmed strings
unique_pairs = (
    exploded.loc[:, ['routes_list', 'stop_id']]
    .drop_duplicates()
    .assign(stop_id=lambda pairs: pairs['stop_id'].astype(str).str.strip())
)

# Number of distinct stop ids per route, as a two-column table
route_stop_counts = (
    unique_pairs
    .groupby('routes_list')['stop_id']
    .nunique()
    .rename_axis('route')
    .reset_index(name='num_unique_stops')
)

# Preview result
route_stop_counts.head()

Unnamed: 0,route,num_unique_stops
0,1,224
1,11,61
2,12,113
3,13,137
4,14,138


**Compare routes represented in ridership data and routes represented in stop usage data**

In [193]:
# Route labels found in the stop usage table and in the ridership table, as sets
bus_routes = set(route_stop_counts['route'].tolist())
ridership_routes = set(data['route'].tolist())

In [195]:
# Routes with ridership records but no entry in the stop usage data
ridership_routes - bus_routes

{'37', '42', '68', '78', 'BLLB', 'BLSV', 'MI', 'MNT', 'MNT1'}

In [197]:
# Routes in the stop usage data that never appear in the ridership data
bus_routes - ridership_routes

set()

**Join the number of stops per route to the ridership data**

In [200]:
# Left join on 'route' keeps every ridership row; routes absent from the stop table get NaN
data = pd.merge(data, route_stop_counts, how='left', on='route')
data.tail()

Unnamed: 0,route,current_garage,day_type,avg_riders,total_precip,avg_temp,year,month,lagged_avg_riders,season,num_unique_stops
22311,Y49,West Mifflin,WEEKDAY,695.4,5.61,66.5,2024,6,707.636364,Summer,106.0
22312,Y49,West Mifflin,WEEKDAY,724.318182,2.5,72.8,2024,7,695.4,Summer,106.0
22313,Y49,West Mifflin,WEEKDAY,761.681818,3.17,76.5,2024,8,724.318182,Summer,106.0
22314,Y49,West Mifflin,WEEKDAY,724.65,5.12,73.8,2024,9,761.681818,Fall,106.0
22315,Y49,West Mifflin,WEEKDAY,731.782609,1.9,69.4,2024,10,724.65,Fall,106.0


In [202]:
# List each (route, num_unique_stops) pairing with its row count to verify the join
data[['route', 'num_unique_stops']].value_counts().sort_index()

route  num_unique_stops
1      224.0               282
11     61.0                282
12     113.0               282
13     137.0               282
14     138.0               282
15     92.0                282
16     95.0                282
17     102.0               282
18     44.0                 94
19L    85.0                 94
2      239.0               206
20     94.0                191
21     131.0               282
22     66.0                236
24     101.0               282
26     84.0                282
27     105.0               282
28X    78.0                282
29     149.0               190
31     132.0               282
36     95.0                190
38     174.0               282
39     75.0                236
4      106.0               229
40     83.0                282
41     145.0               282
43     66.0                282
44     140.0               282
48     91.0                282
51     143.0               282
51L    55.0                 94
52L    139.0   

In [204]:
# Tabulate the shelter labels used in the stop usage data
stop_data['shelter'].value_counts()

shelter
No Shelter            93549
City of Pittsburgh     6641
PAAC                   5504
Lamar                  1478
Other                   150
Envision Downtown       140
Heffner                  66
Gateway                  44
BCTA                     24
Name: count, dtype: int64

**Construct measure of the number of stops with a shelter for each route**

In [206]:
# 1. Keep only stop records whose shelter value is something other than 'No Shelter'
stop_data_with_shelter = stop_data[stop_data['shelter'] != 'No Shelter'].copy()

# 2. Split the routes_ser into a list
stop_data_with_shelter['routes_list'] = stop_data_with_shelter['routes_ser'].str.split(',')

# 3. Explode the list so each route gets its own row
exploded = stop_data_with_shelter.explode('routes_list')

# 4. Clean up whitespace in the route label, and normalise stop ids the same
#    way the total-stop count does so both measures agree on what counts as
#    "the same stop"
exploded['routes_list'] = exploded['routes_list'].str.strip()
exploded['stop_id'] = exploded['stop_id'].astype(str).str.strip()

# 5. Drop duplicate route-stop combinations
unique_route_stop_pairs = exploded[['routes_list', 'stop_id']].drop_duplicates()

# 6. Group by route and count unique sheltered stops
sheltered_counts = unique_route_stop_pairs.groupby('routes_list')['stop_id'].nunique()

# 7. A route whose stops are all unsheltered has no rows left after step 1, so
#    it would be missing here and turn into NaN after the left join. Its true
#    count is 0, so re-add every route that appears in the stop data with 0.
all_stop_routes = (
    stop_data['routes_ser']
    .str.split(',')
    .explode()
    .str.strip()
    .dropna()
    .unique()
)
sheltered_counts = sheltered_counts.reindex(sorted(all_stop_routes), fill_value=0)

# 8. Two-column table with clear names
route_stop_counts = (
    sheltered_counts
    .rename_axis('route')
    .reset_index(name='num_unique_stops_with_shelter')
)

# Preview result
route_stop_counts.head()

Unnamed: 0,route,num_unique_stops_with_shelter
0,1,17
1,11,5
2,12,11
3,13,14
4,14,11


**Join the number of sheltered stops per route to the ridership data**

In [209]:
# Left join on 'route' so every ridership row is kept and gains the sheltered-stop count
data = pd.merge(data, route_stop_counts, how='left', on='route')
data.head()

Unnamed: 0,route,current_garage,day_type,avg_riders,total_precip,avg_temp,year,month,lagged_avg_riders,season,num_unique_stops,num_unique_stops_with_shelter
0,1,Ross,SAT.,969.5,3.43,33.6,2017,1,,Winter,224.0,17.0
1,1,Ross,SAT.,1238.75,3.54,34.6,2017,2,969.5,Winter,224.0,17.0
2,1,Ross,SAT.,1178.25,1.46,40.6,2017,3,1238.75,Spring,224.0,17.0
3,1,Ross,SAT.,1285.2,5.02,39.9,2017,4,1178.25,Spring,224.0,17.0
4,1,Ross,SAT.,1235.5,3.54,57.3,2017,5,1285.2,Spring,224.0,17.0


**Construct indicator for year-months impacted by the COVID-19 pandemic**

In [212]:
# Flag observations from March 2020 through June 2021 (both inclusive).
# Encoding year and month as a single YYYYMM number turns the window into one range test.
year_month_code = data['year'] * 100 + data['month']
data['covid'] = year_month_code.between(202003, 202106).astype(int)

data['covid'].value_counts()

covid
0    18545
1     3771
Name: count, dtype: int64

In [214]:
# Preview the frame with the new covid indicator
data.head()

Unnamed: 0,route,current_garage,day_type,avg_riders,total_precip,avg_temp,year,month,lagged_avg_riders,season,num_unique_stops,num_unique_stops_with_shelter,covid
0,1,Ross,SAT.,969.5,3.43,33.6,2017,1,,Winter,224.0,17.0,0
1,1,Ross,SAT.,1238.75,3.54,34.6,2017,2,969.5,Winter,224.0,17.0,0
2,1,Ross,SAT.,1178.25,1.46,40.6,2017,3,1238.75,Spring,224.0,17.0,0
3,1,Ross,SAT.,1285.2,5.02,39.9,2017,4,1178.25,Spring,224.0,17.0,0
4,1,Ross,SAT.,1235.5,3.54,57.3,2017,5,1285.2,Spring,224.0,17.0,0


**Rename lagged weather features for clarity**

In [217]:
# The weather columns hold the previous month's total precipitation and
# average temperature, so give them names that say so
lagged_weather_names = {
    'total_precip': 'lagged_total_precip',
    'avg_temp': 'lagged_avg_temp',
}
data = data.rename(columns=lagged_weather_names)

**Update column order so that the column to predict ('avg_riders') is at the end**

In [220]:
# Move the prediction target to the last position; all other columns keep their order
col = 'avg_riders'
feature_columns = data.columns[data.columns != col]
data = data.loc[:, [*feature_columns, col]]

# Preview updates
data.head()

Unnamed: 0,route,current_garage,day_type,lagged_total_precip,lagged_avg_temp,year,month,lagged_avg_riders,season,num_unique_stops,num_unique_stops_with_shelter,covid,avg_riders
0,1,Ross,SAT.,3.43,33.6,2017,1,,Winter,224.0,17.0,0,969.5
1,1,Ross,SAT.,3.54,34.6,2017,2,969.5,Winter,224.0,17.0,0,1238.75
2,1,Ross,SAT.,1.46,40.6,2017,3,1238.75,Spring,224.0,17.0,0,1178.25
3,1,Ross,SAT.,5.02,39.9,2017,4,1178.25,Spring,224.0,17.0,0,1285.2
4,1,Ross,SAT.,3.54,57.3,2017,5,1285.2,Spring,224.0,17.0,0,1235.5


### D. Output final feature matrix

In [223]:
# Write the finished feature matrix to CSV, leaving out the DataFrame index
data.to_csv(dir + '/Data/feature_matrix.csv', index=False)