In [5]:
ls

 Volume in drive C is OS
 Volume Serial Number is 6A7C-0C86

 Directory of C:\Users\lizie\OneDrive\Desktop\CMU\25spring\ML\group project\90803ML-FinalProject

04/15/2025  12:27 AM    <DIR>          .
04/14/2025  11:45 PM    <DIR>          ..
04/15/2025  12:25 AM    <DIR>          .ipynb_checkpoints
04/14/2025  11:45 PM    <DIR>          Data
04/14/2025  11:50 PM            47,719 Data prep and baseline models.ipynb
04/14/2025  11:45 PM            26,427 EDA.ipynb
04/15/2025  12:27 AM             1,395 feature generation and model.ipynb
04/15/2025  12:17 AM            54,752 initial_data_import.ipynb
04/14/2025  11:45 PM                92 README.md
               5 File(s)        130,385 bytes
               4 Dir(s)  741,817,737,216 bytes free


In [10]:
import os
os.chdir(r"C:\Users\lizie\OneDrive\Desktop\CMU\25spring\ML\group project\90803ML-FinalProject\Data")

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [12]:
# In the following code, I'm trying XGBoost and LightGBM model for raw data along with baseline models and see which performs better

In [13]:
ridership = pd.read_csv("ridership.csv")

# Same X and y
X = ridership.drop(['avg_riders', 'route_full_name'], axis=1)
y = ridership['avg_riders']

# Drop sparse columns 
sparse_columns = list(X.columns[X.nunique() / len(X) * 100 < 0.01])
X = X.drop(sparse_columns, axis=1)

# Identify column types
int_cols = X.select_dtypes(include='int').columns.tolist()
float_cols = X.select_dtypes(include='float').columns.tolist()
cat_cols = X.select_dtypes(include='object').columns.tolist()

# Preprocessing
numeric_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_preprocessor, int_cols + float_cols),
    ('cat', categorical_preprocessor, cat_cols)
])

# Define XGBoost and LightGBM pipelines
models = {
    "Baseline (Mean)": DummyRegressor(strategy='mean'),
    "Linear Regression": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ]),
    "Random Forest": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=25, random_state=47, n_jobs=-1))
    ]),
    'XGBoost': Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42))
    ]),
    'LightGBM': Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42))
    ])
}

# Cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=cv, scoring='neg_root_mean_squared_error', n_jobs=-1)
    rmse_scores = -scores
    print(f"{name} | Mean RMSE: {rmse_scores.mean():.3f} | Std: {rmse_scores.std():.3f}")

Baseline (Mean) | Mean RMSE: 1393.617 | Std: 41.610
Linear Regression | Mean RMSE: 692.996 | Std: 22.712
Random Forest | Mean RMSE: 236.592 | Std: 12.999
XGBoost | Mean RMSE: 414.290 | Std: 10.813
LightGBM | Mean RMSE: 264.560 | Std: 13.764


In [None]:
# Seems like Random Forest behaves best, but we can still explore LightGBM further as well

In [None]:
# Then explore feature generation

In [46]:
ridership = pd.read_csv("ridership.csv")

# Same X and y
# X = ridership.drop(['_id', 'route', 'avg_riders', 'route_full_name', 'year_month'], axis=1)
X = ridership.drop(['_id','avg_riders'], axis=1)
y = ridership['avg_riders']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Drop sparse columns 
sparse_columns = list(X.columns[X.nunique() / len(X) * 100 < 0.01])
print(f"Sparse columns: {sparse_columns}")

# Identify column types
int_cols = X.select_dtypes(include='int').columns.tolist()
float_cols = X.select_dtypes(include='float').columns.tolist()
cat_cols = X.select_dtypes(include='object').columns.tolist()
print(f"Int columns: {int_cols}")
print(f"Float columns: {float_cols}")
print(f"Cat columns: {cat_cols}")

# Preprocessing
numeric_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_preprocessor, int_cols + float_cols),
    ('cat', categorical_preprocessor, cat_cols)
])


# Define XGBoost and LightGBM pipelines
models = {
    "Baseline (Mean)": DummyRegressor(strategy='mean'),
    "Linear Regression": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ]),
    "Random Forest": Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=25, random_state=47, n_jobs=8))
    ]),
    'XGBoost': Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(n_estimators=1000, max_depth=10, learning_rate=0.1, random_state=42))
    ]),
    'LightGBM': Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LGBMRegressor(n_estimators=1000, learning_rate=0.1, random_state=42))
    ])
}

# Cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=cv, scoring='neg_root_mean_squared_error', n_jobs=8)
    rmse_scores = -scores
    print(f"{name} | Mean RMSE: {rmse_scores.mean():.3f} | Std: {rmse_scores.std():.3f}")

Sparse columns: []
Int columns: ['year_month', 'day_count']
Float columns: ['total_precip', 'avg_temp']
Cat columns: ['route', 'ridership_route_code', 'route_full_name', 'current_garage', 'mode', 'month_start', 'day_type']
Baseline (Mean) | Mean RMSE: 1393.617 | Std: 41.610
Linear Regression | Mean RMSE: 694.691 | Std: 23.233
Random Forest | Mean RMSE: 238.076 | Std: 15.704
XGBoost | Mean RMSE: 198.764 | Std: 16.186
LightGBM | Mean RMSE: 187.354 | Std: 9.442


In [47]:
# identify important features
model.fit(X, y)
importances = model.named_steps['regressor'].feature_importances_
feature_names = model.named_steps['preprocessor'].get_feature_names_out()
feat_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
print(feat_df.sort_values(by='importance', ascending=False).head(20))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000351 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1104
[LightGBM] [Info] Number of data points in the train set: 22317, number of used features: 420
[LightGBM] [Info] Start training from score 1189.182466
                                     feature  importance
0                            num__year_month        7793
3                              num__avg_temp        3270
1                             num__day_count        2802
2                          num__total_precip        2751
442                       cat__day_type_SUN.         839
441                       cat__day_type_SAT.         806
338         cat__current_garage_East Liberty         716
342         cat__current_garage_West Mifflin         617
340                 cat__current_garage_Ross         501
337              cat