In [1]:
# Load packages
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import matplotlib.pyplot as plt

# NOTE: The first few code sections can take a little time to run.
# I suggest running the code once with FIRST_RUN = True and changing it to False.
# This change will skip the first sections and run from a previously generated csv file.
FIRST_RUN = False

In [None]:
if FIRST_RUN:
    # Load in the trips csv file
    # NOTE: As written, this will search back one directory level from the current file and in a "Data" folder containing the datasets. The reference will be different unless you define the same directory structure.
    trip_df = pd.read_csv("C:/Users/nicka/OneDrive - University of Nebraska-Lincoln/CIVE461/F22/Data/trips_all.csv")
    # We want to be able to filter by home location (not given in trip data) and include hh and person details so need hh_all.csv/per_all.csv
    hh_df = pd.read_csv("C:/Users/nicka/OneDrive - University of Nebraska-Lincoln/CIVE461/F22/Data/hh_all.csv")
    per_df = pd.read_csv("C:/Users/nicka/OneDrive - University of Nebraska-Lincoln/CIVE461/F22/Data/per_all.csv")
    # We can use household weights to generate some approximate zonal statistics
    hh_wgt_df = pd.read_csv("C:/Users/nicka/OneDrive - University of Nebraska-Lincoln/CIVE461/F22/Data/hh_wgt_all.csv")

    # Merge on UID, which is a combination of a household id and a dataset id 
    trip_df = trip_df.merge(hh_df.loc[:,["UID","hh_cbsa"]], on="UID")

In [None]:
# Note: You will need to change hh_cbsa to equal your chosen region. 
if FIRST_RUN:
    # Filter data for your chosen area
    # I am using Sacramento, CA here
    my_trips = trip_df.loc[(trip_df.hh_cbsa=="40900")]
    # print the shape (rows,columns) of my_trips to confirm you have enough records for analysis
    print(my_trips.shape)

In [None]:
if FIRST_RUN:
    # Function to generate new columns based on trip information
    def f(x):
        d = {}
        d['UID'] = x['UID'].mean()
        d['gasprice'] = x['gasprice'].mean()
        d['mean_numontrp'] = x['numontrp'].mean()
        d['trp_ct'] = x['UID'].count()
        return pd.Series(d, index=['UID', 'gasprice', 'mean_numontrp', 'trp_ct'])

    # Function to generate new columns based on person information. Note that the model is for households, so this function gives us back average statistics for persons.
    def g(x):
        d = {}
        d['UID'] = x['UID'].mean()
        d['mean_hh_trips'] = x['cnttdtr'].mean()
        d['mean_edu'] = x['educ'].mean()
        d['mean_race'] = x['r_race'].mean()
        d['mean_sex'] = x['r_sex'].mean()
        return pd.Series(d, index=['UID','mean_hh_trips', 'mean_edu', 'mean_race', 'mean_sex'])

    my_tot_trip = my_trips.groupby(["sampno","trippurp"]).apply(f).reset_index()

    # Merge on UID, which is a combination of a household id and a dataset id 
    cols_to_use = hh_df.columns.difference(my_tot_trip.columns).tolist()
    cols_to_use.append('UID')
    my_tot_trip = my_tot_trip.merge(hh_df[cols_to_use], on="UID")

    grp_per = per_df.groupby(["sampno","perno"]).apply(g).reset_index()
    cols_to_use = grp_per.columns.difference(my_tot_trip.columns).tolist()
    cols_to_use.append('UID')
    my_tot_trip = my_tot_trip.merge(grp_per[cols_to_use], on="UID")

    cols_to_use = hh_wgt_df.columns.difference(my_tot_trip.columns).tolist()
    cols_to_use.append('UID')
    my_tot_trip = my_tot_trip.merge(hh_wgt_df[cols_to_use], on="UID")
    my_tot_trip

In [None]:
# Note: I suggest you open your dataset in Excel to verify there aren't any anomolies (e.g., non-numeric trip counts)
if FIRST_RUN:
    my_tot_trip.to_csv("my_hh_trip_data.csv",index=False)

In [None]:
if not FIRST_RUN:
    my_tot_trip = pd.read_csv("C:/Users/nicka/OneDrive - University of Nebraska-Lincoln/CIVE461/F22/Assignment Prep/my_hh_trip_data.csv")
    my_tot_trip

In [None]:
# I suggest running a constant only model to confirm everything is working as expected. 
# In this case, we see that the constant is positive meaning trips are positive and 2.7 on average - good.
mod = smf.ols(formula='trp_ct ~ 1', data=my_tot_trip)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                 trp_ct   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                       nan
Date:                Mon, 10 Oct 2022   Prob (F-statistic):                nan
Time:                        18:14:41   Log-Likelihood:                -38714.
No. Observations:               18883   AIC:                         7.743e+04
Df Residuals:                   18882   BIC:                         7.744e+04
Df Model:                           0                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.7007      0.014    197.402      0.0

# Home-Based Non-Work Trip Model

In [None]:
hbnw_filt = ((my_tot_trip.trippurp=="HBO") | (my_tot_trip.trippurp=="HBSHOP") | (my_tot_trip.trippurp=="HBSOCREC"))

hbnw_trips = my_tot_trip.loc[hbnw_filt]

new_hbnw_trips = hbnw_trips.loc[hbnw_trips.hhfaminc>0]

new_hbnw_trips.hhrace.get_dummies()

mod = smf.wls(formula='trp_ct ~ 1 + hhsize', data=hbnw_trips, weights=hbnw_trips.wthhfin)
res = mod.fit()
print(res.summary())

AttributeError: 'DataFrame' object has no attribute 'hhrace'

# Home-Based Work Trip Model

- Describe model setup here...

In [None]:
hbw_filt = my_tot_trip.trippurp=="HBW"

hbw_trips = my_tot_trip.loc[hbw_filt]

# WLS is "weighted least squares". It uses survey weights that capture non-represenativeness/bias in survey respondent characteristics.
# Characteristics might be that the age distribution in respondents doesn't exactly match that in the US population.
# WLS is a slight adjustment on OLS, or "ordinary least squares" that re-weights each observations contribution to paramter estimation.
mod = smf.wls(formula='trp_ct ~ 1 + hhsize + hhvehcnt', data=hbw_trips, weights=hbw_trips.wthhfin)

res = mod.fit()
print(res.summary())

- Describe model results here...

# Cross-Classification Model
- Describe cross-classification model here

In [None]:
# This function takes a Pandas dataframe as an input and requires the user specify a cell value (values) + row/column labels to aggregate against.
cross_class = pd.pivot_table(hbnw_trips,values="trp_ct",index="hhsize",columns="hhvehcnt",aggfunc="mean",fill_value=0)

#new_hbnw_trips  = hbnw_trips
hbnw_trips.loc[hbnw_trips.hhsize>4,"new_hhsize"] = 4

cross_class

In [None]:
# You will need to combine row/column categories to obtain a reasonable count in each cell.
# Combining can be done by defining a new column and using if/else statements with filtering to create new household size and vehicle count variables.
# E.g., you can change all household sizes >3 to be a single category called "4+ household members"
cross_class_ct = pd.pivot_table(hbnw_trips,values="trp_ct",index="hhsize",columns="hhvehcnt",aggfunc="count",fill_value=0)
cross_class_ct

# Model Comparison
- Compare the models here...

# NOTE: The below code is For information purposes ONLY.
# You do not need to run this model.
# You should delete the below text/code from your submission file

In [None]:
plt.hist(hbw_trips['trp_ct'], len(hbw_trips['trp_ct'].unique()), density=True, facecolor='g', alpha=0.75)
plt.xlabel('Trip County');
plt.ylabel('Frequency');
plt.title('Histogram of HH Trip Frequency');

In [None]:
# The data follow a count process: i.e., 1, 2, 3, 4, etc.
# Therefore, we should really be using a model based on a count process rather than the normal distribution implicit in the OLS regression.
# The negative binomial model is a standard approach to county data regression when the data exhibit skewness.
mod_nbin = smf.negativebinomial(formula='trp_ct ~ 1 + hhsize', data=hbw_trips, weights=hbw_trips.wthhfin)

res_nbin = mod_nbin.fit()
print(res_nbin.summary())

In [None]:
# The above model results (alpha) suggest that the process is not negative binomial distributed - i.e., does not exhibit skewness.
# We'll run the model again using a Poisson regression that assumes mean = variance.
mod_poi = smf.poisson(formula='trp_ct ~ 1 + hhsize', data=hbw_trips, weights=hbw_trips.wthhfin)

res_poi = mod_poi.fit()
print(res_poi.summary())