# Data input
- We will first import the necessary Python packages
- We will then load our datasets

In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
# # The default matplotlib style is a bit ugly. Seaborn provides a nicer style sheet that integrates with matplotlib.
# sns.set()  # use Seaborn styles. 

In [81]:
# One CSV per table; the order here matches the unpacking order below.
csv_paths = (
    './data/hhpub.csv',    # household data
    './data/perpub.csv',   # person data
    './data/trippub.csv',  # trip data
    './data/vehpub.csv',   # vehicle data
)
df_hh, df_per, df_trip, df_veh = (pd.read_csv(path) for path in csv_paths)

In [None]:
# Preview the first five household records (5 is head()'s default).
df_hh.head(5)

# Basic data exploration
Let's first take a look at the trip data and generate some basic statistics.

In [None]:
# Trip count by purpose: non-null HOUSEID entries within each TRIPPURP group.
df_trip.groupby('TRIPPURP')['HOUSEID'].count()

Let's remove trips with the unknown purpose code (-9) and summarize the results as a percent rather than a count.

In [None]:
# Drop trips whose purpose is the unknown code "-9", then express every
# remaining purpose as a share of the *known-purpose* trips. Dividing by the
# filtered row count (rather than all trips) makes the percentages sum to 100.
known_purpose = df_trip[df_trip['TRIPPURP'] != "-9"]
known_purpose.groupby('TRIPPURP')['HOUSEID'].count() / len(known_purpose) * 100

Trip distance frequency in bins

In [None]:
# Histogram of every TRPMILES value using five equal-width bins.
fig, ax = plt.subplots()
ax.hist(df_trip['TRPMILES'], bins=5)
ax.set(title='Frequency Histogram', ylabel='Frequency');

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for TRPMILES.
df_trip['TRPMILES'].describe()

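This is not a great diagram: it shows one large bar and several barely visible ones. The histogram uses equal-width bins spanning the full range of `TRPMILES`, so a small number of extreme values stretch the axis and almost every trip lands in a single bin. We can restrict the range of distances and define custom bins.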

In [None]:
# Keep only distances strictly between 0 and 500, and let numpy choose the
# bin edges with the Freedman-Diaconis rule ('fd').
dist = df_trip['TRPMILES']
fig, ax = plt.subplots()
ax.hist(dist[(dist > 0) & (dist < 500)], bins='fd')
ax.set(title='Frequency Histogram', ylabel='Frequency');

In [None]:
# Hand-picked, unequal-width bin edges: narrow bins for short trips and
# progressively wider ones for long trips.
bin_edges = [0,1,2,5,10,25,50,75,100,200,300,400,500]
in_range = df_trip.query('TRPMILES > 0 and TRPMILES < 500')
fig, ax = plt.subplots()
ax.hist(in_range['TRPMILES'], bins=bin_edges)
ax.set(title='Frequency Histogram', ylabel='Frequency');

In [None]:
# Bin edges 0, 2, ..., 18: values of 18 and above fall outside the last bin
# and are therefore not drawn, even though the row filter allows up to 500.
keep = df_trip['TRPMILES'].gt(0) & df_trip['TRPMILES'].lt(500)
fig, ax = plt.subplots()
ax.hist(df_trip.loc[keep, 'TRPMILES'], bins=np.arange(0, 20, 2))
ax.set(title='Frequency Histogram', ylabel='Frequency');

# Join Data and Perform More Analysis
Often we are interested in a combination of attributes on trips, persons, households, vehicles, etc. We can join data tables on their common key columns and perform additional analysis.

In [None]:
# Person attributes that the trip table does not already carry.
cols_to_use = df_per.columns.difference(df_trip.columns)

# Attach person attributes to every trip.
# DataFrame.join(other, on='PERSONID') would match the trip's PERSONID against
# df_per's *row index* (a plain 0..n-1 RangeIndex here), not against its
# PERSONID column, and so would attach attributes of unrelated people.
# Merge on the key columns instead.
# NOTE(review): assumes a person is identified by HOUSEID + PERSONID and that
# both columns exist in df_per -- confirm against the data dictionary.
person_keys = ['HOUSEID', 'PERSONID']
df_trip = df_trip.merge(
    df_per[person_keys + list(cols_to_use)],
    on=person_keys,
    how='left',              # keep every trip, even if no person record matches
    validate='many_to_one',  # fail loudly if a person key is duplicated in df_per
)

pd.set_option('display.max_rows', None) # Display all rows

# Number of trips for each (age, travel mode) combination.
df_trip.groupby(['R_AGE','TRPTRANS'])['PERSONID'].count()

In [None]:
# Preview the first five trip records (5 is head()'s default).
df_trip.head(5)

It may be more useful to bin age into 10 year groups and display the modes as text.

In [None]:
# Text label for each TRPTRANS code (negative codes are non-responses).
mode_labels = {
    -9: "Not ascertained",
    -8: "I don't know",
    -7: "I prefer not to answer",
    1: "Walk",
    2: "Bicycle",
    3: "Car",
    4: "SUV",
    5: "Van",
    6: "Pickup truck",
    7: "Golf cart / Segway",
    8: "Motorcycle / Moped",
    9: "RV (motor home, ATV, snowmobile)",
    10: "School bus",
    11: "Public or commuter bus",
    12: "Paratransit / Dial-a-ride",
    13: "Private / Charter / Tour / Shuttle bus",
    14: "City-to-city bus (Greyhound, Megabus)",
    15: "Amtrak / Commuter rail",
    16: "Subway / elevated / light rail / street car",
    17: "Taxi / limo (including Uber / Lyft)",
    18: "Rental car (Including Zipcar / Car2Go)",
    19: "Airplane",
    20: "Boat / ferry / water taxi",
    97: "Something Else",
}

# Ten-year age bands with edges 5, 15, ..., 95; ages outside (5, 95] become NaN.
age_edges = np.arange(5, 100, 10)
df_trip['AGE_GRP'] = pd.cut(df_trip['R_AGE'], bins=age_edges)
df_trip['MODE_STR'] = df_trip['TRPTRANS'].map(mode_labels)

# Rows: age band, columns: mode label, values: number of trips.
trips_by_age_mode = df_trip.groupby(['AGE_GRP','MODE_STR'])['PERSONID'].count()
trips_by_age_mode.unstack()

One easy way to remove outliers is via a robust sigma-clipping operation. In the code below, the median (`mu`) is used as a robust estimate of the sample mean, and `sig` is a robust estimate of the sample standard deviation, where the 0.74 comes from the interquartile range of a Gaussian distribution (IQR ≈ 1.35σ, so σ ≈ 0.74 × IQR). We'll remove records with overly long or short travel distances, focusing on trips made by private vehicle.

In [None]:
# Private-vehicle modes, spelled exactly like the MODE_STR labels assigned to
# df_trip (the previous "Pikcup truck" typo silently excluded pickup trips).
auto_modes = ["Car", "SUV", "Van", "Pickup truck"]
auto_trips = df_trip[df_trip['MODE_STR'].isin(auto_modes)]
print("Rows before outlier removal:" , auto_trips.shape[0])

# Robust location and scale: the median stands in for the mean, and
# 0.74 * IQR stands in for the standard deviation (for a Gaussian,
# IQR ~= 1.35 * sigma).
quartiles = np.percentile(auto_trips['TRPMILES'], [25, 50, 75])
mu = quartiles[1]
sig = 0.74 * (quartiles[2] - quartiles[0])

# Keep trips strictly within five robust standard deviations of the median.
auto_trips_remove_outliers = auto_trips.query('(TRPMILES > @mu - 5 * @sig) & (TRPMILES < @mu + 5 * @sig)')
print("Rows after outlier removal:" , auto_trips_remove_outliers.shape[0])

Let's take a look at the number of trips by day of the week and trip purpose.

In [None]:
DOW_dict = {1:"Sun",2:"Mon",3:"Tues",4:"Wed",5:"Thurs",6:"Fri",7:"Sat"}

# Drop trips with the unknown purpose code by filtering rows on TRIPPURP only.
# Comparing the entire frame to "-9" would test (and NaN-mask) every cell of
# every column, which is slow and hides the intent. Only the three columns the
# pivot needs are carried forward.
purpose_by_day = (
    df_trip.loc[df_trip['TRIPPURP'] != "-9", ['TDCASEID', 'TRAVDAY', 'TRIPPURP']]
    .sort_values(by="TRAVDAY")          # numeric day order before relabelling
    .replace({"TRAVDAY": DOW_dict})     # 1..7 -> "Sun".."Sat"
    .pivot_table('TDCASEID', index='TRAVDAY', columns='TRIPPURP',
                 aggfunc='count', sort=False)  # sort=False keeps Sun..Sat order
)

ax = purpose_by_day.plot()
ax.set_ylabel('total trips per day by purpose')
ax.set_xlabel('travel day');