WeGo Public Transit is a public transit system serving the Greater Nashville and Davidson County area. WeGo provides local and regional bus routes, the WeGo Star train service connecting Lebanon to downtown Nashville, along with several other transit services.

In this project, you'll be analyzing the bus spacing to look for patterns and try to identify correlations to controllable or external factors. Specifically, you'll be using a dataset containing information on the headway, or amount of time between vehicle arrivals at a stop. This dataset contains a column HDWY_DEV, which shows the headway deviation. This variable will be negative when bunching has occurred (shorter headway than scheduled) and will be positive for gapping (longer headway than scheduled). Note that you can calculate headway deviation percentage as HDWY_DEV/SCHEDULED_HDWY.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#reading in the 3 csv's
bna_2022 = pd.read_csv("../data/bna_2022.csv")
bna_2022.head()

Unnamed: 0,key,class,expire_time_gmt,obs_id,obs_name,valid_time_gmt,day_ind,temp,wx_icon,icon_extd,...,clds,water_temp,primary_wave_period,primary_wave_height,primary_swell_period,primary_swell_height,primary_swell_direction,secondary_swell_period,secondary_swell_height,secondary_swell_direction
0,KBNA,observation,1641027180,KBNA,Nashville,1641019980,N,71,26,2600,...,OVC,,,,,,,,,
1,KBNA,observation,1641030780,KBNA,Nashville,1641023580,N,72,26,2600,...,OVC,,,,,,,,,
2,KBNA,observation,1641034380,KBNA,Nashville,1641027180,N,73,26,2600,...,OVC,,,,,,,,,
3,KBNA,observation,1641037980,KBNA,Nashville,1641030780,N,73,26,2600,...,OVC,,,,,,,,,
4,KBNA,observation,1641041580,KBNA,Nashville,1641034380,N,73,26,2600,...,OVC,,,,,,,,,


In [3]:
headway_data = pd.read_csv("../data/Headway_Data.csv.txt")
headway_data.head()

Unnamed: 0,CALENDAR_ID,SERVICE_ABBR,ADHERENCE_ID,DATE,ROUTE_ABBR,BLOCK_ABBR,OPERATOR,TRIP_ID,OVERLOAD_ID,ROUTE_DIRECTION_NAME,...,ACTUAL_HDWY,HDWY_DEV,ADJUSTED_EARLY_COUNT,ADJUSTED_LATE_COUNT,ADJUSTED_ONTIME_COUNT,STOP_CANCELLED,PREV_SCHED_STOP_CANCELLED,IS_RELIEF,BLOCK_STOP_ORDER,DWELL_IN_MINS
0,120211101,1,76447164,2021-11-01,7,1704,2088,297750,0,TO DOWNTOWN,...,15.983333,1.983333,0,0,1,0,0.0,0,2,12.65
1,120211101,1,76447165,2021-11-01,7,1704,2088,297750,0,TO DOWNTOWN,...,17.333333,3.333333,0,0,1,0,0.0,0,11,0.0
2,120211101,1,76447166,2021-11-01,7,1704,2088,297750,0,TO DOWNTOWN,...,,,0,1,0,0,,0,23,22.416666
3,120211101,1,76447167,2021-11-01,50,1704,2088,297749,0,TO DOWNTOWN,...,,,0,0,1,0,0.0,0,24,5.766666
4,120211101,1,76447168,2021-11-01,50,1704,2088,297749,0,TO DOWNTOWN,...,,,0,0,1,0,,0,25,0.0


In [4]:
bna_weather = pd.read_csv("../data/bna_weather.csv")
bna_weather.head()

Unnamed: 0,Date,key,class,expire_time_gmt,obs_id,obs_name,valid_time_gmt,day_ind,temp,wx_icon,...,clds,water_temp,primary_wave_period,primary_wave_height,primary_swell_period,primary_swell_height,primary_swell_direction,secondary_swell_period,secondary_swell_height,secondary_swell_direction
0,2021-11-01 00:53:00,KBNA,observation,1635753180,KBNA,Nashville,1635745980,N,51,26,...,OVC,,,,,,,,,
1,2021-11-01 01:30:00,KBNA,observation,1635755400,KBNA,Nashville,1635748200,N,50,29,...,SCT,,,,,,,,,
2,2021-11-01 01:53:00,KBNA,observation,1635756780,KBNA,Nashville,1635749580,N,49,33,...,CLR,,,,,,,,,
3,2021-11-01 02:53:00,KBNA,observation,1635760380,KBNA,Nashville,1635753180,N,48,33,...,CLR,,,,,,,,,
4,2021-11-01 03:53:00,KBNA,observation,1635763980,KBNA,Nashville,1635756780,N,47,33,...,CLR,,,,,,,,,


In [5]:
#kept columns that we wanted
weather_df = bna_weather[['Date', 'temp', 'wx_phrase']]
headway_df = headway_data[['ADHERENCE_ID', 'DATE', 'ROUTE_ABBR', 'BLOCK_ABBR', 'OPERATOR', 'TRIP_ID', 'ROUTE_DIRECTION_NAME', 'TIME_POINT_ABBR', 'ROUTE_STOP_SEQUENCE', 'SCHEDULED_TIME', 'ACTUAL_ARRIVAL_TIME', 'ACTUAL_DEPARTURE_TIME', 'ADHERENCE', 'SCHEDULED_HDWY', 'ACTUAL_HDWY', 'HDWY_DEV']]

In [6]:
headway_df

Unnamed: 0,ADHERENCE_ID,DATE,ROUTE_ABBR,BLOCK_ABBR,OPERATOR,TRIP_ID,ROUTE_DIRECTION_NAME,TIME_POINT_ABBR,ROUTE_STOP_SEQUENCE,SCHEDULED_TIME,ACTUAL_ARRIVAL_TIME,ACTUAL_DEPARTURE_TIME,ADHERENCE,SCHEDULED_HDWY,ACTUAL_HDWY,HDWY_DEV
0,76447164,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,HBHS,4.0,14:10:00,13:59:21,14:12:00,-2.000000,14.0,15.983333,1.983333
1,76447165,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,21BK,3.0,14:20:00,14:23:21,14:23:21,-3.350000,14.0,17.333333,3.333333
2,76447166,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,MCC5_9,2.0,14:39:00,14:36:46,14:59:11,-20.183333,,,
3,76447167,2021-11-01,50,1704,2088,297749,TO DOWNTOWN,MLKS,7.0,15:10:00,15:04:31,15:10:17,-0.283333,,,
4,76447168,2021-11-01,50,1704,2088,297749,TO DOWNTOWN,MCC5_11,5.0,15:27:00,15:16:59,15:16:59,10.016666,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981710,91057724,2022-11-01,50,9302,2355,329980,TO DOWNTOWN,MCC4_20,3.0,15:18:00,15:19:43,15:20:44,-2.733333,,,
1981711,91057851,2022-11-01,50,9950,1880,330003,TO DOWNTOWN,MLKS,10.0,15:20:00,14:57:12,15:07:06,12.900000,10.0,7.633333,-2.366667
1981712,91057852,2022-11-01,50,9950,1880,330003,TO DOWNTOWN,MCC5_11,4.0,15:37:00,15:10:16,15:10:16,26.733333,,,
1981713,91057881,2022-11-01,56,9975,1922,330011,TO DOWNTOWN,MEIG,7.0,15:20:00,14:59:40,15:19:02,0.966666,0.0,,


In [7]:
#changing the column names
headway_df.columns = ['adh_id', 'date', 'rte_abbr', 'blk_abbr', 'opr', 'trip_id', 'rte_dir_name', 'time_pt_abbr', 'rte_stop_seq', 'schd_time', 'act_arrvl_time', 'act_depart', 'adh', 'schd_hdwy', 'act_hdwy', 'hdwy_dev']

In [8]:
headway_df

Unnamed: 0,adh_id,date,rte_abbr,blk_abbr,opr,trip_id,rte_dir_name,time_pt_abbr,rte_stop_seq,schd_time,act_arrvl_time,act_depart,adh,schd_hdwy,act_hdwy,hdwy_dev
0,76447164,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,HBHS,4.0,14:10:00,13:59:21,14:12:00,-2.000000,14.0,15.983333,1.983333
1,76447165,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,21BK,3.0,14:20:00,14:23:21,14:23:21,-3.350000,14.0,17.333333,3.333333
2,76447166,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,MCC5_9,2.0,14:39:00,14:36:46,14:59:11,-20.183333,,,
3,76447167,2021-11-01,50,1704,2088,297749,TO DOWNTOWN,MLKS,7.0,15:10:00,15:04:31,15:10:17,-0.283333,,,
4,76447168,2021-11-01,50,1704,2088,297749,TO DOWNTOWN,MCC5_11,5.0,15:27:00,15:16:59,15:16:59,10.016666,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981710,91057724,2022-11-01,50,9302,2355,329980,TO DOWNTOWN,MCC4_20,3.0,15:18:00,15:19:43,15:20:44,-2.733333,,,
1981711,91057851,2022-11-01,50,9950,1880,330003,TO DOWNTOWN,MLKS,10.0,15:20:00,14:57:12,15:07:06,12.900000,10.0,7.633333,-2.366667
1981712,91057852,2022-11-01,50,9950,1880,330003,TO DOWNTOWN,MCC5_11,4.0,15:37:00,15:10:16,15:10:16,26.733333,,,
1981713,91057881,2022-11-01,56,9975,1922,330011,TO DOWNTOWN,MEIG,7.0,15:20:00,14:59:40,15:19:02,0.966666,0.0,,


In [9]:
#adding new column to calculate the headway deviation percentage
headway_df["hdwy_dev_%"] = ((headway_df["hdwy_dev"] / headway_df["schd_hdwy"])*100)
headway_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  headway_df["hdwy_dev_%"] = ((headway_df["hdwy_dev"] / headway_df["schd_hdwy"])*100)


Unnamed: 0,adh_id,date,rte_abbr,blk_abbr,opr,trip_id,rte_dir_name,time_pt_abbr,rte_stop_seq,schd_time,act_arrvl_time,act_depart,adh,schd_hdwy,act_hdwy,hdwy_dev,hdwy_dev_%
0,76447164,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,HBHS,4.0,14:10:00,13:59:21,14:12:00,-2.000000,14.0,15.983333,1.983333,14.166664
1,76447165,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,21BK,3.0,14:20:00,14:23:21,14:23:21,-3.350000,14.0,17.333333,3.333333,23.809521
2,76447166,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,MCC5_9,2.0,14:39:00,14:36:46,14:59:11,-20.183333,,,,
3,76447167,2021-11-01,50,1704,2088,297749,TO DOWNTOWN,MLKS,7.0,15:10:00,15:04:31,15:10:17,-0.283333,,,,
4,76447168,2021-11-01,50,1704,2088,297749,TO DOWNTOWN,MCC5_11,5.0,15:27:00,15:16:59,15:16:59,10.016666,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981710,91057724,2022-11-01,50,9302,2355,329980,TO DOWNTOWN,MCC4_20,3.0,15:18:00,15:19:43,15:20:44,-2.733333,,,,
1981711,91057851,2022-11-01,50,9950,1880,330003,TO DOWNTOWN,MLKS,10.0,15:20:00,14:57:12,15:07:06,12.900000,10.0,7.633333,-2.366667,-23.666670
1981712,91057852,2022-11-01,50,9950,1880,330003,TO DOWNTOWN,MCC5_11,4.0,15:37:00,15:10:16,15:10:16,26.733333,,,,
1981713,91057881,2022-11-01,56,9975,1922,330011,TO DOWNTOWN,MEIG,7.0,15:20:00,14:59:40,15:19:02,0.966666,0.0,,,


In [10]:
weather_df

Unnamed: 0,Date,temp,wx_phrase
0,2021-11-01 00:53:00,51,Cloudy
1,2021-11-01 01:30:00,50,Partly Cloudy
2,2021-11-01 01:53:00,49,Fair
3,2021-11-01 02:53:00,48,Fair
4,2021-11-01 03:53:00,47,Fair
...,...,...,...
9734,2022-10-31 19:53:00,60,Cloudy
9735,2022-10-31 20:53:00,60,Cloudy
9736,2022-10-31 21:53:00,58,Mostly Cloudy
9737,2022-10-31 22:53:00,60,Mostly Cloudy


In [11]:
#dropping the time from the date column
weather_df['Date'] = weather_df['Date'].str[:10]

weather_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_df['Date'] = weather_df['Date'].str[:10]


Unnamed: 0,Date,temp,wx_phrase
0,2021-11-01,51,Cloudy
1,2021-11-01,50,Partly Cloudy
2,2021-11-01,49,Fair
3,2021-11-01,48,Fair
4,2021-11-01,47,Fair
...,...,...,...
9734,2022-10-31,60,Cloudy
9735,2022-10-31,60,Cloudy
9736,2022-10-31,58,Mostly Cloudy
9737,2022-10-31,60,Mostly Cloudy


In [13]:
#renaming columns in weather
weather_df = weather_df.rename(columns={'Date': 'date', 'wx_phrase': 'wx_condition'})
weather_df = weather_df[['date', 'temp', 'wx_condition']]
weather_df

Unnamed: 0,date,temp,wx_condition
0,2021-11-01,51,Cloudy
1,2021-11-01,50,Partly Cloudy
2,2021-11-01,49,Fair
3,2021-11-01,48,Fair
4,2021-11-01,47,Fair
...,...,...,...
9734,2022-10-31,60,Cloudy
9735,2022-10-31,60,Cloudy
9736,2022-10-31,58,Mostly Cloudy
9737,2022-10-31,60,Mostly Cloudy


In [15]:
#merging the 2 data frames
wego_trips = headway_df.merge(weather_df, on = ['date'], how='outer')
wego_trips.head()

Unnamed: 0,adh_id,date,rte_abbr,blk_abbr,opr,trip_id,rte_dir_name,time_pt_abbr,rte_stop_seq,schd_time,act_arrvl_time,act_depart,adh,schd_hdwy,act_hdwy,hdwy_dev,hdwy_dev_%,temp,wx_condition
0,76447164,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,HBHS,4.0,14:10:00,13:59:21,14:12:00,-2.0,14.0,15.983333,1.983333,14.166664,51.0,Cloudy
1,76447164,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,HBHS,4.0,14:10:00,13:59:21,14:12:00,-2.0,14.0,15.983333,1.983333,14.166664,50.0,Partly Cloudy
2,76447164,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,HBHS,4.0,14:10:00,13:59:21,14:12:00,-2.0,14.0,15.983333,1.983333,14.166664,49.0,Fair
3,76447164,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,HBHS,4.0,14:10:00,13:59:21,14:12:00,-2.0,14.0,15.983333,1.983333,14.166664,48.0,Fair
4,76447164,2021-11-01,7,1704,2088,297750,TO DOWNTOWN,HBHS,4.0,14:10:00,13:59:21,14:12:00,-2.0,14.0,15.983333,1.983333,14.166664,47.0,Fair
