In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the headway extract covering 8-1-2023 to 9-30-2023 into `wego`,
# then preview the first rows.
headway_csv = "../data/Headway Data, 8-1-2023 to 9-30-2023.csv"
wego = pd.read_csv(headway_csv)

wego.head()

## 1. What is the overall on-time performance, and what do the overall distributions of adherence and headway deviation look like?






In [None]:
# Working subset holding only the columns used by the distribution plots.
# .copy() makes this an independent frame: a later cell assigns a new column to
# `filtered_data`, which on a bare selection of `wego` can emit
# SettingWithCopyWarning.
relevant_columns = ['ROUTE_ABBR', 'SCHEDULED_TIME', 'ACTUAL_ARRIVAL_TIME', 'ADHERENCE', 'HDWY_DEV','SCHEDULED_HDWY']
filtered_data = wego.loc[:, relevant_columns].copy()


In [None]:
# Share of rows per ADJUSTED_ONTIME_COUNT value, as a percentage rounded to two decimals
ontime_percentage = (
    wego['ADJUSTED_ONTIME_COUNT']
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
)

# Render each share as text with a trailing percent sign
ontime_percentage_formatted = ontime_percentage.map(lambda share: f"{share:.2f}%")

# Report
print("Percentage of ADJUSTED_ONTIME_COUNT values:")
print(ontime_percentage_formatted)

In [None]:
# Raw ratio of headway deviation to scheduled headway.
# Left uncleaned: may contain inf/NaN, which the cleaned variant below removes.
wego['HDWY_DEV_PCT'] = wego['HDWY_DEV'].div(wego['SCHEDULED_HDWY'])

In [None]:
# Cleaned headway-deviation ratio: same division as the raw column,
# with +/-inf turned into NaN so they can be handled as missing values.
wego['HDWY_DEV_PCT_C'] = (
    wego['HDWY_DEV']
    .div(wego['SCHEDULED_HDWY'])
    .replace([np.inf, -np.inf], np.nan)
)

# Separate frame containing only the rows where the cleaned ratio is present
wego_cleaned = wego.dropna(subset=['HDWY_DEV_PCT_C'])

In [None]:
# Side-by-side histograms (with KDE overlays) of adherence and headway deviation
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# (column, panel title, x-axis label) for each panel, left to right
panels = [
    ('ADHERENCE', 'Distribution of Adherence', 'Adherence (minutes)'),
    ('HDWY_DEV', 'Distribution of Headway Deviation', 'Headway Deviation (minutes)'),
]
for ax, (column, title, x_label) in zip(axes, panels):
    sns.histplot(filtered_data[column], kde=True, color='lightpink', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Histograms with reference lines: adherence against the late/early thresholds,
# and headway deviation expressed as a ratio of the scheduled headway.
fig, (adherence_ax, headway_ax) = plt.subplots(1, 2, figsize=(12, 6))

# 'ADHERENCE'
sns.histplot(filtered_data['ADHERENCE'], kde=True, color='lightpink', ax=adherence_ax)
adherence_ax.set_title('Histogram of Adherence')
adherence_ax.set_xlabel('Adherence (minutes)')
adherence_ax.set_ylabel('Frequency')
# Dashed vertical lines mark the late and early thresholds
adherence_ax.axvline(x=-6, color='purple', linestyle='--', label='Late Threshold (-6 min)')
adherence_ax.axvline(x=1, color='green', linestyle='--', label='Early Threshold (1 min)')
adherence_ax.set_xlim(left=-30, right=30)
adherence_ax.legend()

# 'HDWY_DEV' relative to the scheduled headway.
# Dividing by a zero scheduled headway yields +/-inf; those are converted to NaN
# so the histogram bins and KDE are computed on finite values only.
# .assign() rebinds `filtered_data` to a new frame instead of mutating the
# existing one, so re-running this cell is safe.
filtered_data = filtered_data.assign(
    HDWY_DEV_PCT_C=(
        filtered_data['HDWY_DEV']
        .div(filtered_data['SCHEDULED_HDWY'])
        .replace([np.inf, -np.inf], np.nan)
    )
)
sns.histplot(filtered_data['HDWY_DEV_PCT_C'], kde=True, color='#6F4B9E', ax=headway_ax)
headway_ax.set_title('Headway Deviation Percent')
headway_ax.set_xlabel('Headway Deviation Percent')
headway_ax.set_ylabel('Frequency')
headway_ax.set_xlim(left=-2, right=2)

fig.tight_layout()
plt.show()

## 2. How does direction of travel, route, or location affect the headway and on-time performance?

In [None]:
# Group the stop records by route
grouped_abbr = wego.groupby(by='ROUTE_ABBR')

In [None]:
# Group the stop records by direction of travel
grouped_route = wego.groupby(by='ROUTE_DIRECTION_NAME')

In [None]:
# Per route: mean and standard deviation of the cleaned headway ratio,
# plus the mean of ADJUSTED_ONTIME_COUNT
statistics = grouped_abbr.agg(
    {
        'HDWY_DEV_PCT_C': ['mean', 'std'],
        'ADJUSTED_ONTIME_COUNT': 'mean',
    }
)

In [None]:
# Display the per-route summary table aggregated from `grouped_abbr`
statistics

In [None]:
# Split the records by route and, within each route, by direction of travel
grouped_route = wego.groupby(by=['ROUTE_ABBR', 'ROUTE_DIRECTION_NAME'])

# Per route/direction: mean and standard deviation of the cleaned headway ratio,
# plus the mean of ADJUSTED_ONTIME_COUNT
statistic = grouped_route.agg(
    {
        'HDWY_DEV_PCT_C': ['mean', 'std'],
        'ADJUSTED_ONTIME_COUNT': 'mean',
    }
)

In [None]:
# Display the route-and-direction summary table aggregated from `grouped_route`
statistic

In [None]:
# Coordinates recorded on every row whose time point is "SAMS"
wego.loc[
    wego['TIME_POINT_ABBR'] == "SAMS",
    ['TIME_POINT_ABBR', 'LATITUDE', 'LONGITUDE'],
]

In [None]:
# One row per time point, carrying the first latitude/longitude seen for it
grouped_time_points = (
    wego.groupby('TIME_POINT_ABBR')[['LATITUDE', 'LONGITUDE']]
    .first()
    .reset_index()
)

In [None]:
# Display the one-row-per-time-point coordinate table
grouped_time_points

In [None]:
# One row per time point: mean coordinates and number of records,
# ordered from the most to the least frequent time point
grouped_time_point = (
    wego.groupby('TIME_POINT_ABBR')
    .agg(
        LATITUDE=pd.NamedAgg(column='LATITUDE', aggfunc='mean'),
        LONGITUDE=pd.NamedAgg(column='LONGITUDE', aggfunc='mean'),
        COUNT=pd.NamedAgg(column='TIME_POINT_ABBR', aggfunc='size'),
    )
    .reset_index()
    .sort_values(by='COUNT', ascending=False)
)

print(grouped_time_point)

In [None]:
# Flag column: 1 where the cleaned headway ratio is below -0.5, else 0
# (rows with a missing ratio compare False and therefore get 0)
wego['BUNCHING'] = (wego['HDWY_DEV_PCT_C'] < -0.5).astype(int)

In [None]:
# Flag column: 1 where the cleaned headway ratio is above 0.5, else 0
# (rows with a missing ratio compare False and therefore get 0)
wego['GAPPING'] = (wego['HDWY_DEV_PCT_C'] > 0.5).astype(int)

In [None]:
# Flag column: 1 where the cleaned headway ratio lies in [-0.5, 0.5] (both ends
# inclusive), else 0; rows with a missing ratio get 0
wego['ACCEPTABLE_HDWY'] = wego['HDWY_DEV_PCT_C'].between(-0.5, 0.5).astype(int)

In [None]:
# Preview the first rows, including the headway flag columns added above
wego.head()

In [None]:
# Remove the misnamed 'ACCEPTING_HDWY' column if it is present.
# No visible cell creates a column with that name (the flag built above is
# 'ACCEPTABLE_HDWY'), so without errors='ignore' this raises KeyError on a
# fresh Restart & Run All.
wego.drop(columns=['ACCEPTING_HDWY'], errors='ignore', inplace=True)

#John Micheals answer

For on-time performance, route 22 (Bordeaux) is the most "on time" route; location MCC5_6, which is downtown around the WeGo Central station, is the most "on time" location; and trips going to downtown have a higher "on time" percentage than trips going from downtown.

## 3. How does time of day or day of week affect headway and on-time performance?


SERVICE_ABBR: 1 = Weekday, 2 = Saturday, 3 = Sunday.

In [None]:
# Parse the three timestamp columns into datetimes, then show the resulting dtypes
for time_col in ('SCHEDULED_TIME', 'ACTUAL_ARRIVAL_TIME', 'ACTUAL_DEPARTURE_TIME'):
    wego[time_col] = pd.to_datetime(wego[time_col])
wego.info()

In [None]:
# Mean on-time / early / late counts per scheduled weekday, lowest on-time first.
# The weekday key is derived here from SCHEDULED_TIME because the DAY_OF_WEEK
# column is only added to `wego` in a later cell; grouping by the column name
# would raise KeyError on a fresh top-to-bottom run.
# NOTE(review): assumes the raw file does not already ship a DAY_OF_WEEK column
# with different contents — confirm.
(
    wego
    .groupby(wego['SCHEDULED_TIME'].dt.day_name().rename('DAY_OF_WEEK'))
    [['ADJUSTED_ONTIME_COUNT', 'ADJUSTED_EARLY_COUNT', 'ADJUSTED_LATE_COUNT']]
    .mean()
    .sort_values('ADJUSTED_ONTIME_COUNT')
)

In [None]:
# Summary statistics of adherence and headway deviation for each route
wego[['ROUTE_ABBR', 'ADHERENCE', 'HDWY_DEV']].groupby('ROUTE_ABBR').describe()

In [None]:
# Hour of day and weekday name taken from the scheduled timestamp
scheduled_ts = wego['SCHEDULED_TIME']
wego['HOUR_OF_DAY'] = scheduled_ts.dt.hour
wego['DAY_OF_WEEK'] = scheduled_ts.dt.day_name()

# The "ACTUAL_*" counterparts are taken from the actual DEPARTURE timestamp
# NOTE(review): an earlier comment here said ACTUAL_ARRIVAL_TIME — confirm which
# timestamp is intended.
departure_ts = wego['ACTUAL_DEPARTURE_TIME']
wego['ACTUAL_HOUR_OF_DAY'] = departure_ts.dt.hour
wego['ACTUAL_DAY_OF_WEEK'] = departure_ts.dt.day_name()

In [None]:
# Mean cleaned headway ratio and mean ADJUSTED_ONTIME_COUNT per scheduled hour.
# Named aggregation sets the output column names directly, so they cannot drift
# out of step with the aggregation order.
scheduled_hourly_stats = (
    wego.groupby('HOUR_OF_DAY')
    .agg(
        HDWY_DEV_PCT_C_MEAN_SCHEDULED=('HDWY_DEV_PCT_C', 'mean'),
        ONTIME_PERCENTAGE_MEAN_SCHEDULED=('ADJUSTED_ONTIME_COUNT', 'mean'),
    )
    .reset_index()
)

# Same statistics, bucketed by ACTUAL_HOUR_OF_DAY instead
actual_hourly_stats = (
    wego.groupby('ACTUAL_HOUR_OF_DAY')
    .agg(
        HDWY_DEV_PCT_C_MEAN_ACTUAL=('HDWY_DEV_PCT_C', 'mean'),
        ONTIME_PERCENTAGE_MEAN_ACTUAL=('ADJUSTED_ONTIME_COUNT', 'mean'),
    )
    .reset_index()
)

# Only the last expression of a cell is rendered automatically, so the first
# table is shown explicitly with display() (built into Jupyter/IPython).
display(actual_hourly_stats)
scheduled_hourly_stats

In [None]:
# Mean cleaned headway ratio and mean ADJUSTED_ONTIME_COUNT per scheduled weekday
scheduled_weekly_stats = (
    wego.groupby('DAY_OF_WEEK')
    .agg(
        HDWY_DEV_PCT_C_MEAN_SCHEDULED=('HDWY_DEV_PCT_C', 'mean'),
        ONTIME_PERCENTAGE_MEAN_SCHEDULED=('ADJUSTED_ONTIME_COUNT', 'mean'),
    )
    .reset_index()
)

# Same statistics, bucketed by ACTUAL_DAY_OF_WEEK instead
actual_weekly_stats = (
    wego.groupby('ACTUAL_DAY_OF_WEEK')
    .agg(
        HDWY_DEV_PCT_C_MEAN_ACTUAL=('HDWY_DEV_PCT_C', 'mean'),
        ONTIME_PERCENTAGE_MEAN_ACTUAL=('ADJUSTED_ONTIME_COUNT', 'mean'),
    )
    .reset_index()
)

print(scheduled_weekly_stats)
print(actual_weekly_stats)

In [None]:

# Two stacked panels comparing the scheduled-time and actual-time groupings:
# top by hour of day, bottom by day of week.

# The weekly tables are keyed by day NAME and come out of groupby in alphabetical
# order; sort local copies Monday -> Sunday so the x axis reads as a calendar week.
day_rank = {
    day: position
    for position, day in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    )
}
scheduled_by_day = scheduled_weekly_stats.sort_values(
    'DAY_OF_WEEK', key=lambda days: days.map(day_rank)
)
actual_by_day = actual_weekly_stats.sort_values(
    'ACTUAL_DAY_OF_WEEK', key=lambda days: days.map(day_rank)
)

fig, (hourly_ax, weekly_ax) = plt.subplots(2, 1, figsize=(14, 12))

# Plot for Scheduled vs Actual Hourly Statistics
sns.lineplot(data=scheduled_hourly_stats, x='HOUR_OF_DAY', y='HDWY_DEV_PCT_C_MEAN_SCHEDULED', label='Scheduled Headway Deviation Mean', marker='o', ax=hourly_ax)
sns.lineplot(data=actual_hourly_stats, x='ACTUAL_HOUR_OF_DAY', y='HDWY_DEV_PCT_C_MEAN_ACTUAL', label='Actual Headway Deviation Mean', marker='o', ax=hourly_ax)
sns.lineplot(data=scheduled_hourly_stats, x='HOUR_OF_DAY', y='ONTIME_PERCENTAGE_MEAN_SCHEDULED', label='Scheduled On-Time Percentage Mean', marker='o', ax=hourly_ax)
sns.lineplot(data=actual_hourly_stats, x='ACTUAL_HOUR_OF_DAY', y='ONTIME_PERCENTAGE_MEAN_ACTUAL', label='Actual On-Time Percentage Mean', marker='o', ax=hourly_ax)
hourly_ax.set_title('Hourly Headway Deviation and On-Time Performance (Scheduled vs Actual)')
hourly_ax.set_xlabel('Hour of Day')
hourly_ax.set_ylabel('Mean Value')
hourly_ax.legend()
hourly_ax.grid(True)

# Plot for Scheduled vs Actual Weekly Statistics.
# sort=False keeps the Monday -> Sunday row order; the default would re-sort the
# day names alphabetically.
sns.lineplot(data=scheduled_by_day, x='DAY_OF_WEEK', y='HDWY_DEV_PCT_C_MEAN_SCHEDULED', label='Scheduled Headway Deviation Mean', marker='o', sort=False, ax=weekly_ax)
sns.lineplot(data=actual_by_day, x='ACTUAL_DAY_OF_WEEK', y='HDWY_DEV_PCT_C_MEAN_ACTUAL', label='Actual Headway Deviation Mean', marker='o', sort=False, ax=weekly_ax)
sns.lineplot(data=scheduled_by_day, x='DAY_OF_WEEK', y='ONTIME_PERCENTAGE_MEAN_SCHEDULED', label='Scheduled On-Time Percentage Mean', marker='o', sort=False, ax=weekly_ax)
sns.lineplot(data=actual_by_day, x='ACTUAL_DAY_OF_WEEK', y='ONTIME_PERCENTAGE_MEAN_ACTUAL', label='Actual On-Time Percentage Mean', marker='o', sort=False, ax=weekly_ax)
weekly_ax.set_title('Weekly Headway Deviation and On-Time Performance (Scheduled vs Actual)')
# The axis shows day names, not 0-6 codes, so the label no longer claims a numeric encoding
weekly_ax.set_xlabel('Day of Week')
weekly_ax.set_ylabel('Mean Value')
weekly_ax.legend()
weekly_ax.grid(True)

fig.tight_layout()
plt.show()

## 4. How much of a factor does the driver have on headway and on-time performance? The driver is indicated by the OPERATOR variable.

In [None]:
# Per-operator means of the headway ratio, adherence and ADJUSTED_ONTIME_COUNT.
# The CLEANED ratio (HDWY_DEV_PCT_C) is averaged: the raw HDWY_DEV_PCT column
# still holds +/-inf values, and a single inf turns an operator's mean into
# inf (or NaN when both signs occur).
grouped_operator = (
    wego.groupby('OPERATOR')
    .agg(
        HDWY_DEV_PCT_MEAN=('HDWY_DEV_PCT_C', 'mean'),
        ADHERENCE_MEAN=('ADHERENCE', 'mean'),
        ONTIME_PERCENTAGE_MEAN=('ADJUSTED_ONTIME_COUNT', 'mean'),
    )
    .reset_index()
)

# Display the first few rows
print(grouped_operator.head(10))

In [None]:
# Two stacked bar charts with one bar per operator:
# mean headway-deviation ratio on top, mean on-time performance below.
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# (metric column, panel title, y-axis label) for each panel, top to bottom
panels = [
    ('HDWY_DEV_PCT_MEAN', 'Mean Headway Deviation Percentage by Operator', 'Mean Headway Deviation Percentage'),
    ('ONTIME_PERCENTAGE_MEAN', 'Mean On-Time Performance by Operator', 'Mean On-Time Performance'),
]
for ax, (metric, title, y_label) in zip(axes, panels):
    sns.barplot(x='OPERATOR', y=metric, data=grouped_operator, palette='viridis', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Operator')
    ax.set_ylabel(y_label)
    # Vertical tick labels
    ax.tick_params(axis='x', labelrotation=90)
    ax.grid(True)

fig.tight_layout()
plt.show()


In [None]:
# Per-operator means of the headway ratio, adherence and ADJUSTED_ONTIME_COUNT.
# The CLEANED ratio (HDWY_DEV_PCT_C) is averaged: the raw HDWY_DEV_PCT column
# still holds +/-inf values, which would make the affected operators' means
# inf or NaN.
grouped_operator = (
    wego.groupby('OPERATOR')
    .agg(
        HDWY_DEV_PCT_MEAN=('HDWY_DEV_PCT_C', 'mean'),
        ADHERENCE_MEAN=('ADHERENCE', 'mean'),
        ONTIME_PERCENTAGE_MEAN=('ADJUSTED_ONTIME_COUNT', 'mean'),
    )
    .reset_index()
)

# Sort by on-time performance and select the top 10 operators
top_operators = grouped_operator.sort_values(by='ONTIME_PERCENTAGE_MEAN', ascending=False).head(10)

# Display the first rows of the top-10 table
print(top_operators.head())

In [None]:
# Two stacked bar charts restricted to the top-10 operators:
# mean headway-deviation ratio on top, mean on-time performance below.
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# (metric column, panel title, y-axis label) for each panel, top to bottom
panels = [
    ('HDWY_DEV_PCT_MEAN', 'Mean Headway Deviation Percentage by Top 10 Operators', 'Mean Headway Deviation Percentage'),
    ('ONTIME_PERCENTAGE_MEAN', 'Mean On-Time Performance by Top 10 Operators', 'Mean On-Time Performance'),
]
for ax, (metric, title, y_label) in zip(axes, panels):
    sns.barplot(x='OPERATOR', y=metric, data=top_operators, palette='viridis', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Operator')
    ax.set_ylabel(y_label)
    # Slanted tick labels
    ax.tick_params(axis='x', labelrotation=30)
    ax.grid(True)

fig.tight_layout()
plt.show()

## 5. Is there any relationship between lateness (ADHERENCE) and headway deviation?

In [None]:
# Scatter of adherence against headway deviation with a fitted regression line.
fig, ax = plt.subplots(figsize=(10, 6))

# Scatter plot of ADHERENCE vs. HDWY_DEV
sns.scatterplot(x='ADHERENCE', y='HDWY_DEV', data=wego, color='#6F4B9E', ax=ax)

# Regression line, fitted on the same frame as the scatter so both layers
# always describe identical data
sns.regplot(x='ADHERENCE', y='HDWY_DEV', data=wego, scatter=False, color='red', ax=ax)

# Title and axis labels are set AFTER regplot: regplot relabels the axes with
# the raw column names, which would overwrite labels set earlier.
ax.set_title('Scatter Plot of Adherence vs. Headway Deviation')
ax.set_xlabel('Adherence (minutes)')
ax.set_ylabel('Headway Deviation (minutes)')

ax.grid(True)
plt.show()