In [2]:
!pip install beautifulsoup4



In [3]:
import pandas as pd

In [4]:
from google.colab import files

In [6]:
file_path = '/content/drive/MyDrive/trimet_stopevents_2022-12-07.html'

In [7]:
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

In [8]:
with open(file_path, 'r') as file:
    soup = BeautifulSoup(file, 'html.parser')

In [9]:
all_records = []

trip_headers = soup.find_all('h2')

for heading in trip_headers:
    trip_id = heading.text.split()[-1]
    table = heading.find_next_sibling('table')
    if not table:
        continue

    headers = [th.text.strip() for th in table.find_all('th')]

    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) == len(headers):
            record = [cell.text.strip() for cell in cells]
            record_dict = dict(zip(headers, record))
            record_dict['trip_id'] = trip_id
            all_records.append(record_dict)

stops_df = pd.DataFrame(all_records)
stops_df['arrive_time'] = pd.to_numeric(stops_df['arrive_time'])

date = datetime(2022, 12, 7)
stops_df['tstamp'] = stops_df['arrive_time'].apply(lambda x: date + timedelta(seconds=x))

stops_df['vehicle_number'] = stops_df['vehicle_number'].astype(str)
stops_df['trip_id'] = stops_df['trip_id'].astype(str)
stops_df['location_id'] = stops_df['location_id'].astype(str)

stops_df['ons'] = pd.to_numeric(stops_df['ons'])
stops_df['offs'] = pd.to_numeric(stops_df['offs'])

In [10]:
stops_df = stops_df[['trip_id', 'vehicle_number', 'tstamp', 'location_id', 'ons', 'offs']]
print(f"The total no. of stop events : {len(stops_df)}")

The total no. of stop events : 93912


In [11]:
vehicles = stops_df['vehicle_number'].nunique()
print(f"The total no. of unique Vehicles : {(vehicles)}")

The total no. of unique Vehicles : 158


In [12]:
locations = stops_df['location_id'].nunique()
print(f"The total no. of unique Stop Locations : {(locations)}")

The total no. of unique Stop Locations : 4354


In [13]:
min = stops_df['tstamp'].min()
print(f"The minimun timestamp is noted as : {(min)}")

The minimun timestamp is noted as : 2022-12-07 04:02:29


In [14]:
max = stops_df['tstamp'].max()
print(f"The maximum timestamp is noted as : {(max)}")

The maximum timestamp is noted as : 2022-12-08 02:37:41


In [15]:
boarding_stats = (stops_df['ons'] >= 1).sum()
print(f"The total no. of stop events at which at least one passenger boarded : {boarding_stats}")

The total no. of stop events at which at least one passenger boarded : 19858


In [16]:
percentage = (boarding_stats / len(stops_df)) * 100
print(f"The percentage of stop events with at least one passenger boarding : {percentage}")

The percentage of stop events with at least one passenger boarding : 21.14532754067638


In [17]:
location_df = stops_df[stops_df['location_id'] == '6913']
print(f"The total no. of stops made at 6913 location are : {len(location_df)}")

The total no. of stops made at 6913 location are : 15


In [18]:
stop_stats = location_df['vehicle_number'].nunique()
print(f"The total no. of different buses stopped at 6913 location : {(stop_stats)}")

The total no. of different buses stopped at 6913 location : 5


In [19]:
boarding = (location_df['ons'] >= 1).sum()
print(f"The total No. of stops at 6913 location with at least one passenger boarding : {boarding}")

The total No. of stops at 6913 location with at least one passenger boarding : 2


In [20]:
percentages = (boarding  / len(location_df)) * 100
print(f"The percentage of stops at 6913 location did at least one passenger board : {percentages}")

The percentage of stops at 6913 location did at least one passenger board : 13.333333333333334


In [21]:
vehicle_df = stops_df[stops_df['vehicle_number'] == '4062']
print(f"The total no. of stops made by 4062 vehicle are : {len(vehicle_df)}")

The total no. of stops made by 4062 vehicle are : 68


In [22]:
board_stats = vehicle_df['ons'].sum()
print(f"The total no. of passengers boarding the 4062 vehicle are : {board_stats}")

The total no. of passengers boarding the 4062 vehicle are : 26


In [23]:
deboard_stats = vehicle_df['offs'].sum()
print(f"The total no. of passengers deboarding the 4062 vehicle are : {deboard_stats}")

The total no. of passengers deboarding the 4062 vehicle are : 26


In [24]:
board_stats_one = (vehicle_df['ons'] >= 1).sum()
percent = (board_stats_one  / len(vehicle_df)) * 100
print(f"The percentage of 4062 vehicle stop events did at least one passenger board : {percent}")

The percentage of 4062 vehicle stop events did at least one passenger board : 16.176470588235293


In [25]:
!pip install scipy



In [26]:
from scipy.stats import binomtest

In [27]:
count_events = len(stops_df)
count_bus_boarding_events = (stops_df['ons'] >= 1).sum()

event_percentage = count_bus_boarding_events / count_events
print(f"The percentage of stop events with boardings : {event_percentage}")

The percentage of stop events with boardings : 0.21145327540676379


In [28]:
vehicle_data = stops_df.groupby('vehicle_number').agg(all_stops=('ons', 'count'),boardings=('ons', lambda x: (x >= 1).sum())).reset_index()

vehicle_data['all_stops'] = vehicle_data['all_stops']
vehicle_data['boardings'] = vehicle_data['boardings']

vehicle_data['boarding_avg'] = vehicle_data['boardings'] / vehicle_data['all_stops']

vehicle_data['p_value'] = vehicle_data.apply(
    lambda row: binomtest(
        int(row['boardings']),int(row['all_stops']),event_percentage
        ).pvalue,
    axis=1
)

bus_bias = vehicle_data[vehicle_data['p_value'] < 0.05].sort_values('p_value')
print(f"The below is the result set for binomial test to determine which buses have biased “ons” data (p < 0.05) :")
print(bus_bias[['vehicle_number', 'boarding_avg', 'p_value']])


The below is the result set for binomial test to determine which buses have biased “ons” data (p < 0.05) :
    vehicle_number  boarding_avg   p_value
113           3915      0.173716  0.017249
70            3530      0.173611  0.028077
125           3963      0.167901  0.033011
103           3733      0.180536  0.043074
88            3634      0.171149  0.045715


In [39]:
import pandas as pd
from google.colab import files
from scipy.stats import ttest_ind

In [41]:
gps_df = pd.read_csv("/content/drive/MyDrive/trimet_relpos_2022-12-07.csv")

bus_biases = []

all_relpos = gps_df['RELPOS'].dropna().values

bus_data = gps_df.dropna(subset=['RELPOS']).groupby('VEHICLE_NUMBER')

for vehicle_id, group in bus_data:
    bus_relpos = group['RELPOS'].values

    t_stat, p_value = ttest_ind(bus_relpos, all_relpos, equal_var=False)

    if p_value < 0.005:
        bus_biases.append({
            'vehicle_number': vehicle_id,
            'total_gps': len(bus_relpos),
            'p_value': p_value
        })

biased_gps_df = pd.DataFrame(bus_biases)
biased_gps_df = biased_gps_df.sort_values('p_value')

print(f"The below is the result set for T-test to determine which buses have biased GPS data (p < 0.005) :")
print(biased_gps_df[[ 'vehicle_number', 'total_gps', 'p_value']])

The below is the result set for T-test to determine which buses have biased GPS data (p < 0.005) :
   vehicle_number  total_gps   p_value
0            3638      10968  0.000000
1            3804      12491  0.000000
3            4305       5770  0.000000
2            4024      12119  0.004008


In [42]:
from scipy.stats import chi2_contingency

total_offs = stops_df['offs'].sum()
total_ons = stops_df['ons'].sum()

vehicle_bias = []

all_vehicles = stops_df.groupby('vehicle_number').agg(bus_ons = ('ons', 'sum'), bus_offs = ('offs', 'sum')).reset_index()

for _, row in all_vehicles.iterrows():
    bus_ons = row['bus_ons']
    bus_offs = row['bus_offs']

    contingency_creation = [
        [bus_ons, bus_offs],
        [total_ons - bus_ons, total_offs - bus_offs]
    ]

    chi2, p_value, _, _ = chi2_contingency(contingency_creation)

    if p_value < 0.05:
        vehicle_bias.append({
            'vehicle_number': vehicle_id,
            'sum_ons': bus_ons,
            'sum_offs': bus_offs,
            'p_value': p_value
        })

biased_ons_offs_df = pd.DataFrame(vehicle_bias)
biased_ons_offs_df = biased_ons_offs_df.sort_values('p_value')

print(f"The below is the result set for Chi-squared test to determine which buses have biased ons and offs data (p < 0.005) : ")
print(biased_ons_offs_df[['vehicle_number', 'sum_ons', 'sum_offs', 'p_value']])

The below is the result set for Chi-squared test to determine which buses have biased ons and offs data (p < 0.005) : 
   vehicle_number  sum_ons  sum_offs   p_value
1            4305      379       322  0.018783
0            4305      517       457  0.030134
