# Exploratory Data Analysis

This notebook demonstrates how to load all processed real-time GTFS files and perform a basic exploratory analysis.

In [1]:
from pathlib import Path

import pandas as pd

from metro_disruptions_intelligence.processed_reader import load_rt_dataset

In [2]:
project_root = Path.cwd()
if not (project_root / "data").exists():
    project_root = project_root.parent
processed_rt = project_root / "data" / "processed_final" / "rt"

In [3]:
print(processed_rt)

c:\Users\Luis.ParraMorales\GitProjects\metro_disruptions_intelligence\data\processed_final\rt


In [None]:
df = load_rt_dataset(processed_root = processed_rt, output_file =  processed_rt / "all_feeds.parquet")
df.head()

  df = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)


Unnamed: 0,snapshot_timestamp,alert_entity_id,active_period_start,active_period_end,agency_id,route_id,direction_id,cause,effect,header_text,...,arrival_delay,departure_delay,latitude,longitude,bearing,speed,current_stop_sequence,current_status,congestion_level,occupancy_status
0,1741277966,a2eb7e5f-419b-5736-b012-17f03aebc060,1742209000.0,1742224000.0,SMNW,SMNW_M1,1,9,6,Metro services do not run between Central and ...,...,,,,,,,,,,
1,1741277966,a2eb7e5f-419b-5736-b012-17f03aebc060,1742209000.0,1742224000.0,SMNW,SMNW_M1,0,9,6,Metro services do not run between Central and ...,...,,,,,,,,,,
2,1741277966,a2eb7e5f-419b-5736-b012-17f03aebc060,1742295000.0,1742310000.0,SMNW,SMNW_M1,1,9,6,Metro services do not run between Central and ...,...,,,,,,,,,,
3,1741277966,a2eb7e5f-419b-5736-b012-17f03aebc060,1742295000.0,1742310000.0,SMNW,SMNW_M1,0,9,6,Metro services do not run between Central and ...,...,,,,,,,,,,
4,1741277966,a2eb7e5f-419b-5736-b012-17f03aebc060,1742381000.0,1742396000.0,SMNW,SMNW_M1,1,9,6,Metro services do not run between Central and ...,...,,,,,,,,,,


In [9]:
df["feed_type"].value_counts()

feed_type
trip_updates         87989792
alerts                3213800
vehicle_positions     1662564
Name: count, dtype: int64

In [14]:
print(df.columns)

Index(['snapshot_timestamp', 'alert_entity_id', 'active_period_start',
       'active_period_end', 'agency_id', 'route_id', 'direction_id', 'cause',
       'effect', 'header_text', 'description_text', 'url', 'year', 'month',
       'day', 'feed_type', 'trip_id', 'start_time', 'start_date', 'vehicle_id',
       'stop_sequence', 'stop_id', 'arrival_time', 'departure_time',
       'arrival_delay', 'departure_delay', 'latitude', 'longitude', 'bearing',
       'speed', 'current_stop_sequence', 'current_status', 'congestion_level',
       'occupancy_status'],
      dtype='object')


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92866156 entries, 0 to 92866155
Data columns (total 34 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   snapshot_timestamp     int64  
 1   alert_entity_id        object 
 2   active_period_start    float64
 3   active_period_end      float64
 4   agency_id              object 
 5   route_id               object 
 6   direction_id           int64  
 7   cause                  object 
 8   effect                 object 
 9   header_text            object 
 10  description_text       object 
 11  url                    object 
 12  year                   int64  
 13  month                  int64  
 14  day                    int64  
 15  feed_type              object 
 16  trip_id                object 
 17  start_time             object 
 18  start_date             object 
 19  vehicle_id             object 
 20  stop_sequence          float64
 21  stop_id                object 
 22  arrival_time    

### Exploring missing columns on parquets

In [None]:
from pathlib import Path
import pyarrow.parquet as pq
from metro_disruptions_intelligence.processed_reader import (
    compose_path,
    discover_all_snapshot_minutes,
)

# processed_rt should already point to your processed realtime directory
# processed_rt = project_root / "data" / "processed_final" / "rt"

minutes = discover_all_snapshot_minutes(processed_rt)
missing = {feed: [] for feed in ["alerts", "trip_updates", "vehicle_positions"]}

for ts in minutes:
    for feed in missing:
        path = compose_path(ts, processed_rt, feed)
        if not path.exists():
            continue
        schema = pq.read_schema(path)
        if "snapshot_timestamp" not in schema.names:
            missing[feed].append(path)

for feed, paths in missing.items():
    print(f"\n{feed} ({len(paths)} files missing snapshot_timestamp)")
    for p in paths:
        print(" ", p)

In [None]:
minutes = discover_all_snapshot_minutes(processed_rt)
missing_columns = []
null_values = []

for ts in minutes:
    file = compose_path(ts, processed_rt, "trip_updates")
    if not file.exists():
        continue

    df = pd.read_parquet(file)

    required = {"route_id", "direction_id"}
    cols_missing = required - set(df.columns)

    if cols_missing:
        missing_columns.append((file, sorted(cols_missing)))
        continue

    if df[["route_id", "direction_id"]].isna().any().any():
        null_values.append(file)

print("Files missing columns:", missing_columns)
print("Files with null route/direction_id:", null_values)