# GTFS Data Analysis

This notebook analyzes the TGSRTC GTFS data.

## Setup
Ensure you have the required libraries installed.

In [None]:
# Uncomment to install the dependencies into this kernel's environment:
# %pip install pandas folium matplotlib

In [None]:
import os

import folium
import matplotlib.pyplot as plt
import pandas as pd
from folium.plugins import MarkerCluster

# Set plot style: apply matplotlib's 'ggplot' style sheet to every figure created below
plt.style.use('ggplot')

## 1. Load Data

In [None]:
DATA_DIR = '.'  # Folder containing the GTFS text files (current directory)

# Logical table name -> file name; each table is stored as "<name>.txt".
files = {
    table: f'{table}.txt'
    for table in (
        'agency',
        'calendar',
        'routes',
        'shapes',
        'stop_times',
        'stops',
        'trips',
    )
}

# Identifier columns are forced to str to avoid mixed-type inference.
# The same mapping is handed to read_csv for every file.
dtype_dict = {'trip_id': str, 'route_id': str, 'stop_id': str, 'service_id': str}

# Loaded DataFrames keyed by logical table name; files missing on disk are skipped.
dfs = {}

for name, filename in files.items():
    path = os.path.join(DATA_DIR, filename)

    # Guard clause: report the missing file and move on to the next table.
    if not os.path.exists(path):
        print(f"Warning: {filename} not found.")
        continue

    print(f"Loading {name}...")
    dfs[name] = pd.read_csv(path, dtype=dtype_dict)
    print(f"  {name}: {dfs[name].shape}")


## 2. Data Cleaning

GTFS times can exceed 24:00:00 (e.g., 25:30:00 means 1:30 AM the next day). We need to handle this for analysis.

In [None]:
def convert_gtfs_time(time_str):
    """
    Turn a GTFS clock string into a pandas Timedelta.

    GTFS encodes times as "HH:MM:SS" where the hour field may be 24 or
    more, so the value is returned as a duration rather than a
    time-of-day.

    Parameters
    ----------
    time_str : str
        Colon-separated hours, minutes and seconds, e.g. "25:30:00".

    Returns
    -------
    pd.Timedelta or None
        The parsed duration. None is returned when the input is missing
        (as judged by ``pd.isna``) or cannot be parsed; in the latter case
        a message describing the failure is printed.
    """
    if pd.isna(time_str):
        return None

    try:
        hours, minutes, seconds = map(int, time_str.split(':'))
        delta = pd.Timedelta(hours=hours, minutes=minutes, seconds=seconds)
    except Exception as err:
        # Best effort: report the bad value and treat it as missing.
        print(f"Error converting {time_str}: {err}")
        return None

    return delta

if 'stop_times' in dfs:
    print("Converting stop_times arrival_time... (this may take a moment)")
    stop_times = dfs['stop_times']

    # Row-wise parse of every time string with convert_gtfs_time; this is the slow step.
    stop_times['arrival_time_dt'] = stop_times['arrival_time'].apply(convert_gtfs_time)
    stop_times['departure_time_dt'] = stop_times['departure_time'].apply(convert_gtfs_time)

    # Normalized hour of day (0-23) for frequency analysis:
    # whole hours since the start of the service day, wrapped at 24.
    hour = stop_times['arrival_time_dt'].dt.total_seconds() // 3600 % 24

    # GTFS permits a blank arrival_time on non-timepoint stops, and unparseable values
    # also come back missing. A plain astype(int) raises on those NaNs, so fall back to
    # the nullable Int64 dtype only when needed; fully populated feeds keep plain int.
    if hour.isna().any():
        stop_times['hour'] = hour.astype('Int64')
    else:
        stop_times['hour'] = hour.astype(int)

    print("Conversion complete.")

## 3. Visualization: Stops Map

Visualizing stops to understand the coverage.

In [None]:
# The map is shown by the bare `m` on the last line of this cell. That expression must be
# at the top level: Jupyter only auto-displays a trailing top-level expression, so an `m`
# nested inside the `if` block is never rendered. `m` stays None when no map is built.
m = None

if 'stops' in dfs:
    # Keep only stops that have both coordinates.
    stops_df = dfs['stops'].dropna(subset=['stop_lat', 'stop_lon'])

    if stops_df.empty:
        # The mean of an empty column is NaN, which is not a usable map centre.
        print("No stops with valid coordinates; skipping map.")
    else:
        # Centre the map on the average stop position.
        mean_lat = stops_df['stop_lat'].mean()
        mean_lon = stops_df['stop_lon'].mean()

        m = folium.Map(location=[mean_lat, mean_lon], zoom_start=11)

        print(f"Plotting {len(stops_df)} stops...")

        # Every stop is drawn as a small circle inside one marker cluster.
        marker_cluster = MarkerCluster().add_to(m)

        # Walk the three needed columns in lockstep instead of building a Series per row.
        for lat, lon, stop_name in zip(
            stops_df['stop_lat'], stops_df['stop_lon'], stops_df['stop_name']
        ):
            folium.CircleMarker(
                location=[lat, lon],
                radius=3,
                color='blue',
                fill=True,
                popup=stop_name
            ).add_to(marker_cluster)

        # Also write a standalone HTML copy next to the notebook.
        m.save('stops_map.html')

m