In [None]:
# ===========================
# Imports & Config
# ===========================
import requests
from google.transit import gtfs_realtime_pb2
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import joblib
try:
    import geopandas as gpd
    from shapely.geometry import Point
    GEOPANDAS_AVAILABLE = True
except ImportError:
    GEOPANDAS_AVAILABLE = False

# ---- Main configuration ----
# Root folder for everything this script reads and writes: the collected CSVs,
# the "gtfsstatic" schedule folder, the generated plots/map and saved models.
# NOTE(review): machine-specific absolute Windows path - adjust per machine.
DATADIRECTORY =  r"C:\Users\hp\OneDrive\Desktop\DesktopFolders\data"
# GTFS-Realtime endpoints polled by rundatacollection().
GOCARYVEHICLEPOSITIONSURL = "https://www.gocarylive.org/GTFSRealtime/GTFSVehiclePositions.pb"
GOCARYTRIPUPDATESURL = "https://www.gocarylive.org/GTFSRealtime/GTFSTripUpdates.pb"
GOTRIANGLEVEHICLEPOSITIONSURL = "https://gotriangle.tripsparkhost.com/gtfsRealtime/GTFSVehiclePositions.pb"
# File name (resolved inside DATADIRECTORY) of the GeoJSON used by
# plotlayeredmap() for the optional population choropleth layer.
POPULATIONGEOJSONFILE = "CensusBlockGroups2020.geojson"

def log(msg: str):
    """Print ``msg`` to stdout, prefixed with ``[INFO]`` and the current local time."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[INFO] {stamp} - {msg}")

# ===========================
# Utility & Data Functions
# ===========================
def fetchgtfsrealtimedata(url: str):
    """Download a GTFS-Realtime feed from ``url`` and decode it.

    Returns the populated ``FeedMessage`` when the server answers with
    HTTP 200. Any other status code, or any exception raised while
    requesting or decoding, is logged and results in ``None``.
    """
    message = gtfs_realtime_pb2.FeedMessage()
    try:
        reply = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        # Guard clause: anything other than 200 is reported and abandoned.
        if reply.status_code != 200:
            log(f"Error fetching {url} - Status {reply.status_code}")
            return None
        message.ParseFromString(reply.content)
        return message
    except Exception as e:
        log(f"An error occurred fetching {url}: {e}")
        return None

def parsevehiclepositions(feed):
    """Collect vehicle id and coordinates per trip from a feed message.

    Returns a dict keyed by trip id; each value holds ``vehicleid``,
    ``latitude`` and ``longitude``. Entities without a ``vehicle`` field or
    with an empty trip id are skipped. A falsy ``feed`` yields an empty dict.
    """
    if not feed:
        return {}
    positions = {}
    for item in feed.entity:
        if not item.HasField('vehicle'):
            continue
        vehicle = item.vehicle
        if not vehicle.trip.trip_id:
            continue
        # A later entity for the same trip overwrites the earlier one.
        positions[vehicle.trip.trip_id] = dict(
            vehicleid=vehicle.vehicle.id,
            latitude=vehicle.position.latitude,
            longitude=vehicle.position.longitude,
        )
    return positions

def parsetripupdates(feed):
    """Extract one arrival delay per trip, in minutes, from a feed message.

    For every entity that has a ``trip_update`` with a non-empty trip id,
    the first stop-time update carrying an arrival delay is used; the delay
    is divided by 60.0 before being stored. Trips without any such update
    are left out. A falsy ``feed`` yields an empty dict.
    """
    delays = {}
    if not feed:
        return delays
    for item in feed.entity:
        if not (item.HasField('trip_update') and item.trip_update.trip.trip_id):
            continue
        update = item.trip_update
        # Only the first stop update that reports an arrival delay counts.
        firstdelay = next(
            (
                stop.arrival.delay
                for stop in update.stop_time_update
                if stop.HasField('arrival') and stop.arrival.HasField('delay')
            ),
            None,
        )
        if firstdelay is not None:
            delays[update.trip.trip_id] = firstdelay / 60.0
    return delays

def savedatatocsv(data, filepath):
    """Write ``data`` to ``filepath`` as CSV, appending if the file exists.

    ``data`` is handed to ``pd.DataFrame``. A new file gets a header row;
    when ``filepath`` is already a file the rows are appended without one.
    The index is never written.
    """
    frame = pd.DataFrame(data)
    appending = os.path.isfile(filepath)
    frame.to_csv(
        filepath,
        mode='a' if appending else 'w',
        header=not appending,
        index=False,
    )

def _buildrecords(positions, delays=None):
    """Turn parsed vehicle positions into CSV rows, attaching a delay per trip when known."""
    delays = delays or {}
    # Key order matters: rows are appended to an existing CSV without a header.
    return [
        {
            "timestamp": datetime.now().isoformat(),
            "tripid": tripid,
            **posdata,
            "delayminutes": delays.get(tripid),
        }
        for tripid, posdata in positions.items()
    ]


def rundatacollection(datadirectory, durationminutes=None):
    """Poll the GoCary and GoTriangle feeds every 30 seconds and append rows to CSV.

    GoCary vehicle positions are combined with GoCary trip-update delays and
    written to ``gocarydata.csv``; GoTriangle positions are written to
    ``gotriangledata.csv`` with an empty delay.

    Args:
        datadirectory: Folder that receives the CSV files. It is created if
            it does not exist yet.
        durationminutes: How long to collect. ``None`` (the default) runs
            until interrupted with Ctrl+C. ``0`` or a negative value stops
            before the first poll.

    Returns:
        None. Errors inside one polling round are logged and the loop goes on.
    """
    pollseconds = 30
    gocarycsv = os.path.join(datadirectory, "gocarydata.csv")
    gotrianglecsv = os.path.join(datadirectory, "gotriangledata.csv")
    try:
        # Without the folder every single write would fail, round after round.
        os.makedirs(datadirectory, exist_ok=True)
    except OSError as e:
        log(f"Unexpected error: {e}")
        return
    log(f"Starting data collection. Data will be saved in {datadirectory}")
    # "is not None" so that an explicit 0 is a real (already expired) limit
    # instead of silently meaning "run forever".
    limited = durationminutes is not None
    if limited:
        log(f"This will run for {durationminutes} minutes.")
    else:
        log("Press Ctrl+C to stop.")
    # monotonic() is immune to system clock adjustments while we wait.
    starttime = time.monotonic()
    try:
        while True:
            if limited and time.monotonic() - starttime >= durationminutes * 60:
                log("Specified duration reached. Stopping data collection.")
                break
            try:
                gcpositions = parsevehiclepositions(fetchgtfsrealtimedata(GOCARYVEHICLEPOSITIONSURL))
                gcupdates = parsetripupdates(fetchgtfsrealtimedata(GOCARYTRIPUPDATESURL))
                combinedgocarydata = _buildrecords(gcpositions, gcupdates)
                if combinedgocarydata:
                    savedatatocsv(combinedgocarydata, gocarycsv)
                    log(f"Saved {len(combinedgocarydata)} GoCary records.")
                gtpositions = parsevehiclepositions(fetchgtfsrealtimedata(GOTRIANGLEVEHICLEPOSITIONSURL))
                combinedgotriangledata = _buildrecords(gtpositions)
                if combinedgotriangledata:
                    savedatatocsv(combinedgotriangledata, gotrianglecsv)
                    log(f"Saved {len(combinedgotriangledata)} GoTriangle records.")
            except Exception as e:
                # One failed round must not end the whole collection run.
                log(f"Unexpected error: {e}")
            time.sleep(pollseconds)
    except KeyboardInterrupt:
        # Covers Ctrl+C at any point, including while sleeping after an error.
        log("Data collection stopped by user.")

# ===========================
# Data Analysis
# ===========================
def analyzeontimeperformance(datadirectory):
    """Join collected GoCary delays with the static schedule and save the result.

    Reads ``gocarydata.csv`` plus ``routes.txt`` and ``trips.txt`` from the
    ``gtfsstatic`` sub-folder of ``datadirectory``, keeps only rows that have
    a delay, attaches each trip's route short name and writes
    ``gocaryanalysis.csv``.

    The static files may use either the official GTFS field names
    (``trip_id``, ``route_id``, ``route_short_name``) or the underscore-free
    names this script works with.

    Args:
        datadirectory: Folder holding the realtime CSV and ``gtfsstatic``.

    Returns:
        None. Missing inputs and processing errors are logged.
    """
    log("--- Starting On-Time Performance Analysis ---")
    realtimecsv = os.path.join(datadirectory, "gocarydata.csv")
    staticfolder = os.path.join(datadirectory, "gtfsstatic")
    outputcsv = os.path.join(datadirectory, "gocaryanalysis.csv")
    if not os.path.exists(realtimecsv) or not os.path.isdir(staticfolder):
        log("Error: Missing required files in datadirectory. Ensure gocarydata.csv and gtfsstatic folder exist.")
        return

    # Official GTFS Schedule field name -> name used throughout this script.
    gtfsaliases = {
        "trip_id": "tripid",
        "route_id": "routeid",
        "route_short_name": "routeshortname",
    }
    # Ids and route names are labels, not numbers: reading them as text keeps
    # leading zeros and guarantees both sides of each merge share a dtype.
    textcols = {name: str for pair in gtfsaliases.items() for name in pair}

    def loadstatic(filename):
        frame = pd.read_csv(os.path.join(staticfolder, filename), dtype=textcols)
        # Rename only where needed so files already using the short names pass through.
        renames = {
            old: new
            for old, new in gtfsaliases.items()
            if old in frame.columns and new not in frame.columns
        }
        return frame.rename(columns=renames)

    try:
        realtimedf = pd.read_csv(realtimecsv, dtype={"tripid": str})
        realtimedf = realtimedf.dropna(subset=["delayminutes"])
        routesdf = loadstatic("routes.txt")
        tripsdf = loadstatic("trips.txt")
        tripswithroutes = pd.merge(tripsdf, routesdf, on="routeid")
        # Left join: realtime rows whose trip is unknown to the schedule are
        # kept, with an empty route name.
        analysisdf = pd.merge(realtimedf, tripswithroutes[["tripid", "routeshortname"]], on="tripid", how="left")
        finalcolumns = ["timestamp", "routeshortname", "vehicleid", "latitude", "longitude", "delayminutes", "tripid"]
        finaldf = analysisdf[[col for col in finalcolumns if col in analysisdf.columns]]
        finaldf.to_csv(outputcsv, index=False)
        log(f"--- Analysis complete! Results saved to {outputcsv} ---")
        log("--- Sample of Final Analysis Data ---")
        log(finaldf.head())
    except Exception as e:
        log(f"Error loading data files: {e}")

# ===========================
# Visualization
# ===========================
def visualizedata():
    """Log headline delay statistics and save three delay plots as PNG files.

    Reads ``gocaryanalysis.csv`` from ``DATADIRECTORY`` and writes
    ``delaydistribution.png`` (histogram), ``delaybyroute.png`` (box plot per
    route, ordered by median delay) and ``delayovertime.png`` (mean delay per
    one-hour bin) to the same folder.

    Returns:
        None. A missing or empty analysis file and any plotting error are logged.
    """
    log("--- Starting Data Visualization ---")
    analysisfile = os.path.join(DATADIRECTORY, "gocaryanalysis.csv")
    if not os.path.exists(analysisfile):
        log(f"Error: Analysis file not found at {analysisfile}. Run analyze mode first.")
        return
    try:
        df = pd.read_csv(analysisfile)
        if df.empty:
            log("Warning: The analysis file is empty. No data to visualize.")
            return
        df["timestamp"] = pd.to_datetime(df["timestamp"])

        log("--- Key Performance Statistics ---")
        avgdelay = df["delayminutes"].mean()
        log(f"Average Delay: {avgdelay:.2f} minutes")
        # A trip counts as on time between 2 minutes early and 5 minutes late.
        ontimethresholdupper, ontimethresholdlower = 5, -2
        ontime = df[(df["delayminutes"] >= ontimethresholdlower) & (df["delayminutes"] <= ontimethresholdupper)]
        log(f"On-Time Percentage: {len(ontime) / len(df) * 100:.1f}%")
        late = df[df["delayminutes"] > ontimethresholdupper]
        log(f"Late Percentage: {len(late) / len(df) * 100:.1f}%")
        early = df[df["delayminutes"] < ontimethresholdlower]
        log(f"Early Percentage: {len(early) / len(df) * 100:.1f}%")

        # Plot 1: distribution of delays.
        sns.set_style("whitegrid")
        plt.figure(figsize=(10, 6))
        sns.histplot(df["delayminutes"], bins=30, kde=True)
        plt.title("Distribution of GoCary Bus Delays", fontsize=16)
        plt.xlabel("Delay Minutes")
        plt.ylabel("Number of Observations")
        plt.xlim(-15, 20)
        plt.axvline(x=0, color="black", linestyle="--")
        plt.text(0.5, plt.ylim()[1]*0.9, "On-Time", color="black")
        distpath = os.path.join(DATADIRECTORY, "delaydistribution.png")
        plt.savefig(distpath)
        plt.close()
        log(f"Saved delay distribution plot to {distpath}")

        # Plot 2: delay per route, routes ordered by their median delay.
        plt.figure(figsize=(12, 8))
        order = df.groupby("routeshortname")["delayminutes"].median().sort_values().index
        sns.boxplot(x="routeshortname", y="delayminutes", data=df, order=order)
        plt.xticks(rotation=45)
        plt.tight_layout()
        routepath = os.path.join(DATADIRECTORY, "delaybyroute.png")
        plt.savefig(routepath)
        plt.close()
        log(f"Saved delay by route plot to {routepath}")

        # Plot 3: mean delay per one-hour bin across the collection period.
        # A Timedelta is used as the resample rule because the "H" string
        # alias is deprecated in recent pandas releases.
        # NOTE(review): the title says "by Hour of Day" but the bins are
        # consecutive calendar hours, not hours folded across days - confirm
        # which one is intended.
        plt.figure(figsize=(12, 6))
        hourlydelay = (
            df.set_index("timestamp")["delayminutes"]
            .resample(pd.Timedelta(hours=1))
            .mean()
        )
        hourlydelay.plot(marker="o")
        plt.title("Average Delay by Hour of Day")
        plt.axhline(y=0, color="black", linestyle="--")
        timepath = os.path.join(DATADIRECTORY, "delayovertime.png")
        plt.savefig(timepath)
        plt.close()
        log(f"Saved delay over time plot to {timepath}")
    except Exception as e:
        log(f"Unexpected error during visualization: {e}")

# ===========================
# Map Generation
# ===========================
def plotlayeredmap():
    """Build an interactive folium map of routes, stops and a proposed route.

    Reads ``shapes.txt``, ``routes.txt``, ``trips.txt`` and ``stops.txt`` from
    the ``gtfsstatic`` folder inside ``DATADIRECTORY`` and saves
    ``gocarylayeredmap.html`` there. Layers: an optional population
    choropleth (only when geopandas imported successfully and the GeoJSON
    file exists), one toggleable layer per route, all bus stops, and a
    hard-coded proposed route with its stops.

    The schedule files may use either the official GTFS field names
    (``shape_id``, ``route_id``, ``route_short_name``, ``route_long_name``,
    ``route_color``) or the underscore-free names this script works with.

    Returns:
        None. Missing inputs and any error while building the map are logged.
    """
    log("--- Starting Layered Map Generation ---")
    staticfolder = os.path.join(DATADIRECTORY, "gtfsstatic")
    shapesfile = os.path.join(staticfolder, "shapes.txt")
    routesfile = os.path.join(staticfolder, "routes.txt")
    tripsfile = os.path.join(staticfolder, "trips.txt")
    stopsfile = os.path.join(staticfolder, "stops.txt")
    populationfile = os.path.join(DATADIRECTORY, POPULATIONGEOJSONFILE)
    outputmapfile = os.path.join(DATADIRECTORY, "gocarylayeredmap.html")
    if not all(map(os.path.exists, [shapesfile, routesfile, tripsfile, stopsfile])):
        log("Error: Required GTFS file(s) not found.")
        return

    # Official GTFS Schedule field name -> name used throughout this script.
    gtfsaliases = {
        "shape_id": "shapeid",
        "route_id": "routeid",
        "route_short_name": "routeshortname",
        "route_long_name": "routelongname",
        "route_color": "routecolor",
    }
    # Read ids, names and colours as text: a colour such as "003366" parsed as
    # a number would lose its leading zeros and no longer be a valid hex code.
    textcols = {name: str for pair in gtfsaliases.items() for name in pair}

    def loadgtfs(path):
        frame = pd.read_csv(path, dtype=textcols)
        # Rename only where needed so files already using the short names pass through.
        renames = {
            old: new
            for old, new in gtfsaliases.items()
            if old in frame.columns and new not in frame.columns
        }
        return frame.rename(columns=renames)

    try:
        log("Loading GTFS schedule data...")
        shapesdf = loadgtfs(shapesfile)
        tripsdf = loadgtfs(tripsfile)
        routesdf = loadgtfs(routesfile)
        stopsdf = pd.read_csv(stopsfile)
        m = folium.Map(location=[35.7915, -78.7811], zoom_start=12, tiles="CartoDB positron")

        # --- Optional population choropleth ---
        if GEOPANDAS_AVAILABLE and os.path.exists(populationfile):
            log("Processing population data for choropleth layer...")
            populationgdf = gpd.read_file(populationfile)
            if populationgdf.crs != "EPSG:4326":
                populationgdf = populationgdf.to_crs("EPSG:4326")
            folium.Choropleth(
                geo_data=populationgdf,
                name="Population Density",
                data=populationgdf,
                columns=["GEOID", "POPULATION"],
                key_on="feature.properties.GEOID",
                fill_color="YlGn",
                fill_opacity=0.6,
                line_opacity=0.2,
                legend_name="Population per Census Block Group (2020)",
                show=False
            ).add_to(m)
        else:
            log("Population data or geopandas not available. Choropleth skipped.")

        # --- One layer per route, drawn from its shapes ---
        log("Processing individual bus route layers...")
        tripswithroutes = pd.merge(tripsdf, routesdf, on="routeid")
        # One representative trip per shape is enough to learn its route.
        shapetoroute = tripswithroutes.drop_duplicates("shapeid")
        # reindex() instead of plain selection: optional fields that are
        # absent from the feed become empty columns rather than a KeyError.
        routeinfo = shapetoroute.reindex(columns=["shapeid", "routeshortname", "routelongname", "routecolor"])
        shapesinfo = pd.merge(shapesdf, routeinfo, on="shapeid")
        for routename, group in shapesinfo.groupby("routeshortname"):
            layer = folium.FeatureGroup(name=f"Route {routename}", show=False)
            for shapeid, seg in group.groupby("shapeid"):
                seg = seg.sort_values("shape_pt_sequence")
                linepoints = list(zip(seg["shape_pt_lat"], seg["shape_pt_lon"]))
                rawcolor = seg.iloc[0]["routecolor"]
                # Fall back to black when the feed gives no colour for the route.
                color = f"#{rawcolor}" if pd.notnull(rawcolor) else "#000000"
                folium.PolyLine(
                    locations=linepoints,
                    color=color,
                    weight=4, opacity=0.9,
                    popup=f"<b>Route {routename}</b>"
                ).add_to(layer)
            layer.add_to(m)

        # --- All existing stops ---
        stopslayer = folium.FeatureGroup(name="All Bus Stops", show=True)
        for _, stop in stopsdf.iterrows():
            folium.CircleMarker(
                location=[stop["stop_lat"], stop["stop_lon"]],
                radius=2, color="#555555", weight=1,
                popup=f"<b>Stop:</b> {stop['stop_name']}",
            ).add_to(stopslayer)
        stopslayer.add_to(m)

        # --- Proposed route (hard-coded example data) ---
        proposedroutelayer = folium.FeatureGroup(name="PROPOSED West Cary Connector", show=True)
        # Example stops for new route (replace with your data as needed)
        proposedstops = [
            {"name": "Parkside Town Commons", "lat": 35.820, "lon": -78.852},
            {"name": "Green Level West at Mills Park Dr", "lat": 35.815, "lon": -78.850},
            {"name": "Green Level West at Carpenter Fire Station Rd", "lat": 35.805, "lon": -78.845}
        ]
        linecoords = [(stop["lat"], stop["lon"]) for stop in proposedstops]
        folium.PolyLine(
            locations=linecoords, color="crimson", weight=5, opacity=0.8, dash_array="10,5",
            popup="Proposed Route 9 West Cary Connector"
        ).add_to(proposedroutelayer)
        for stop in proposedstops:
            folium.Marker(
                location=[stop["lat"], stop["lon"]],
                popup=f"<b>Proposed Stop:</b> {stop['name']}",
                icon=folium.Icon(color="green", icon="plus")
            ).add_to(proposedroutelayer)
        proposedroutelayer.add_to(m)

        folium.LayerControl().add_to(m)
        m.save(outputmapfile)
        log(f"--- Success! Interactive layered map saved to {outputmapfile} ---")
    except Exception as e:
        log(f"Error generating map: {e}")

# ===========================
# ML Models
# ===========================
def trainandevaluatemodel():
    """Train a Random Forest delay regressor, report its MAE and save it.

    Reads ``gocaryanalysis.csv`` from ``DATADIRECTORY``, builds features from
    position, hour, day of week and one-hot encoded route, trains on 80% of
    the rows, logs the mean absolute error on the remaining 20%, stores the
    model as ``busdelaymodel.joblib`` and logs one example prediction.

    Returns:
        None. A missing analysis file, empty data and training errors are logged.
    """
    log("--- Starting Machine Learning Model Training ---")
    analysisfile = os.path.join(DATADIRECTORY, "gocaryanalysis.csv")
    modeloutputfile = os.path.join(DATADIRECTORY, "busdelaymodel.joblib")
    if not os.path.exists(analysisfile):
        log(f"Error: Analysis file not found at {analysisfile}. Run analysis first.")
        return
    try:
        # Route names are labels: read them as text, not numbers.
        df = pd.read_csv(analysisfile, dtype={"routeshortname": str})
        df = df.dropna(subset=["delayminutes", "routeshortname"])
        if df.empty:
            log("Error: No data after cleaning.")
            return
        # A numeric route column that contained gaps is written to CSV as
        # "6.0"; strip that suffix so the dummy column is "route_6" and the
        # example prediction below can actually select Route 6.
        df["routeshortname"] = (
            df["routeshortname"].str.strip().str.replace(r"\.0$", "", regex=True)
        )
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        df["hour"] = df["timestamp"].dt.hour
        df["dayofweek"] = df["timestamp"].dt.dayofweek
        # Float dummies keep every feature numeric regardless of pandas version.
        df = pd.get_dummies(df, columns=["routeshortname"], prefix="route", dtype=float)
        y = df["delayminutes"]
        routecols = [col for col in df.columns if col.startswith("route_")]
        featurecols = ["latitude", "longitude", "hour", "dayofweek"] + routecols
        X = df[featurecols]
        # Imported here so a missing scikit-learn is reported by the handler below.
        from sklearn.model_selection import train_test_split
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.metrics import mean_absolute_error
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
        model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, min_samples_leaf=5)
        model.fit(Xtrain, ytrain)
        predictions = model.predict(Xtest)
        mae = mean_absolute_error(ytest, predictions)
        log(f"Model Mean Absolute Error (MAE): {mae:.2f} minutes")
        joblib.dump(model, modeloutputfile)
        log(f"Model saved to {modeloutputfile}")

        # Example prediction: reuse the position of the first test row but
        # force the scenario to Route 6 on a Tuesday at 3 PM. Slicing with
        # [[0]] keeps a one-row DataFrame with the original column dtypes.
        sampledf = Xtest.iloc[[0]].copy()
        sampledf.loc[:, routecols] = 0.0
        hasroute6 = "route_6" in sampledf.columns
        if hasroute6:
            sampledf["route_6"] = 1.0  # e.g., Route 6
        sampledf["hour"] = 15
        sampledf["dayofweek"] = 1
        pred = model.predict(sampledf)[0]
        status = 'EARLY' if pred < -2 else 'LATE' if pred > 5 else 'ON TIME'
        # Be honest in the log when the data holds no Route 6 to select.
        label = "Route 6" if hasroute6 else "no specific route"
        log(f"Sample Prediction ({label}, Tue 3 PM): {pred:.2f} minutes {status}")
    except Exception as e:
        log(f"Error during ML model training: {e}")

def trainlightgbmmodel():
    """Train a LightGBM delay regressor, report MAE and R-squared, and save it.

    Reads ``gocaryanalysis.csv`` from ``DATADIRECTORY``, builds features from
    position, fractional time of day, day of week and the route as a
    categorical column, trains on 80% of the rows, logs the metrics on the
    remaining 20% and stores the model as ``busdelaylightgbmmodel.joblib``.

    Returns:
        None. A missing analysis file, empty data and training errors
        (including a missing ``lightgbm`` package) are logged.
    """
    log("--- Starting LightGBM Model Training ---")
    analysisfile = os.path.join(DATADIRECTORY, "gocaryanalysis.csv")
    modeloutputfile = os.path.join(DATADIRECTORY, "busdelaylightgbmmodel.joblib")
    if not os.path.exists(analysisfile):
        log(f"Error: Analysis file not found at {analysisfile}. Run analysis first.")
        return
    try:
        # Imported here so a missing package is reported by the handler below.
        import lightgbm as lgb
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_absolute_error, r2_score
        df = pd.read_csv(analysisfile).dropna(subset=["delayminutes", "routeshortname"])
        # Same guard as the Random Forest trainer: stop with a clear message
        # instead of failing later inside train_test_split.
        if df.empty:
            log("Error: No data after cleaning.")
            return
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        df["hour"] = df["timestamp"].dt.hour
        df["minute"] = df["timestamp"].dt.minute
        df["dayofweek"] = df["timestamp"].dt.dayofweek
        # Hour plus fraction of the hour, e.g. 15:30 -> 15.5.
        df["timeofday_numeric"] = df["hour"] + df["minute"] / 60.0
        df["routeshortname"] = df["routeshortname"].astype("category")
        y = df["delayminutes"]
        X = df[["latitude", "longitude", "timeofday_numeric", "dayofweek", "routeshortname"]]
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
        lgbm = lgb.LGBMRegressor(random_state=42)
        lgbm.fit(Xtrain, ytrain, categorical_feature=['routeshortname'])
        predictions = lgbm.predict(Xtest)
        mae = mean_absolute_error(ytest, predictions)
        r2 = r2_score(ytest, predictions)
        log(f"LightGBM MAE: {mae:.2f} min, R-squared: {r2:.2f}")
        joblib.dump(lgbm, modeloutputfile)
        log(f"LightGBM model saved to {modeloutputfile}")
    except Exception as e:
        log(f"Error during LightGBM training: {e}")

# ===========================
# Main Block & Menu
# ===========================
if __name__ == "__main__":
    # Interactive entry point: show the menu, read one choice and run it.
    print("What would you like to do?")
    print("1. Collect Data Only (runs until you stop it)")
    print("2. Analyze Existing Data Only")
    print("3. Collect Data for a specific time, then Analyze")
    print("4. Visualize Analysis Results")
    print("5. Generate Layered Interactive Map")
    print("6. Train and Evaluate ML Model (Random Forest)")
    print("7. Train and Evaluate ML Model (LightGBM)")
    # strip() so that stray whitespace around the digit is not rejected.
    choice = input("Enter your choice (1-7): ").strip()
    if choice == "1":
        rundatacollection(DATADIRECTORY)
    elif choice == "2":
        analyzeontimeperformance(DATADIRECTORY)
    elif choice == "3":
        # Only the number parsing is guarded: a ValueError raised later by
        # collection or analysis must not be reported as bad user input.
        try:
            duration = int(input("How many minutes do you want to collect data for? "))
        except ValueError:
            log("Invalid input: Please enter a number.")
        else:
            rundatacollection(DATADIRECTORY, durationminutes=duration)
            analyzeontimeperformance(DATADIRECTORY)
    elif choice == "4":
        visualizedata()
    elif choice == "5":
        plotlayeredmap()
    elif choice == "6":
        trainandevaluatemodel()
    elif choice == "7":
        trainlightgbmmodel()
    else:
        print("Invalid choice. Please enter a number between 1 and 7.")


What would you like to do?
1. Collect Data Only (runs until you stop it)
2. Analyze Existing Data Only
3. Collect Data for a specific time, then Analyze
4. Visualize Analysis Results
5. Generate Layered Interactive Map
6. Train and Evaluate ML Model (Random Forest)
7. Train and Evaluate ML Model (LightGBM)
