## If there are service level disparities, are there differences in the racial characteristics of the people most impacted?

This section looks to help expand on our third question: 
- Are there disparities in the service levels of different routes (which lines are late more often than others)? 

To do so we will be looking at racial data from the [2020 Census Tracts in Boston](https://data.boston.gov/dataset/2020-census-tracts-in-boston)

In [2]:
import pandas as pd

In [5]:
# census raw data
file_path = "data/census-tract-data.csv"  # Update this if needed
df = pd.read_csv(file_path)

print("Original Columns:", df.columns.tolist())
df.columns = df.columns.str.lower()


# rename columns
column_mapping = {
    "geoid": "Census_Tract_ID",
    "geocode": "Tract_Code",
    "tract": "Tract_Number",
    "p0020001": "Total_Population",
    "p0020002": "White_Population",
    "p0020005": "Black_Population",
    "p0020006": "Native_Population",
    "p0020007": "Asian_Population",
    "p0020008": "Pacific_Population",
    "p0020009": "Other_Race_Population",
    "p0020010": "Two_or_More_Races",
    "p0040002": "Hispanic_Population",
    "h0010001": "Total_Housing_Units",
    "h0010002": "Occupied_Housing_Units",
    "h0010003": "Vacant_Housing_Units"
}

# check for actual existing columns
existing_columns = {col: new_col for col, new_col in column_mapping.items() if col in df.columns}
df = df.rename(columns=existing_columns)

print("Renamed Columns:", df.columns.tolist())

# drop unecessary columns
drop_columns = ["fileid", "stusab", "sumlev", "region", "division", "state", "county", "cousub"]
df = df.drop(columns=[col for col in drop_columns if col in df.columns], errors="ignore")

# convert numeric columns to useful names
numeric_cols = list(existing_columns.values())
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# handle missing data columns
if "Total_Population" in df.columns:
    df = df.dropna(subset=["Total_Population"])
else:
    print("'Total_Population' column not found.")

# calculate demographic percentages
race_columns = []
if "White_Population" in df.columns:
    df["Percent_White"] = (df["White_Population"] / df["Total_Population"]) * 100
    race_columns.append("Percent_White")
if "Black_Population" in df.columns:
    df["Percent_Black"] = (df["Black_Population"] / df["Total_Population"]) * 100
    race_columns.append("Percent_Black")
if "Hispanic_Population" in df.columns:
    df["Percent_Hispanic"] = (df["Hispanic_Population"] / df["Total_Population"]) * 100
    race_columns.append("Percent_Hispanic")
if "Asian_Population" in df.columns:
    df["Percent_Asian"] = (df["Asian_Population"] / df["Total_Population"]) * 100
    race_columns.append("Percent_Asian")
if all(x in df.columns for x in ["Native_Population", "Pacific_Population", "Other_Race_Population", "Two_or_More_Races"]):
    df["Percent_Other"] = ((df["Native_Population"] + df["Pacific_Population"] +
                            df["Other_Race_Population"] + df["Two_or_More_Races"]) / df["Total_Population"]) * 100
    race_columns.append("Percent_Other")

df.fillna(0, inplace=True)

# normalize percentages and remove extremely low populations
if len(race_columns) > 0:
    df["Total_Percent"] = df[race_columns].sum(axis=1)
    df.loc[df["Total_Percent"] > 100, race_columns] = df.loc[df["Total_Percent"] > 100, race_columns].div(df["Total_Percent"], axis=0) * 100
    df = df.drop(columns=["Total_Percent"])

# 
df = df[df["Total_Population"] >= 10]

df[race_columns] = df[race_columns].round(2)

# save cleaned data
output_file = "cleaned_2020_census_data.csv"
df.to_csv(output_file, index=False)

print(f"Cleaned census data saved as '{output_file}'.")

Original Columns: ['FILEID', 'STUSAB', 'SUMLEV', 'GEOID', 'GEOCODE', 'REGION', 'DIVISION', 'STATE', 'COUNTY', 'COUSUB', 'TRACT', 'P0020001', 'P0020005', 'P0020006', 'P0020002', 'P0020008', 'P0020007', 'P0020009', 'P0020010', 'P0020011', 'P0040001', 'P0040005', 'P0040006', 'P0040002', 'P0040008', 'P0040007', 'P0040009', 'P0040010', 'P0040011', 'P0050001', 'P0050002', 'P0050003', 'P0050004', 'P0050005', 'P0050006', 'P0050007', 'P0050008', 'P0050009', 'P0050010', 'H0010001', 'H0010002', 'H0010003']
Renamed Columns: ['fileid', 'stusab', 'sumlev', 'Census_Tract_ID', 'Tract_Code', 'region', 'division', 'state', 'county', 'cousub', 'Tract_Number', 'Total_Population', 'Black_Population', 'Native_Population', 'White_Population', 'Pacific_Population', 'Asian_Population', 'Other_Race_Population', 'Two_or_More_Races', 'p0020011', 'p0040001', 'p0040005', 'p0040006', 'Hispanic_Population', 'p0040008', 'p0040007', 'p0040009', 'p0040010', 'p0040011', 'p0050001', 'p0050002', 'p0050003', 'p0050004', 'p0

In [None]:
import os
import asyncio
import time
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pyppeteer

# Set the environment variable early
os.environ["PYPPETEER_CHROME_EXECUTABLE"] = r"C:\Program Files\Google\Chrome\Application\chrome.exe"

# Monkey-patch pyppeteer.launch to force the executablePath parameter
_original_launch = pyppeteer.launch
async def patched_launch(*args, **kwargs):
    kwargs['executablePath'] = os.environ["PYPPETEER_CHROME_EXECUTABLE"]
    return await _original_launch(*args, **kwargs)
pyppeteer.launch = patched_launch

url = "https://www.ctps.org/dv/mbtasurvey2018/index.html#navButton"
session = HTMLSession()
response = session.get(url)

# Render the page and simulate clicking the different tabs.
click_script = "document.getElementById('oth_demo').click();" # Change this to the appropriate ID for the tab you want to click
response.html.render(script=click_script, timeout=20)

# Wait a bit for the fare data to load after the click
time.sleep(20) # Needed to change to 20 to load everything

# Save HTML for debugging
with open("rendered_tab.html", "w", encoding="utf-8") as f:
    f.write(response.html.html)

soup = BeautifulSoup(response.html.html, 'html.parser')
text_elements = soup.select("text.chartNum")
print("Found", len(text_elements), "text elements with class 'chartNum'")

data = []
for text_el in text_elements:
    classes = text_el.get("class", [])
    route = None
    # Look for the class that starts with 'r' (excluding "chartNum")
    for cls in classes:
        if cls != "chartNum" and cls.startswith("r"):
            route = cls[1:]  # Remove the "r" prefix
            break
    if route:
        percent = text_el.get_text(strip=True)
        data.append((route, percent))

# Sort the data
data_sorted = sorted(data, key=lambda tup: (0 if "line-all" in tup[0].lower() else 1, tup[0]))

# Write the sorted data
output_filename = "other_demographics_data.txt" # Change the filename as necessary
with open(output_filename, "w", encoding="utf-8") as file:
    for route, percent in data_sorted:
        file.write(f"Route {route}: {percent}\n")

print(f"Fare data saved to {output_filename}")