Step 1: Getting the data into Python, and cleaning it.
- will need to write code to import and clean, then functionalize it.

Steps to clean data:

*Remaining balance:
- need to remove $ and ,
- convert to integer
pandas already did it

*location:
city names contain misspellings and stray characters
state names are not all abbreviated
some zip codes have postal codes
- start by assigning all blank values as missing  
- pull only the first part of the zip into a new col
- if original zip col is not missing look zip up in zippopotamus to return city and state, 
- if original zip is missing or blank, look up city and state
- if city and state are missing, return missing

Language:
-correct language blanks to missing

*DOB:
-assign DOBs after today (i.e. dates in the future) as NA

Marital status:
- assign blanks to missing

*Gender:
- assign blanks to missing

*Race:
- some values for white misspelled
- some values for American Indian misspelled
- if contains american indian, then American Indian or Alaska Native
 - if starts with W, then white
- assign blanks to missing

*Hispanic/Latino:
- some values misspelled
- non-hispanic or latino not consistent
- some values no
- assign blanks to missing
- Everything that starts with no should be assigned to non-hispanic or latino
- if it doesn't start with no and isn't missing, decline to answer, or non-hispanic, assign to Hispanic or Latino

*Sexual orientation:
- assign blanks and N/As to missing
- assign decline to "decline to answer"
- If it starts with st assign to straight

*Insurance type:
- if contains medicare or medicaid, assign to Medicare & Medicaid
- if starts with un then uninsured
- if missing or blank, assign missing

Household size:
- assign the row with 4602 to blank
- assign missings to blank?

*Household income:
- remove $ - and , assign as integers
- assign missings to blank
pandas already did it

*Distance round trip:
- take only numbers, assign text to missing

referral source:
- assign blanks to missing

*Amount:
 - take only numbers, remove $ - and ,
- assign blanks to missing

*payment method:
only text, assign responses with only numbers to missing

payable to:
Surely I don't have to do anything with this

*patient letter notified:
- assign na, n/a, missing, and blanks to No
- assign dates to Y

Application signed:
should be fine.

other:
make sure data types align with what's needed
- use type "object" to handle numerical and non-numerical data?
Bulleted cols need to be cleaned



In [None]:
import pandas as pd
import numpy as np
import requests
import os
import sys
import re

#function to fetch zip codes
def fetch_zip_info(zip_codes, timeout=10):
    """Fetch city, state, latitude, and longitude for zip codes.

    Each zip code is looked up with one GET request to
    ``https://api.zippopotam.us/us/<zip_code>``.

    Args:
        zip_codes: Iterable of zip codes to look up.
        timeout: Seconds to wait for each HTTP request. Without a timeout a
            stalled connection would block the whole cleaning run forever.

    Returns:
        dict mapping every input zip code to a dict with the keys 'City',
        'State', 'Latitude' and 'Longitude'. A non-200 response yields
        'Unknown' for city/state; a failed request or an unexpected payload
        yields 'Error'. Latitude/Longitude are None in both of those cases.
    """
    def _placeholder(label):
        # Same shape as a successful lookup so callers can always use .get().
        return {'City': label, 'State': label, 'Latitude': None, 'Longitude': None}

    zip_to_locale = {}
    for zip_code in zip_codes:
        try:
            response = requests.get(
                f"https://api.zippopotam.us/us/{zip_code}", timeout=timeout
            )
            if response.status_code != 200:
                zip_to_locale[zip_code] = _placeholder('Unknown')
                continue
            # Only the first place listed for the zip code is used.
            place = response.json()['places'][0]
            zip_to_locale[zip_code] = {
                'City': place['place name'],
                'State': place['state abbreviation'],
                'Latitude': float(place['latitude']),
                'Longitude': float(place['longitude']),
            }
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
            # Network failure, timeout, invalid JSON or a payload missing the
            # expected fields: record the failure and keep going (best effort).
            zip_to_locale[zip_code] = _placeholder('Error')
    return zip_to_locale

# Location the notebook has always exported to; the dashboard cell reads it back.
_LEGACY_EXPORT_PATH = r"C:\Users\Glen\Documents\ToolsForDataAnalysis\SemesterProject\cleaned_data.csv"


def _read_raw_data(filepath, sheet_name=None):
    """Load the raw sheet from an .xlsx workbook or a .csv file."""
    lowered = filepath.lower()
    if lowered.endswith('.xlsx'):
        # Context manager so the workbook handle is always released.
        with pd.ExcelFile(filepath) as xl:
            if sheet_name and sheet_name not in xl.sheet_names:
                raise ValueError(f"Sheet '{sheet_name}' not found. Available sheets: {xl.sheet_names}")
            return xl.parse(sheet_name if sheet_name else xl.sheet_names[0])
    if lowered.endswith('.csv'):
        return pd.read_csv(filepath)
    raise ValueError("Unsupported file format: Only .csv or .xlsx allowed.")


def _text_or_missing(series):
    """Return stripped strings with blank and NaN ('nan') entries set to 'Missing'."""
    return (
        series.astype(str).str.strip()
        .replace(r'^\s*$', 'Missing', regex=True)
        .replace('nan', 'Missing')
    )


def _currency_to_number(series):
    """Convert a money column to numbers rounded to cents, tolerating '$' and ','."""
    if not pd.api.types.is_numeric_dtype(series):
        # Text input (e.g. from CSV) may carry currency formatting that
        # to_numeric would otherwise coerce to NaN.
        series = series.map(lambda v: re.sub(r'[$,\s]', '', v) if isinstance(v, str) else v)
    return pd.to_numeric(series, errors='coerce').round(2)


def _normalize_race(value):
    """Map a lower-cased race entry to its canonical label."""
    if 'american indian' in value:
        return 'American Indian or Alaska Native'
    if value.startswith('w'):
        return 'White'
    if value in ('', 'nan'):
        return 'Missing'
    return value.title()


def _normalize_ethnicity(value):
    """Map a lower-cased Hispanic/Latino entry to its canonical label."""
    if value.startswith('no'):
        return 'Non-Hispanic or Latino'
    if value in ('nan', '', 'missing', 'decline to answer', 'non-hispanic'):
        return 'Missing'
    return 'Hispanic or Latino'


def _normalize_orientation(value):
    """Map a lower-cased sexual-orientation entry to its canonical label."""
    if value in ('decline', 'decline to answer'):
        return 'Decline to answer'
    if value.startswith('st'):
        return 'Straight'
    if value in ('n/a', '', 'nan'):
        return 'Missing'
    return value.title()


def _normalize_insurance(value):
    """Map a lower-cased insurance entry to its canonical label."""
    if 'medicare' in value or 'medicaid' in value:
        return 'Medicare & Medicaid'
    if value.startswith('un'):
        return 'Uninsured'
    if value in ('', 'nan'):
        return 'Missing'
    return value.title()


def _letter_notified(val):
    """Return 'Yes' when the entry parses as a date, otherwise 'No'."""
    val = str(val).strip().lower()
    if val in ('na', 'n/a', 'missing', '', 'nan'):
        return 'No'
    try:
        pd.to_datetime(val)
    except (ValueError, TypeError, OverflowError):
        return 'No'
    return 'Yes'


def clean_data(filepath, sheet_name=None, output_path=_LEGACY_EXPORT_PATH):
    """Clean the service learning data.

    Args:
        filepath: Path to the raw ``.xlsx`` or ``.csv`` file.
        sheet_name: Worksheet to read from an Excel file; the first sheet is
            used when omitted. Ignored for CSV input.
        output_path: Where to also write the cleaned data as CSV. The default
            is the notebook author's local path and is written only when its
            folder exists, so other machines neither fail nor get a stray
            file. Pass ``None`` to skip the export, or any other path to
            export there (parent folders are created).

    Returns:
        The cleaned DataFrame. Besides the normalised input columns it gains
        'Payment Submitted Date', 'Latitude', 'Longitude' and 'Date Notified'.

    Raises:
        ValueError: Unsupported file extension, or ``sheet_name`` not present
            in the workbook.
    """
    today = pd.Timestamp.today()
    data = _read_raw_data(filepath, sheet_name)

    # Payment Submitted?: keep any date in its own column and collapse dated
    # entries to a plain 'Yes' flag; non-date entries are left as they are.
    data['Payment Submitted Date'] = pd.to_datetime(data['Payment Submitted?'], errors='coerce')
    data['Payment Submitted?'] = data['Payment Submitted?'].apply(
        lambda x: x if pd.isna(pd.to_datetime(x, errors='coerce')) else 'Yes')

    # Grant Request Date
    data["Grant Req Date"] = pd.to_datetime(data["Grant Req Date"]).dt.strftime('%m/%d/%Y')

    # Zip, City, State: keep the first run of five digits, then let the zip
    # lookup supply city/state/coordinates.
    data['Pt Zip'] = (
        data['Pt Zip'].astype(str).str.strip().str.extract(r'(\d{5})')[0].fillna("Missing")
    )
    valid_zips = data.loc[data['Pt Zip'] != "Missing", 'Pt Zip'].unique()
    zip_to_locale = fetch_zip_info(valid_zips)

    def _lookup(field, default=None):
        # Zips that were never looked up (i.e. "Missing") fall back to `default`.
        return data['Pt Zip'].apply(lambda z: zip_to_locale.get(z, {}).get(field, default))

    data['Pt City'] = _lookup('City', 'Missing')
    data['Pt State'] = _lookup('State', 'Missing')
    data['Latitude'] = _lookup('Latitude')
    data['Longitude'] = _lookup('Longitude')

    # DOB: unparseable or future dates become NaT.
    data['DOB'] = pd.to_datetime(data['DOB'], errors='coerce')
    data.loc[data['DOB'] > today, 'DOB'] = pd.NaT

    # Categorical demographics
    data['Race'] = data['Race'].astype(str).str.strip().str.lower().apply(_normalize_race)
    data['Hispanic/Latino'] = (
        data['Hispanic/Latino'].astype(str).str.strip().str.lower().apply(_normalize_ethnicity)
    )
    # Single pass; the former second pass re-title-cased the canonical
    # 'Decline to answer' label into 'Decline To Answer'.
    data['Sexual Orientation'] = (
        data['Sexual Orientation'].astype(str).str.strip().str.lower().apply(_normalize_orientation)
    )
    data['Insurance Type'] = (
        data['Insurance Type'].astype(str).str.strip().str.lower().apply(_normalize_insurance)
    )
    data['Gender'] = _text_or_missing(data['Gender']).str.lower().str.title()
    marital = _text_or_missing(data['Marital Status']).str.lower().str.title()
    # Collapses spelling variants; the canonical label was previously
    # misspelled as 'Seperated'.
    data['Marital Status'] = marital.apply(lambda x: 'Separated' if x.startswith('Se') else x)

    # Household Size: implausible sizes (> 20) are treated as missing.
    household = pd.to_numeric(data['Household Size'], errors='coerce')
    data['Household Size'] = household.mask(household > 20)

    # Total Household Gross Monthly Income: keep digits and decimal point only.
    income_text = (
        data['Total Household Gross Monthly Income']
        .astype(str).str.replace(r'[^\d.]', '', regex=True)
    )
    data['Total Household Gross Monthly Income'] = pd.to_numeric(income_text, errors='coerce')

    # Distance roundtrip: first number in the cell, text-only cells become NaN.
    data['Distance roundtrip/Tx'] = pd.to_numeric(
        data['Distance roundtrip/Tx'].astype(str).str.extract(r'(\d+\.?\d*)')[0],
        errors='coerce'
    )

    # Referral Source (NaN previously survived as the literal string 'nan').
    data['Referral Source'] = _text_or_missing(data['Referral Source'])

    # Payment Method: letters only, upper-cased; entries left empty after
    # stripping (e.g. numbers only) count as missing.
    method = _text_or_missing(data['Payment Method'])
    method = method.str.replace(r'[^a-zA-Z\s]', '', regex=True).str.strip()
    data['Payment Method'] = method.replace('', 'Missing').str.upper()

    # Money columns
    data['Remaining Balance'] = _currency_to_number(data['Remaining Balance'])
    data['Amount'] = _currency_to_number(data['Amount'])

    # Patient Letter Notified: keep the date separately, flag becomes Yes/No.
    notified_col = 'Patient Letter Notified? (Directly/Indirectly through rep)'
    data['Date Notified'] = pd.to_datetime(data[notified_col], errors='coerce')
    data[notified_col] = data[notified_col].apply(_letter_notified)

    # Payable to: blank or whitespace-only strings become "Missing".
    data['Payable to:'] = data['Payable to:'].replace(r'^\s*$', "Missing", regex=True)

    # Application Signed?
    data['Application Signed?'] = data['Application Signed?'].fillna("Missing").str.upper()

    # Type of Assistance: anything starting with 'u' is Utilities; missing
    # values become 'Missing' rather than the literal 'Nan'.
    assistance = _text_or_missing(data['Type of Assistance (CLASS)']).str.lower()
    assistance = assistance.apply(lambda x: 'utilities' if x.startswith('u') else x)
    data['Type of Assistance (CLASS)'] = assistance.str.title()

    # Optional CSV export. The legacy default is honoured only where its
    # folder exists (the author's machine); explicit paths are always written.
    if output_path is not None:
        export_dir = os.path.dirname(output_path)
        legacy_unavailable = (
            output_path == _LEGACY_EXPORT_PATH and not os.path.isdir(export_dir)
        )
        if not legacy_unavailable:
            if export_dir:
                os.makedirs(export_dir, exist_ok=True)
            data.to_csv(output_path, index=False)

    return data

In [3]:
# Smoke test: run the cleaner against the raw de-identified workbook
# (local path on the author's machine) and display the resulting frame.
raw_workbook_path = "C:\\Users\\Glen\\Documents\\ToolsForDataAnalysis\\SemesterProject\\Raw Data and Dict\\UNO Service Learning Data Sheet De-Identified Version.xlsx"
clean_data(raw_workbook_path)



Unnamed: 0,Patient ID#,Grant Req Date,App Year,Remaining Balance,Request Status,Payment Submitted?,Reason - Pending/No,Pt City,Pt State,Pt Zip,Language,DOB,Marital Status,Gender,Race,Hispanic/Latino,Sexual Orientation,Insurance Type,Household Size,Total Household Gross Monthly Income,Distance roundtrip/Tx,Referral Source,Referred By:,Type of Assistance (CLASS),Amount,Payment Method,Payable to:,Patient Letter Notified? (Directly/Indirectly through rep),Application Signed?,Notes,Payment Submitted Date,Latitude,Longitude,Date Notified
0,180001,10/17/2018,1,1180.00,Approved,Yes,,Missing,Missing,Missing,Missing,NaT,Missing,Missing,Missing,Missing,Missing,Missing,,,,NCS,Dr. Natarajan/Lily Salinas,Medical Supplies/Prescription Co-Pay(S),320.00,MISSING,Missing,No,MISSING,,NaT,,,NaT
1,190001,01/03/2019,1,1428.39,Approved,Yes,,Missing,Missing,Missing,Missing,NaT,Missing,Missing,Missing,Missing,Missing,Missing,,,,NCS,Pam Owen/Sheri Shannon\n,Medical Supplies/Prescription Co-Pay(S),21.61,MISSING,Missing,No,MISSING,,NaT,,,NaT
2,190001,03/11/2019,1,1428.39,Approved,Yes,,Missing,Missing,Missing,Missing,NaT,Missing,Missing,Missing,Missing,Missing,Missing,,,,NCS,Teresa Pfister,Food/Groceries,50.00,GC,Missing,No,MISSING,,NaT,,,NaT
3,190002,05/20/2019,1,1400.00,Approved,Yes,,Missing,Missing,Missing,Missing,NaT,Missing,Missing,Missing,Missing,Missing,Missing,,,,NCS,AG/Susan Keith,Food/Groceries,100.00,GC,Missing,No,MISSING,,NaT,,,NaT
4,190003,05/22/2019,1,1425.00,Approved,Yes,,Missing,Missing,Missing,Missing,NaT,Missing,Missing,Missing,Missing,Missing,Missing,,,,NCS,AG/Kristi McHugh,Medical Supplies/Prescription Co-Pay(S),75.00,CC,Missing,No,MISSING,,NaT,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2287,240393,01/31/2025,2,1000.00,Pending,No,HS,Falls City,NE,68355,English,1960-09-23,Widowed,Female,Asian,Non-Hispanic or Latino,Straight,Uninsured,1.0,4000.0,100.0,CPN,,Gas,500.00,MISSING,,No,MISSING,Waiting on HS,NaT,40.0742,-95.5931,NaT
2288,240393,01/31/2025,2,1000.00,Pending,No,HS,Falls City,NE,68355,English,1960-09-23,Widowed,Female,Asian,Non-Hispanic or Latino,Straight,Uninsured,1.0,4000.0,100.0,CPN,,Food/Groceries,500.00,MISSING,,No,MISSING,Waiting on HS,NaT,40.0742,-95.5931,NaT
2289,240548,01/31/2025,2,1000.00,Pending,No,,Fremont,NE,68025,English,1962-04-03,Married,Male,White,Non-Hispanic or Latino,Straight,Private,2.0,2895.0,15.0,NCS,ALISA SEIDLER,Multiple,1068.56,MISSING,,No,MISSING,,NaT,41.4416,-96.4945,NaT
2290,250038,01/31/2025,1,1500.00,Pending,No,,Hastings,NE,68901,Spanish,1980-10-02,Single,Female,Other,Hispanic or Latino,Straight,Uninsured,2.0,918.0,2.0,Morrison Cancer Center,Kellie Sterkel-SW,Housing,1500.00,MISSING,,No,MISSING,,NaT,40.5877,-98.3911,NaT


In [None]:

def main():
    """Clean the file named by the first command-line argument.

    The cleaned data is written next to the input as ``<name>_CLEANED.csv``.
    Excel inputs are read from the "PA Log Sheet" worksheet.

    Raises:
        ValueError: No input file was given on the command line.
        FileNotFoundError: The given input file does not exist.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        raise ValueError("No input file provided. Usage: python clean_data_script.py <input_file>")

    input_file = cli_args[0]
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file '{input_file}' not found.")

    # Output sits beside the input, with the extension swapped for a suffix.
    stem, _extension = os.path.splitext(input_file)
    output_file = stem + "_CLEANED.csv"

    # Only workbooks have sheets; CSV input is read without a sheet name.
    sheet_name = None
    if input_file.endswith(".xlsx"):
        sheet_name = "PA Log Sheet"

    print(f"Reading from: {input_file}")
    cleaned_df = clean_data(input_file, sheet_name=sheet_name)

    print(f"Saving cleaned data to: {output_file}")
    cleaned_df.to_csv(output_file, index=False)
    print(f"✅ Cleaning completed: {input_file} -> {output_file}")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()

GitHub action steps

In [None]:
# Cleans a newly pushed raw data file and commits the cleaned CSV back to main.
name: Clean New Data

permissions:
  contents: write

on:
  push:
    branches: [ "main" ]
    paths:
      - '**/*.csv'
      - '**/*.xlsx'

jobs:
  clean-data:
    runs-on: ubuntu-latest

    steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Full history: a push can contain several commits, and the diff
        # below needs the pre-push commit to be present locally.
        fetch-depth: 0

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.x'

    - name: Install dependencies
      run: |
        pip install pandas numpy requests openpyxl

    - name: Detect changed file
      id: detect_file
      env:
        BEFORE_SHA: ${{ github.event.before }}
        AFTER_SHA: ${{ github.sha }}
      run: |
        # Fall back to the parent commit when the pre-push commit is unknown
        # (e.g. all-zero SHA on a newly created branch).
        if ! git cat-file -e "${BEFORE_SHA}^{commit}" 2>/dev/null; then
          BEFORE_SHA="${AFTER_SHA}~1"
        fi
        # Skip deleted files and outputs of a previous cleaning run.
        changed_file=$(git diff --name-only --diff-filter=d "$BEFORE_SHA" "$AFTER_SHA" | grep -E '\.(csv|xlsx)$' | grep -v '_CLEANED\.csv$' | head -n 1 || true)
        echo "Changed file: $changed_file"
        echo "CHANGED_FILE=$changed_file" >> "$GITHUB_ENV"

    - name: Run data cleaning script
      if: env.CHANGED_FILE != ''
      # The file name is read from the environment instead of being expanded
      # into the script text, so an unusual name cannot inject shell commands.
      run: |
        echo "Cleaning file: $CHANGED_FILE"
        python clean_data_script.py "$CHANGED_FILE"

    - name: Commit cleaned data
      if: env.CHANGED_FILE != ''
      run: |
        git config --global user.name 'github-actions[bot]'
        git config --global user.email 'github-actions[bot]@users.noreply.github.com'
        # The script writes its output beside the input file, which may be in
        # a subfolder, so add that exact path rather than a root-level glob.
        git add -- "${CHANGED_FILE%.*}_CLEANED.csv"
        # Commit and push only when the cleaned output actually changed.
        if ! git diff --cached --quiet; then
          git commit -m "Automated: Cleaned $CHANGED_FILE"
          git push
        fi

Creating Dashboard

In [None]:
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import glob
import os
from datetime import timedelta, datetime

# --- Data source -----------------------------------------------------------
# When run from the GitHub repository, combine every *_CLEANED.csv in the
# current directory instead of reading the local file below:
#cleaned_files = glob.glob("*_CLEANED.csv")
#print(f"Found cleaned files: {cleaned_files}")
#df_list = [pd.read_csv(f) for f in cleaned_files]
#db_data = pd.concat(df_list, ignore_index=True)

# Local cleaned export produced by the cleaning step.
db_data = pd.read_csv("C:\\Users\\Glen\\Documents\\ToolsForDataAnalysis\\SemesterProject\\cleaned_data.csv")
# Request dates are stored as text in the CSV; the date filter needs datetimes.
db_data["Grant Req Date"] = pd.to_datetime(db_data["Grant Req Date"])

# --- Page chrome -----------------------------------------------------------
st.set_page_config(
    page_title="Nebraska Cancer Specialists Hope Foundation Dashboard",
    layout="wide",
)
st.title("Nebraska Cancer Specialists Hope Foundation Dashboard")
st.logo(
    image="https://www.bricksrus.com/donorsite/images/logo-NCSHF.png",
    icon_image="https://ncshopefoundation.org/wp-content/uploads/2023/05/sun.webp",
)

# --- Sidebar: page navigation and request-date range ------------------------
page_names = [
    "Pending Applications",
    "Assistance Given by Demographics",
    "Assistance Delivery Duration",
    "Grant Utilization",
    "Executive Impact Summary",
]
with st.sidebar:
    page = st.radio("Select a Page", page_names)
    # The pickers are bounded by, and default to, the full range in the data.
    min_date = db_data['Grant Req Date'].min().date()
    max_date = db_data['Grant Req Date'].max().date()
    start_date = st.date_input("Start date", min_date, min_value=min_date, max_value=max_date)
    end_date = st.date_input("End date", max_date, min_value=min_date, max_value=max_date)

# Keep only requests inside the selected date range (inclusive on both ends).
request_day = db_data['Grant Req Date'].dt.date
db_data = db_data[(request_day >= start_date) & (request_day <= end_date)]

# Pending Applications Page - page 1
if page == "Pending Applications":
    st.header("Pending Applications")
    # Optional filter on "Application Signed?"; it applies to everything below.
    signature_options = ["All"] + list(pd.unique(db_data["Application Signed?"]))
    signed_filter = st.selectbox("Filter by Signature Status:", signature_options)
    if signed_filter != "All":
        db_data = db_data[db_data["Application Signed?"] == signed_filter]

    db_data_pending = db_data[db_data["Request Status"] == "Pending"]

    # KPI values: distinct patients per signature status and total requested.
    signature_status = db_data_pending["Application Signed?"]
    pending_apps = db_data_pending["Patient ID#"].nunique()
    signed_apps = db_data_pending[signature_status == "YES"]["Patient ID#"].nunique()
    missing_apps = db_data_pending[signature_status == "MISSING"]["Patient ID#"].nunique()
    unsigned_apps = db_data_pending[signature_status == "NO"]["Patient ID#"].nunique()
    total_amt_pending = db_data_pending["Amount"].sum()

    kpi1, kpi2, kpi3, kpi4, kpi5 = st.columns(5)
    kpi1.metric(label="Pending Applications", value=pending_apps)
    kpi2.metric(label="Signed Pending Applications", value=signed_apps)
    kpi3.metric(label="Pending Unsigned Applications", value=unsigned_apps)
    kpi4.metric(label="Pending Applications Missing Signature Status", value=missing_apps)
    # Format instead of calling .round(): the sum of an empty selection can be
    # a plain int, which has no .round() method.
    kpi5.metric(label="Pending Amount Requested", value=f"${total_amt_pending:,.0f}")

    # Columns shown in the pending applications table
    pg1_df = db_data_pending[
        ["Patient ID#", "Grant Req Date", "Application Signed?", "Pt Zip", "Insurance Type",
         "Total Household Gross Monthly Income", "Type of Assistance (CLASS)", "Amount",
         "Referral Source", "Notes"]]

    # Requested amount per assistance type
    p1_bar_data = db_data_pending.groupby("Type of Assistance (CLASS)")["Amount"].sum().reset_index()
    p1_bar = st.bar_chart(data=p1_bar_data, x="Type of Assistance (CLASS)", y="Amount",
                          x_label="Assistance Type", y_label="Amount Requested", horizontal=False)
    # Pending applications table
    pg1 = st.data_editor(pg1_df)

# Assistance amount by demographic factors - page 2
elif page == "Assistance Given by Demographics":
    st.header("Assistance Given by Demographics")
    # Options offered to the user
    demo = [
        'Gender', 'State', 'Zip Code', 'Hispanic or Latino', 'Sexuality', 'Race', 'Insurance Type',
        'Household Gross Monthly Income', 'Marital Status', 'Household Size', 'Age']
    # Options that are a plain "sum of Amount per value of one column"
    simple_breakdowns = {
        "Gender": "Gender",
        "State": "Pt State",
        "Hispanic or Latino": "Hispanic/Latino",
        "Sexuality": "Sexual Orientation",
        "Race": "Race",
        "Insurance Type": "Insurance Type",
        "Marital Status": "Marital Status",
        "Household Size": "Household Size",
    }

    demo_select = st.selectbox("Select Demographic", demo)

    if demo_select in simple_breakdowns:
        assistance_totals = db_data.groupby(simple_breakdowns[demo_select])["Amount"].sum()
        st.bar_chart(assistance_totals)
        st.write(assistance_totals)

    elif demo_select == "Zip Code":
        st.header("Assistance by Zip Code")

        # Total Amount per zip code (shown as a table under the map)
        zip_code_assistance = db_data.groupby("Pt Zip")["Amount"].sum()

        # Copy so the numeric conversion does not write into a slice of db_data.
        map_data = db_data[['Latitude', 'Longitude', 'Amount', 'Pt Zip']].copy()
        map_data["Amount"] = pd.to_numeric(map_data["Amount"], errors="coerce")
        # Rows without a zip, amount or coordinates cannot be plotted.
        map_data = map_data.dropna(subset=["Pt Zip", "Amount", "Latitude", "Longitude"])

        fig = px.scatter_geo(
            map_data, lat="Latitude", lon="Longitude", color="Amount", hover_name="Pt Zip",
            hover_data=["Amount"], color_continuous_scale="Viridis", projection="albers usa",
            title="Assistance Amounts by Zip Code",)
        fig.update_geos(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="lightgray")
        fig.update_layout(
            geo=dict(projection_type="albers usa", showland=True, landcolor="lightgray", subunitcolor="gray",),
            title_text="Assistance Amounts by Zip Code", coloraxis_colorbar_title="Assistance Amount")

        st.plotly_chart(fig)
        st.write(zip_code_assistance)

    else:
        # 'Household Gross Monthly Income' and 'Age' are offered above but have
        # no chart yet; say so instead of rendering an empty page.
        st.info(f"No breakdown is available for '{demo_select}' yet.")

# Time to Support - page 3
elif page == "Assistance Delivery Duration":
    st.header("Assistance Delivery Duration")
    # Whole days between the grant request and the payment submission. Kept in
    # local series so nothing is assigned into the filtered db_data slice.
    payment_dates = pd.to_datetime(db_data["Payment Submitted Date"], errors='coerce')
    request_dates = pd.to_datetime(db_data["Grant Req Date"], errors='coerce')
    time_to_assistance = (payment_dates - request_dates).dt.days.rename("Time to Assistance")

    # Card for the average; the mean is NaN when no row has both dates.
    average_days = time_to_assistance.mean()
    kpi9_data = "N/A" if pd.isna(average_days) else round(float(average_days), 2)
    kpi9 = st.columns(1)
    kpi9[0].metric(label="Average Assistance Delivery Duration", value=kpi9_data)
    # Number of requests per duration in days
    st.bar_chart(time_to_assistance.value_counts().sort_index())

# Grant Utilization - page 4
elif page == "Grant Utilization":
    st.header("Grant Utilization")
    # Rows with a positive remaining balance are counted as underutilized.
    ug_db = db_data[db_data["Remaining Balance"] > 0]
    underutilized_grants = ug_db["Patient ID#"].nunique()
    kpi10 = st.columns(1)
    kpi10[0].metric(label="Number of Underutilized Grants", value=underutilized_grants)
    # Amount per assistance type among those rows
    ug_db_grouped = ug_db.groupby("Type of Assistance (CLASS)")["Amount"].sum().reset_index()
    st.bar_chart(data=ug_db_grouped, x="Type of Assistance (CLASS)", y="Amount",
                 x_label="Assistance Type", y_label="Amount Requested", horizontal=False)

# Executive Impact Summary Page - page 5
elif page == "Executive Impact Summary":
    st.header("Executive Impact Summary")
    # Approved requests only; copy so the derived column is not written into a
    # slice of db_data.
    pg4_df = db_data[db_data["Request Status"] == "Approved"].copy()
    pg4_df["City, State"] = pg4_df["Pt City"] + ", " + pg4_df["Pt State"].fillna('')

    total_amount = pg4_df["Amount"].sum()
    num_unique_patients = pg4_df["Patient ID#"].nunique()
    # No approved patients in the selected range: avoid dividing by zero.
    avg_award = total_amount / num_unique_patients if num_unique_patients else 0.0

    kpi6, kpi7, kpi8 = st.columns(3)
    kpi6.metric(label="Total Assistance Awarded", value=f"${total_amount:,.0f}")
    kpi7.metric(label="Total Applicants Awarded", value=num_unique_patients)
    kpi8.metric(label="Average Assistance Amount per Patient", value=f"${avg_award:,.2f}")

    # Total amount by insurance type
    p4_bar1_data = pg4_df.groupby("Insurance Type")["Amount"].sum().reset_index()
    st.subheader("Assistance Given by Insurance Type")
    p4_bar1 = st.bar_chart(data=p4_bar1_data, x="Insurance Type", y="Amount")

    # Average amount by assistance type
    pg4_df_grouped_avg = pg4_df.groupby("Type of Assistance (CLASS)")["Amount"].mean().reset_index()
    st.subheader("Average Assistance Amount by Type")
    st.bar_chart(data=pg4_df_grouped_avg, x="Type of Assistance (CLASS)", y="Amount",
                 x_label="Assistance Type", y_label="Amount Approved")

    # Total amount by assistance type
    p4_bar2_data = pg4_df.groupby("Type of Assistance (CLASS)")["Amount"].sum().reset_index()
    st.subheader("Total Assistance by Type")
    p4_bar2 = st.bar_chart(data=p4_bar2_data, x="Type of Assistance (CLASS)", y="Amount",
                           x_label="Assistance Type", y_label="Amount Approved")

    # Totals per "City, State", largest first; both charts below slice this.
    city_totals = (
        pg4_df.groupby("City, State")["Amount"].sum().reset_index()
        .sort_values(by="Amount", ascending=False)
    )

    # 20 locations with the largest totals
    p4_bar3_data = city_totals.head(20).sort_values(by="Amount", ascending=True)
    st.subheader("Total Assistance by Top 20 City and State")
    p4_bar3 = st.bar_chart(data=p4_bar3_data, x="City, State", y="Amount")

    # 20 locations with the smallest totals
    p4_bar4_data = city_totals.tail(20).sort_values(by="Amount", ascending=True)
    st.subheader("Total Assistance by Bottom 20 City and State")
    p4_bar4 = st.bar_chart(data=p4_bar4_data, x="City, State", y="Amount")