In [1]:
import pandas as pd
import requests
import time
import googlemaps
import concurrent.futures
import io


In [2]:
# This cell defines the helper functions that wrap the Google Maps and Census geocoding API calls.

# Set up Google Maps API client
def load_api_key(filepath):
    """Return the text stored at ``filepath`` with surrounding whitespace removed."""
    with open(filepath, "r") as key_file:
        raw_contents = key_file.read()
    return raw_contents.strip()

# Read the Google API key from a local text file; the key file is not part of the
# git repo, so point this path at your own copy before running the cell.
API_KEY = load_api_key("../../Google_API_Key.txt")  #Replace with your API key (not provided in git repo)
# Module-level Google Maps client; the geocoding helpers in this notebook use it.
gmaps = googlemaps.Client(key=API_KEY)

def get_location_details_google(row):
    """Geocode one school row with the Google Maps API.

    The search query is built from the row's "School Name", its "District Name"
    when that label is present in the row, and the module-level ``state``
    variable. The lookup goes through the module-level ``gmaps`` client, so both
    globals must be defined before this function is called.

    Args:
        row: A pandas Series (one DataFrame row) containing "School Name" and,
            optionally, "District Name".

    Returns:
        A pandas Series of six values, in this order: latitude, longitude,
        street address, ZIP code, county, city. ZIP code, county and city are
        "N/A" when the first geocoding result lacks that component. When the
        lookup fails or returns no results, all six values are "N/A".
    """
    if "District Name" in row:
        query = f"{row['School Name']}, {row['District Name']}, {state}, High School"
    else:
        query = f"{row['School Name']}, {state}, High School"

    try:
        geocode_result = gmaps.geocode(query)
        if geocode_result:
            # Only the first (top-ranked) result is used.
            best_match = geocode_result[0]
            location = best_match["geometry"]["location"]
            lat, lon = location["lat"], location["lng"]

            street_number, street_name = "", ""
            zip_code, county, city = "N/A", "N/A", "N/A"

            # A component may carry several types, so every type is checked independently.
            for component in best_match.get("address_components", []):
                types = component["types"]
                if "street_number" in types:
                    street_number = component["long_name"]
                if "route" in types:
                    street_name = component["long_name"]
                if "postal_code" in types:
                    zip_code = component["long_name"]
                if "administrative_area_level_2" in types:
                    county = component["long_name"]
                if "locality" in types:  # This is for city
                    city = component["long_name"]

            full_street_address = f"{street_number} {street_name}".strip()
            return pd.Series([lat, lon, full_street_address, zip_code, county, city])

    except Exception as e:
        # Best effort: report the failure and fall through to the placeholder row.
        print(f"Error fetching details for {query}: {e}")

    # The placeholder must have the same six slots as the success path; a shorter
    # Series cannot be assigned to the six destination columns by the caller.
    return pd.Series(["N/A"] * 6)


def fetch_missing_county_names(df, gmaps, state):
    """Fill in missing or 'N/A' county names by reverse-geocoding each row's coordinates.

    Args:
        df: DataFrame with "county_name", "latitude" and "longitude" columns.
            The "county_name" column is overwritten in place.
        gmaps: Client object exposing ``reverse_geocode((lat, lon))``.
        state: Unused; kept so existing callers keep working.

    Returns:
        The same ``df`` object. Existing county names are stripped of
        surrounding whitespace; rows whose county cannot be determined get None.
    """

    def get_county_only(row):
        existing = row["county_name"]
        if isinstance(existing, str) and existing.strip().upper() != "N/A":
            return existing.strip()  # Already exists and is valid

        # Coordinates may be NaN/None or a non-numeric placeholder such as "N/A";
        # in either case there is nothing to reverse-geocode, so skip the API call.
        try:
            lat, lon = float(row["latitude"]), float(row["longitude"])
        except (TypeError, ValueError):
            return None
        if pd.isna(lat) or pd.isna(lon):
            return None

        try:
            results = gmaps.reverse_geocode((lat, lon))
            if not results:
                return None
            # Only the first result is inspected for a county-level component.
            for component in results[0].get("address_components", []):
                if "administrative_area_level_2" in component["types"]:
                    return component["long_name"].strip()
        except Exception as e:
            # Best effort: report and leave this row without a county.
            print(f"Error for index {row.name}: {e}")
        return None

    if df.empty:
        # Nothing to look up; a row-wise apply on an empty frame would not return a column.
        return df

    df["county_name"] = df.apply(get_county_only, axis=1)
    return df


def get_census_tract(lat, lon):
    """Fetch the census tract for a latitude/longitude from the Census Geocoder API.

    Args:
        lat: Latitude, sent as the ``y`` query parameter.
        lon: Longitude, sent as the ``x`` query parameter.

    Returns:
        The tract code as a string left-padded with zeros to six characters;
        'No data' when the response contains no census block group; or a string
        starting with "Error: " when the request fails or the body is not valid JSON.
    """
    url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
    # For 2020 geographies use benchmark=Public_AR_2020 and vintage=Census2020_Census2020.
    params = {
        "x": lon,
        "y": lat,
        "benchmark": "Public_AR_Current",
        "vintage": "Current_Current",
        "layers": 10,
        "format": "json",
    }

    try:
        response = requests.get(url, params=params, timeout=5)  # Timeout avoids hanging requests
        response.raise_for_status()  # Raise an error for HTTP issues
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that cannot be decoded as JSON.
        return f"Error: {e}"

    block_groups = data.get('result', {}).get('geographies', {}).get('Census Block Groups')
    if not block_groups:
        # Key missing or list empty: report no data instead of indexing into nothing.
        return 'No data'

    tract_id = str(block_groups[0].get('TRACT', 'Not found'))
    return tract_id.zfill(6)  # Ensure 6-digit format



def add_missing_census_tracts(df, max_workers=5):
    """Re-fetch the tract for every row whose ``Tract`` equals the placeholder '000000'.

    Lookups run through ``get_census_tract`` on a thread pool of ``max_workers``
    threads. ``df`` is updated in place and also returned.
    """
    is_placeholder = df['Tract'].eq("000000")

    # Nothing flagged: hand the frame back untouched.
    if not is_placeholder.any():
        print("No missing tracts to update.")
        return df

    pending = df.loc[is_placeholder]
    print(f"Updating {len(pending)} missing tracts...")

    # One lookup per flagged row; map() yields results in input order.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        fetched = list(pool.map(get_census_tract, pending['latitude'], pending['longitude']))

    # Write the fetched values back onto the flagged rows only.
    df.loc[is_placeholder, 'Tract'] = fetched
    return df



def batch_census_tract(indf):
    """Look up census tracts in bulk through the Census Geocoder batch endpoint.

    Rows with a non-empty "address" and "city_name" are uploaded as one CSV
    (unique id, street address, city, state, zip). The frame's index values
    serve as the unique ids that tie results back to rows.

    Args:
        indf: DataFrame with columns "address", "city_name", "state_name",
            "zip_code" and "county_code". The caller's frame is not modified.

    Returns:
        A new DataFrame holding the input columns plus:
          * "Street Address": the second column of the geocoder output for the row
            (missing for rows that were not submitted or not returned).
          * "Tract": tract code as a zero-padded six-character string; "000000"
            when no tract was obtained.
        Missing "county_code" values are filled from the geocoder's county column
        where one is available.

    Raises:
        requests.HTTPError: if the geocoder answers with an HTTP error status.
    """
    # Work on a copy so the caller's frame does not gain a helper column.
    indf = indf.copy()
    indf["Unique ID"] = indf.index

    # Select the columns in upload order, unique id first.
    addresses = indf[["Unique ID", "address", "city_name", "state_name", "zip_code"]]
    usable = (
        addresses["address"].notna() & (addresses["address"] != "")
        & addresses["city_name"].notna() & (addresses["city_name"] != "")
    )
    addresses = addresses[usable]

    if addresses.empty:
        # Nothing to geocode: skip the request and return placeholder results.
        indf = indf.drop(columns=["Unique ID"])
        indf["Street Address"] = None
        indf["Tract"] = "000000"
        return indf

    csv_buffer = io.StringIO()
    addresses.to_csv(csv_buffer, index=False, header=False, quoting=1)  # quoting=1 quotes every field

    # API Endpoint
    url = "https://geocoding.geo.census.gov/geocoder/geographies/addressbatch"

    # API request with the CSV held in memory.
    files = {"addressFile": ("addresses.csv", csv_buffer.getvalue())}
    # NOTE(review): benchmark/vintage ids are kept from the original code - confirm they are still valid.
    data = {"benchmark": "4", "vintage": "4"}

    response = requests.post(url, files=files, data=data)
    # Fail loudly on an HTTP error instead of parsing an error page as CSV.
    response.raise_for_status()

    result_df = pd.read_csv(
        io.StringIO(response.text),
        header=None,
        quotechar='"',  # Keeps quoted fields that contain commas (addresses, coordinates) intact
        names=[
            "Unique ID", "Street Address", "Match Status", "Match Type",
            "Standardized Address", "Coordinates", "TigerLine ID", "Side",
            "State FIPS", "County FIPS", "Tract", "Block"
        ]
    )

    # Missing tracts become 0 and everything is rendered as a six-character string,
    # so an unmatched row carries the "000000" placeholder.
    tract_codes = result_df["Tract"].fillna(0).astype(int).astype(str).str.zfill(6)

    # Fill in the county code where the input has none; alignment is by unique id,
    # which equals the index of indf.
    # NOTE(review): the filled values come straight from the parsed geocoder column and
    # may not match the string format of existing county codes - confirm against county data.
    county_fips = result_df.set_index("Unique ID")["County FIPS"]
    indf["county_code"] = indf["county_code"].fillna(county_fips)

    # Only the address column and the tract are carried back to the caller.
    matches = result_df[["Unique ID", "Street Address"]].assign(Tract=tract_codes)
    indf = indf.merge(matches, on="Unique ID", how="left").drop(columns=["Unique ID"])

    # Rows that were never submitted (or not returned) get the same placeholder.
    indf["Tract"] = indf["Tract"].fillna("000000")

    return indf






In [3]:
# Run configuration for this notebook.
# NOTE: most paths below are absolute and specific to one machine; adjust them before running elsewhere.
# `state` is also read as a global by get_location_details_google when it builds search queries.
state = "Arkansas"  # State
input_ed_csv = "/home/mark/Documents/Erdos/Proj/math_ed_project/data/AR23/AR23RawDemographics.xlsx" # Input education data file (an .xlsx here despite the variable name)
input_census_csv = "/home/mark/Documents/Erdos/Proj/math_ed_project/data/census_data/AR/23/AR23_census_data.csv" #Input census data file
input_county_csv = "~/Documents/Erdos/Proj/math_ed_project/data/county_codes.csv" #Path to County Codes
output_csv = "/home/mark/Documents/Erdos/Proj/math_ed_project/data/AR23/AR23_Combined_Census_Education.csv"  # Output file


def build_csv(input_ed_csv,input_census_csv,input_county_csv,state,output_csv):
    """Combine school, geocoding, county-code and census data into one DataFrame.

    Args:
        input_ed_csv: Path to the education data. A ".csv" extension (any case)
            is read with ``pd.read_csv``; anything else with ``pd.read_excel``.
            The file must contain a "School Name" column.
        input_census_csv: Path to a CSV of census data with a "unique_id" column.
        input_county_csv: Path to a CSV with "state_name", "county_name" and
            "county_code" columns.
        state: State name written into the "state_name" column and used to join
            the county codes.
        output_csv: Unused here; nothing is written by this function.

    Returns:
        The education rows (those with a school name) with geocoding columns,
        county code, "Tract", "unique_id" (county code followed by tract) and
        the census columns left-joined on "unique_id".

    Note:
        get_location_details_google builds its query from the module-level
        ``state`` variable, not from this function's ``state`` argument, so the
        two must agree.
    """
    # Import and format county code data; names are stripped so the join keys match.
    countydf = pd.read_csv(input_county_csv)
    countydf["state_name"] = countydf["state_name"].str.strip()
    countydf["county_name"] = countydf["county_name"].str.strip()
    countydf["county_code"] = countydf["county_code"].astype(str)

    # Pick the reader from the file extension, since not every input is a CSV.
    extension = str(input_ed_csv).rsplit('.', 1)[-1].lower()
    if extension == "csv":
        df = pd.read_csv(input_ed_csv)
    else:
        df = pd.read_excel(input_ed_csv)

    # Can't work with entries with no school name.
    df = df.dropna(subset=["School Name"])

    # Read census data; the unique id is kept as a string so it can be joined
    # against the string id assembled below.
    dfcensus = pd.read_csv(input_census_csv)
    dfcensus["unique_id"] = dfcensus["unique_id"].astype(str)

    # Fetch geo data with the Google Maps API and add the resulting columns.
    df["state_name"] = state
    geo_columns = ["latitude", "longitude", "address", "zip_code", "county_name", "city_name"]
    df[geo_columns] = df.apply(get_location_details_google, axis=1)

    # The first pass does not return a county for every school, so retry those rows.
    df = fetch_missing_county_names(df, gmaps, state)

    # Load county id data into df.
    df = df.merge(countydf, on=["state_name", "county_name"], how="left")

    # First do batch census geoprocessing wherever possible...
    df = batch_census_tract(df)

    # ...only then do individual census geoprocessing, on the stragglers.
    df = add_missing_census_tracts(df)

    # The census join key is the county code followed by the tract.
    df["unique_id"] = df["county_code"].astype(str) + df["Tract"].astype(str)

    df = df.merge(dfcensus, on="unique_id", how="left")

    return df



# Run the whole pipeline with the configuration defined above in this cell.
df = build_csv(input_ed_csv,input_census_csv,input_county_csv,state,output_csv)
# Show the first rows of the combined frame as the cell output.
df.head()

  result_df.loc[result_df["Tract"]==0, "Tract"] = ""


Updating 123 missing tracts...


Unnamed: 0.1,Unnamed: 0,School Name,Grades,District Name,Math: % Met Readiness Benchmark,English: % Met Readiness Benchmark,Reading: % Met Readiness Benchmark,Science: % Met Readiness Benchmark,% Met Readiness Benchmark in all Four Subjects,School Enrollment,...,American Indian and Alaska Native alone (estimate),American Indian and Alaska Native alone (percentage),Asian alone (estimate),Asian alone (percentage),Native Hawaiian and Other Pacific Islander alone (estimate),Native Hawaiian and Other Pacific Islander alone (percentage),Some other race alone (estimate),Some other race alone (percentage),Two or more races alone (estimate),Two or more races alone (percentage)
0,1,Academic Center For Excellence,5-12,Cabot School District,0.088,0.275,0.213,0.125,0.05,350,...,5.0,0.1,148.0,1.8,45.0,0.5,0.0,0.0,199.0,2.4
1,2,Academies At Rivercrest High School,9-12,Rivercrest School District,0.139,0.389,0.236,0.125,0.056,346,...,0.0,0.0,6.0,0.3,3.0,0.1,0.0,0.0,81.0,3.7
2,3,Acorn High School,7-12,Ouachita River School District,0.021,0.458,0.313,0.208,0.021,267,...,28.0,0.9,0.0,0.0,0.0,0.0,0.0,0.0,73.0,2.4
3,4,Adventure Online Academy,K-12,Waldron School District,N<10,N<10,N<10,N<10,N<10,33,...,11.0,0.3,78.0,2.0,0.0,0.0,0.0,0.0,294.0,7.6
4,5,Agee Lierly Life Preparation Services School,9-12,Fayetteville School District,0.043,0.277,0.191,0.106,0.021,254,...,18.0,0.2,299.0,3.8,0.0,0.0,0.0,0.0,524.0,6.7


In [4]:
# Sanity check on the census merge before saving: rows without a census match have no
# value in the census columns. With 10 or more such rows nothing is written;
# otherwise the unmatched rows are dropped and the result is saved to output_csv.
if df["American Indian and Alaska Native alone (estimate)"].isna().sum() >= 10:
    print("false")
else:
    df = df.dropna(subset=["American Indian and Alaska Native alone (estimate)"])
    df.to_csv(output_csv, index=False)
