In [1]:
import pandas as pd
import requests
import time
import googlemaps
import concurrent.futures
import io


In [5]:
# Set up Google Maps API client
def load_api_key(filepath):
    """Return the API key stored in the text file at ``filepath``.

    Surrounding whitespace (for example the trailing newline most editors
    append) is removed from the file's contents before returning.
    """
    with open(filepath, "r") as key_file:
        contents = key_file.read()
    return contents.strip()

# The key is read from a local text file that is not part of the git repo;
# supply your own "Google_API_Key.txt" (path is relative to the working directory).
API_KEY = load_api_key("Google_API_Key.txt")
# Module-level Google Maps client; get_location_details_google calls gmaps.geocode on it.
gmaps = googlemaps.Client(key=API_KEY)

def _parse_address_components(address_components):
    """Extract (street address, ZIP code, county) from a geocode result's address components."""
    # Defaults are what the caller sees when Google omits a component:
    # empty street parts, "N/A" for ZIP code and county.
    found = {
        "street_number": "",
        "route": "",
        "postal_code": "N/A",
        "administrative_area_level_2": "N/A",
    }
    for component in address_components:
        for kind in component["types"]:
            if kind in found:
                # A later component of the same type overrides an earlier one.
                found[kind] = component["long_name"]

    full_street_address = f"{found['street_number']} {found['route']}".strip()
    return full_street_address, found["postal_code"], found["administrative_area_level_2"]


def get_location_details_google(row, state_name=None):
    """Fetch latitude, longitude, full street address, ZIP code, and county from Google Maps API.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must contain "School Name"; "District Name" is used when present
        and not blank/missing.
    state_name : str, optional
        State to put in the search query. When omitted, the module-level
        variable ``state`` is used, so existing ``df.apply(..., axis=1)``
        calls keep working; a NameError is raised if that global is not
        defined either.

    Returns
    -------
    pandas.Series
        ``[lat, lon, street_address, zip_code, county]``. Every entry is the
        string "N/A" when the lookup returns nothing or raises.
    """
    if state_name is None:
        # Backward-compatible fallback to the notebook-level global.
        state_name = state

    # Skip a missing district instead of sending the literal text "nan" to the geocoder.
    district = row["District Name"] if "District Name" in row else None
    if pd.notna(district) and str(district).strip():
        query = f"{row['School Name']}, {district}, {state_name}, High School"
    else:
        query = f"{row['School Name']}, {state_name}, High School"

    try:
        geocode_result = gmaps.geocode(query)
        if geocode_result:
            # Only the first (best) candidate is used.
            best_match = geocode_result[0]
            location = best_match["geometry"]["location"]
            street_address, zip_code, county = _parse_address_components(
                best_match.get("address_components", [])
            )
            return pd.Series([location["lat"], location["lng"], street_address, zip_code, county])

    except Exception as e:
        # Best effort: one failed lookup must not abort a whole DataFrame.apply run.
        print(f"Error fetching details for {query}: {e}")

    return pd.Series(["N/A", "N/A", "N/A", "N/A", "N/A"])



def get_census_tract(lat, lon):
    """Fetch the census tract for given latitude and longitude using the Census Geocoder API.

    Parameters
    ----------
    lat, lon : float or numeric string
        Coordinates of the point to look up.

    Returns
    -------
    str
        The tract code left-padded with zeros to 6 characters; 'No data' when
        the coordinates are not numeric or the response contains no block
        group; or "Error: <details>" when the HTTP request or JSON decoding
        fails.
    """
    # Failed geocodes are stored as "N/A" placeholders; don't spend a request
    # on coordinates that cannot be valid.
    try:
        lat_value, lon_value = float(lat), float(lon)
    except (TypeError, ValueError):
        return 'No data'
    if lat_value != lat_value or lon_value != lon_value:  # NaN never equals itself
        return 'No data'

    url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
    params = {
        "x": lon,  # the API takes longitude as x and latitude as y
        "y": lat,
        "benchmark": "Public_AR_Current",
        "vintage": "Current_Current",
        "layers": "10",
        "format": "json",
    }

    try:
        response = requests.get(url, params=params, timeout=5)  # timeout avoids hanging requests
        response.raise_for_status()  # Raise an error for HTTP issues
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions where
        # the decode error is not a RequestException subclass.
        return f"Error: {e}"

    geographies = data.get('result', {}).get('geographies', {})
    # A missing key and an empty list both mean "nothing found"; indexing [0]
    # on an empty list would otherwise raise inside the worker thread.
    block_groups = geographies.get('Census Block Groups') or []
    if not block_groups:
        return 'No data'

    tract_id = str(block_groups[0].get('TRACT', 'Not found'))
    return tract_id.zfill(6)  # Ensure 6-digit format

def add_census_tracts(df, max_workers=5):
    """Look up a census tract for every row using a pool of worker threads.

    Reads the 'Latitude' and 'Longitude' columns, calls get_census_tract on
    each pair (results keep row order), stores them in a new 'Census_Tract'
    column on ``df`` itself, and returns that same frame.
    """
    latitudes = df['Latitude']
    longitudes = df['Longitude']

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        tracts = [tract for tract in pool.map(get_census_tract, latitudes, longitudes)]

    df['Census_Tract'] = tracts
    return df





In [7]:
def build_csv(input_ed_csv,input_census_csv,input_county_csv,state,output_csv,excel_nrows=100):
    """Combine education data with geocoded location and county-code columns.

    Parameters
    ----------
    input_ed_csv : str or path-like
        Education data file. A name ending in ".csv" (any letter case) is read
        with ``pd.read_csv``; anything else is read with ``pd.read_excel``.
    input_census_csv : str or path-like
        Census data CSV with a "unique_id" column. It is loaded (so a bad path
        fails before any geocoding happens) but only the disabled steps at the
        bottom of this function would consume it.
    input_county_csv : str or path-like
        County lookup CSV with "state_name", "county_name" and "county_code".
    state : str
        State name written to the "state_name" column and used as a merge key.
    output_csv : str or path-like
        Intended output path. Currently unused: the write step is disabled.
    excel_nrows : int or None, default 100
        Row cap applied only when the education file is read as Excel (the CSV
        branch always reads every row). Pass None to read the whole workbook.

    Returns
    -------
    pandas.DataFrame
        The education rows that have a school name, plus "Latitude",
        "Longitude", "Address", "ZIP Code", "county_name", "state_name" and the
        columns merged in from the county lookup.
    """
    # County codes are parsed as text so codes that start with "0" keep the zero.
    countydf = pd.read_csv(input_county_csv, dtype={"county_code": str})
    countydf["state_name"] = countydf["state_name"].str.strip()
    countydf["county_name"] = countydf["county_name"].str.strip()
    countydf["county_code"] = countydf["county_code"].str.strip()

    # Pick the reader from the file extension, ignoring letter case.
    if str(input_ed_csv).lower().endswith(".csv"):
        df = pd.read_csv(input_ed_csv)
    else:
        df = pd.read_excel(input_ed_csv, nrows=excel_nrows)

    # Can't work with entries with no school name
    df = df.dropna(subset=["School Name"])

    # Census data; the ID is kept as text for the same leading-zero reason.
    df2 = pd.read_csv(input_census_csv, dtype={"unique_id": str})

    # Fetch geo data using the Google Maps API (one request per row).
    # NOTE(review): get_location_details_google builds its query from the
    # module-level `state` unless told otherwise, so keep that global equal to
    # this function's `state` argument.
    geo_columns = ["Latitude", "Longitude", "Address", "ZIP Code", "county_name"]
    if df.empty:
        # apply() on an empty frame cannot be unpacked into five columns.
        for column in geo_columns:
            df[column] = pd.Series(dtype=object)
    else:
        df[geo_columns] = df.apply(get_location_details_google, axis=1)
    df["state_name"] = state

    # Load County ID data into df
    df = df.merge(countydf, on=["state_name", "county_name"], how="left")

    # --- Disabled steps, kept for when the pipeline is completed -----------
    # Fetch Census tract data, add to df. This is the slow step
    # add_census_tracts(df)
    #
    # Make unique 11 digit census tract code
    # df["unique_id"] = df["county_code"] + df["Census_Tract"]
    #
    # df = df.merge(df2, on="unique_id", how="left")
    #
    # df.to_csv(output_csv, index = False)

    return df

# Every input and output lives under one project data directory; this is the
# only path to edit when running on another machine.
DATA_DIR = "/home/mark/Documents/Erdos/Proj/math_ed_project/data"

input_ed_csv = f"{DATA_DIR}/IL22/IL22OrganizedData.xlsx"  # Input education data file
input_census_csv = f"{DATA_DIR}/census_data/IL/22/IL22_census_data.csv"  # Input census data file
input_county_csv = f"{DATA_DIR}/county_codes.csv"  # County name -> county code lookup
output_csv = f"{DATA_DIR}/IL22/IL22_Combined_Census_Education.csv"  # Output file
state = "Illinois"  # State; get_location_details_google also reads this global

df = build_csv(input_ed_csv,input_census_csv,input_county_csv,state,output_csv)
df.head()

Unnamed: 0,School Name,District Name,City,County,District Type,District Size,School Type,Grades Served,SAT Reading Average Score,SAT Math Average Score,...,SAT Reading Total Male Students Level 4 %,Tests Taken,# Students SAT ELA Participation,Latitude,Longitude,Address,ZIP Code,county_name,state_name,county_code
0,Seymour High School,Payson CUSD 1,Payson,Adams,UNIT,MEDIUM,HIGH SCHOOL,7 - 12,467.7,445.8,...,0.0,31.0,31.0,39.816788,-91.250344,420 West Brainard Street,62360,Adams County,Illinois,17001
1,Seymour Elementary School,Payson CUSD 1,Payson,Adams,UNIT,MEDIUM,ELEMENTARY,PK - 6,,,...,,,,39.818122,-91.248102,404 West State Street,62360,Adams County,Illinois,17001
2,Liberty High School,Liberty CUSD 2,Liberty,Adams,UNIT,MEDIUM,HIGH SCHOOL,7 - 12,474.2,475.7,...,0.0,42.0,43.0,39.886186,-91.107069,505 North Park Street,62347,Adams County,Illinois,17001
3,Liberty Elementary School,Liberty CUSD 2,Liberty,Adams,UNIT,MEDIUM,ELEMENTARY,PK - 6,,,...,,,,39.886049,-91.108149,505 North Park Street,62347,Adams County,Illinois,17001
4,Central High School,Central CUSD 3,Camp Point,Adams,UNIT,MEDIUM,HIGH SCHOOL,9 - 12,474.9,468.6,...,0.0,59.0,59.0,40.063461,-91.028689,2110 State Highway 94,62320,Adams County,Illinois,17001


UnboundLocalError: cannot access local variable 'state' where it is not associated with a value

In [None]:
def batch_census_tract(indf, timeout=None):
    """Batch-look-up census geography for the addresses in ``indf``.

    Parameters
    ----------
    indf : pandas.DataFrame
        Must contain the columns "Address", "City", "State" and "Zip" (other
        columns are ignored). Rows whose "Address" is missing or an empty
        string are skipped.
    timeout : float or tuple, optional
        Passed to ``requests.post``. The default None waits indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per submitted address with the columns "Unique ID", "Address",
        "Match Status", "Match Type", "Standardized Address", "TigerLine ID",
        "Side", "State FIPS", "County FIPS", "Tract", "Block", "Longitude",
        "Latitude". "Unique ID" is the index label of the row in ``indf``.
        FIPS, tract, block and coordinate values are text so leading zeros are
        kept. An empty frame with these columns is returned, without any
        request being made, when there is nothing to submit.

    Raises
    ------
    requests.exceptions.HTTPError
        If the service answers with an error status.
    """
    result_columns = [
        "Unique ID", "Address", "Match Status", "Match Type",
        "Standardized Address", "Coordinates", "TigerLine ID", "Side",
        "State FIPS", "County FIPS", "Tract", "Block",
    ]

    # Keep only the address columns, and only rows that have an address.
    addresses = indf[["Address", "City", "State", "Zip"]]
    addresses = addresses[addresses["Address"].notna() & (addresses["Address"] != "")]

    if addresses.empty:
        empty_result = pd.DataFrame(columns=result_columns + ["Longitude", "Latitude"])
        return empty_result.drop(columns=["Coordinates"])

    # Serialize the *filtered* rows in memory. The index is written as the
    # first field so each upload line starts with a unique ID that maps the
    # response back to ``indf``; no header row is written.
    csv_buffer = io.StringIO()
    addresses.to_csv(csv_buffer, index=True, header=False, quoting=1)  # quoting=1 quotes every field

    # API Endpoint
    url = "https://geocoding.geo.census.gov/geocoder/geographies/addressbatch"

    # API Request with in-memory file
    files = {"addressFile": ("addresses.csv", csv_buffer.getvalue())}
    # NOTE(review): numeric IDs; presumably the same benchmark/vintage that
    # get_census_tract names "Public_AR_Current"/"Current_Current" -- confirm.
    data = {"benchmark": "4", "vintage": "4"}

    response = requests.post(url, files=files, data=data, timeout=timeout)
    response.raise_for_status()  # don't try to parse an error page as CSV

    # The response has no header row. Code-like columns are read as text so
    # values such as tract "000201" are not turned into the integer 201.
    result_df = pd.read_csv(
        io.StringIO(response.text),
        header=None,
        quotechar='"',  # keeps quoted fields (addresses, "lon,lat") intact
        names=result_columns,
        dtype={
            "Coordinates": str,
            "State FIPS": str,
            "County FIPS": str,
            "Tract": str,
            "Block": str,
        },
    )

    # "Coordinates" holds "longitude,latitude". reindex guarantees both output
    # columns exist even when no address matched and the column is all missing.
    coordinates = (
        result_df["Coordinates"]
        .str.split(",", n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    result_df["Longitude"] = coordinates[0]
    result_df["Latitude"] = coordinates[1]

    return result_df.drop(columns=["Coordinates"])

In [28]:
# List the column labels of `df`.
# NOTE(review): the saved output includes merge-suffixed columns
# (county_name_x, state_name_x, county_name_y) plus Census_Tract and unique_id,
# which build_csv as written above does not produce (those steps are commented
# out) -- the output is stale; re-run after Restart & Run All.
df.keys()


Index(['School Name', 'District Name', 'City', 'County', 'District Type',
       'District Size', 'School Type', 'Grades Served',
       'SAT Reading Average Score', 'SAT Math Average Score',
       'SAT Reading Total Students Level 1 %',
       'SAT Reading Total Students Level 2 %',
       'SAT Reading Total Students Level 3 %',
       'SAT Reading Total Students Level 4 %',
       'SAT Math Total Students Level 1 %',
       'SAT Math Total Students Level 2 %',
       'SAT Math Total Students Level 3 %',
       'SAT Math Total Students Level 4 %',
       'SAT Reading Total Male Students Level 1 %',
       'SAT Reading Total Male Students Level 2 %',
       'SAT Reading Total Male Students Level 3 %',
       'SAT Reading Total Male Students Level 4 %', 'Tests Taken',
       '# Students SAT ELA Participation', 'Latitude', 'Longitude', 'Address',
       'ZIP Code', 'county_name_x', 'state_name_x', 'county_code',
       'Census_Tract', 'unique_id', 'year', 'tract_number', 'county_name_y'

In [20]:
# Preview the first rows of `df`.
# NOTE(review): the saved output shows a sample address table (Unique ID,
# Street Address, City, State, ZIP), not the school frame returned by
# build_csv, so this cell was last run against a different `df` -- confirm
# which frame is intended and re-run.
df.head()

Unnamed: 0,Unique ID,Street Address,City,State,ZIP
0,1,1600 Pennsylvania Ave NW,Washington,DC,20500
1,2,350 5th Ave,New York,NY,10118
2,3,1 Infinite Loop,Cupertino,CA,95014
3,4,221B Baker St,Los Angeles,CA,90068
4,5,600 Montgomery St,San Francisco,CA,94111


0    1600 Pennsylvania Ave NW
1                 350 5th Ave
2             1 Infinite Loop
3               221B Baker St
4           600 Montgomery St
5             233 S Wacker Dr
6        500 S Buena Vista St
7         7000 Hollywood Blvd
8              4059 Mt Lee Dr
9                1575 G St NW
Name: Street Address, dtype: object