In [91]:
import concurrent.futures
import os
import time

import googlemaps
import pandas as pd
import requests


In [None]:
# Set up Google Maps API client
def load_api_key(filepath):
    """Return the contents of the text file at ``filepath`` with surrounding whitespace removed.

    Args:
        filepath: Path to a text file holding the API key.

    Returns:
        str: The file's text, stripped of leading and trailing whitespace.
    """
    with open(filepath, "r") as key_file:
        raw_key = key_file.read()
    return raw_key.strip()

# The key lives in a local text file that is not provided in the git repo;
# point API_KEY_FILE at your own key file.
API_KEY_FILE = "Google_API_Key.txt"
API_KEY = load_api_key(API_KEY_FILE)
gmaps = googlemaps.Client(key=API_KEY)

def get_location_details_google(row, state_name=None):
    """Geocode one school record with the Google Maps API.

    The search query is "<School Name>, [<District Name>, ]<state>, High School".
    The district is included only when the row has a non-missing, non-blank
    ``District Name`` value, so a NaN district no longer puts the literal
    text "nan" into the query.

    Args:
        row: Mapping-like record (for example a DataFrame row) with a
            ``School Name`` entry and, optionally, ``District Name`` and
            ``state_name`` entries.
        state_name: State to put in the query. When omitted, the row's own
            ``state_name`` value is used if it is a non-blank string;
            otherwise the notebook-level ``state`` variable is used, which is
            what this function always did before the parameter existed.

    Returns:
        pd.Series: Five values in the order latitude, longitude, street
        address, ZIP code, county. On a successful lookup the ZIP code and
        county are "N/A" when the result has no such component, and the
        street address is "" when it has neither a street number nor a route.
        When the lookup returns nothing or raises, all five values are "N/A".

    Raises:
        NameError: If no state is available from the argument, the row, or a
            notebook-level ``state`` variable.
    """
    if state_name is None:
        row_state = row.get("state_name") if hasattr(row, "get") else None
        if isinstance(row_state, str) and row_state.strip():
            state_name = row_state
        else:
            try:
                # Backward-compatible fallback for callers that rely on the
                # notebook-level variable.
                state_name = state
            except NameError:
                raise NameError(
                    "No state available: pass state_name=..., provide a "
                    "'state_name' entry in the row, or define a notebook-level "
                    "`state` variable."
                ) from None

    district = row["District Name"] if "District Name" in row else None
    if district is not None and pd.notna(district) and str(district).strip():
        query = f"{row['School Name']}, {district}, {state_name}, High School"
    else:
        query = f"{row['School Name']}, {state_name}, High School"

    try:
        geocode_result = gmaps.geocode(query)
        if geocode_result:
            # Only the first (best) match is used.
            best_match = geocode_result[0]
            location = best_match["geometry"]["location"]
            lat, lon = location["lat"], location["lng"]

            # Extract address components
            street_number, street_name, zip_code, county = "", "", "N/A", "N/A"
            for component in best_match.get("address_components", []):
                types = component["types"]
                if "street_number" in types:
                    street_number = component["long_name"]
                if "route" in types:
                    street_name = component["long_name"]
                if "postal_code" in types:
                    zip_code = component["long_name"]
                if "administrative_area_level_2" in types:
                    county = component["long_name"]

            full_street_address = f"{street_number} {street_name}".strip()
            return pd.Series([lat, lon, full_street_address, zip_code, county])

    except Exception as e:
        # Best effort: one failed lookup must not abort a whole DataFrame.apply.
        print(f"Error fetching details for {query}: {e}")

    return pd.Series(["N/A", "N/A", "N/A", "N/A", "N/A"])



def get_census_tract(lat, lon):
    """Look up the census tract code for a coordinate via the Census Geocoder API.

    Args:
        lat: Latitude (a number or a numeric string).
        lon: Longitude (a number or a numeric string).

    Returns:
        str: One of
        - the ``TRACT`` value of the first "Census Block Groups" record,
          left-padded with zeros to six characters;
        - ``'No data'`` when the coordinates are not usable numbers (None,
          NaN, or placeholders such as "N/A") or the response holds no
          block-group record;
        - ``'Error: <details>'`` when the HTTP request fails or the response
          body is not valid JSON.
    """
    # Skip the network round trip for coordinates that cannot be valid.
    try:
        lat_value, lon_value = float(lat), float(lon)
    except (TypeError, ValueError):
        return 'No data'
    if lat_value != lat_value or lon_value != lon_value:  # NaN never equals itself
        return 'No data'

    url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
    params = {
        "x": lon_value,
        "y": lat_value,
        "benchmark": "Public_AR_Current",
        "vintage": "Current_Current",
        "layers": 10,
        "format": "json",
    }

    try:
        response = requests.get(url, params=params, timeout=5)  # Timeout avoids hanging requests
        response.raise_for_status()  # Raise an error for HTTP issues
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        return f"Error: {e}"

    if not isinstance(data, dict):
        return 'No data'
    geographies = (data.get('result') or {}).get('geographies') or {}
    block_groups = geographies.get('Census Block Groups') or []
    if not block_groups:
        # Key missing or list empty: previously an empty list raised IndexError,
        # which would abort a whole threaded batch.
        return 'No data'

    tract_id = str(block_groups[0].get('TRACT', 'Not found'))
    return tract_id.zfill(6)  # Ensure 6-digit format

def add_census_tracts(df, max_workers=5):
    """Add a ``Census_Tract`` column by looking up each row's coordinates in parallel.

    Args:
        df: DataFrame with ``Latitude`` and ``Longitude`` columns. It is
            modified in place.
        max_workers: Size of the thread pool used for the lookups.

    Returns:
        The same DataFrame, now carrying the ``Census_Tract`` column.
    """
    latitudes = df['Latitude']
    longitudes = df['Longitude']
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    with pool:
        # map() yields results in input order, so they line up with the rows.
        tracts = [tract for tract in pool.map(get_census_tract, latitudes, longitudes)]
        df['Census_Tract'] = tracts
    return df





In [108]:
def build_csv(input_ed_csv,input_census_csv,input_county_csv,state,output_csv,excel_nrows=100):
    """Combine education, county-code and census data into a single CSV.

    Steps: load the county codes, load the education data, geocode every
    school, attach county codes, look up census tracts, build ``unique_id``
    (county code followed by tract code), join the census table on that id,
    and write the result to ``output_csv``.

    Args:
        input_ed_csv: Path to the education data. Files whose extension is
            ``csv`` (any letter case) are read with ``pd.read_csv``; anything
            else is read with ``pd.read_excel``.
        input_census_csv: Path to the census CSV; must have a ``unique_id`` column.
        input_county_csv: Path to the county-code CSV; must have
            ``state_name``, ``county_name`` and ``county_code`` columns.
        state: State name, written to the ``state_name`` column of every row.
        output_csv: Path the combined table is written to (without the index).
        excel_nrows: Maximum number of rows read from an Excel input. The
            default of 100 keeps the cap that used to be hard-coded here.
            NOTE(review): that cap looks like a leftover from trial runs;
            pass ``excel_nrows=None`` to read the whole sheet. CSV inputs are
            never capped.

    Returns:
        pd.DataFrame: The combined table that was written to ``output_csv``.
    """
    # Import and format county code data
    countydf = pd.read_csv(input_county_csv)
    countydf["state_name"] = countydf["state_name"].str.strip()
    countydf["county_name"] = countydf["county_name"].str.strip()
    countydf["county_code"] = countydf["county_code"].astype(str)

    # Read the education data; the inputs are not all csv files.
    ext = str(input_ed_csv).rsplit('.', 1)[-1].lower()
    if ext == "csv":
        df = pd.read_csv(input_ed_csv)
    else:
        df = pd.read_excel(input_ed_csv, nrows=excel_nrows)

    # Can't work with entries with no school name
    df = df.dropna(subset=["School Name"])

    # Read the census data. Make unique ID a string, which ends up being convenient.
    df2 = pd.read_csv(input_census_csv)
    df2["unique_id"] = df2["unique_id"].astype(str)

    # Tag rows with the state first, so the value passed to this function
    # travels with each row through the remaining steps.
    df["state_name"] = state

    # Fetch Geo data using google maps API
    df[["Latitude", "Longitude", "Address", "ZIP Code", "county_name"]] = df.apply(get_location_details_google, axis=1)

    # Load County ID data into df
    df = df.merge(countydf, on=["state_name", "county_name"], how="left")

    # Fetch Census tract data, add to df. This is the slow step
    df = add_census_tracts(df)

    # Build the census tract id (county code + 6-digit tract). Rows whose tract
    # lookup returned a placeholder or error message get no id, rather than an
    # id made of the county code glued to that message.
    tract_ok = df["Census_Tract"].astype(str).str.fullmatch(r"\d{6}")
    df["unique_id"] = (df["county_code"] + df["Census_Tract"]).where(tract_ok)

    df = df.merge(df2, on="unique_id", how="left")

    df.to_csv(output_csv, index = False)

    return df

# All inputs and outputs live under one project data directory. Expanding "~"
# here keeps every path consistent and avoids baking one user's home
# directory into the notebook.
DATA_DIR = os.path.expanduser("~/Documents/Erdos/Proj/math_ed_project/data")

input_ed_csv = os.path.join(DATA_DIR, "IL22", "IL22OrganizedData.xlsx")  # Input education data file
input_census_csv = os.path.join(DATA_DIR, "census_data", "IL", "22", "IL22_census_data.csv")  # Input census data file
input_county_csv = os.path.join(DATA_DIR, "county_codes.csv")  # County code lookup table
output_csv = os.path.join(DATA_DIR, "IL22", "IL22_Combined_Census_Education.csv")  # Output file
state = "Illinois"  # State

df = build_csv(input_ed_csv,input_census_csv,input_county_csv,state,output_csv)
df.head()

Unnamed: 0,School Name,District Name,City,County,District Type,District Size,School Type,Grades Served,SAT Reading Average Score,SAT Math Average Score,...,American Indian and Alaska Native alone (estimate),American Indian and Alaska Native alone (percentage),Asian alone (estimate),Asian alone (percentage),Native Hawaiian and Other Pacific Islander alone (estimate),Native Hawaiian and Other Pacific Islander alone (percentage),Some other race alone (estimate),Some other race alone (percentage),Two or more races alone (estimate),Two or more races alone (percentage)
0,Seymour High School,Payson CUSD 1,Payson,Adams,UNIT,MEDIUM,HIGH SCHOOL,7 - 12,467.7,445.8,...,0.0,0.0,28.0,0.9,0.0,0.0,0.0,0.0,41.0,1.3
1,Seymour Elementary School,Payson CUSD 1,Payson,Adams,UNIT,MEDIUM,ELEMENTARY,PK - 6,,,...,0.0,0.0,28.0,0.9,0.0,0.0,0.0,0.0,41.0,1.3
2,Liberty High School,Liberty CUSD 2,Liberty,Adams,UNIT,MEDIUM,HIGH SCHOOL,7 - 12,474.2,475.7,...,12.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.2
3,Liberty Elementary School,Liberty CUSD 2,Liberty,Adams,UNIT,MEDIUM,ELEMENTARY,PK - 6,,,...,12.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.2
4,Central High School,Central CUSD 3,Camp Point,Adams,UNIT,MEDIUM,HIGH SCHOOL,9 - 12,474.9,468.6,...,0.0,0.0,5.0,0.1,0.0,0.0,9.0,0.2,21.0,0.6


In [120]:
# Inspect the geocoded street addresses. A blank entry means the geocoding
# result for that row had neither a street number nor a route component.
df["Address"]

0     420 West Brainard Street
1        404 West State Street
2        505 North Park Street
3        505 North Park Street
4        2110 State Highway 94
                ...           
72       510 South Main Street
73                            
74    204 North Perrine Street
75    204 North Perrine Street
76                            
Name: Address, Length: 77, dtype: object

In [None]:
# Reload the census table on its own for inspection, with unique_id cast to string.
df2 = pd.read_csv(input_census_csv).astype({"unique_id": str})


'5001480200'

In [45]:
"a" + "b"

'ab'

'5'