In [121]:
import requests
import logging
import pandas as pd
import os
from datetime import datetime
import send2trash

# Route DEBUG-and-above records to logs.log; filemode 'w' starts a fresh log
# file every time this cell runs.
logging.basicConfig(
    filename='logs.log',
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
# Module-level logger shared by every function in this notebook.
logger = logging.getLogger(__name__)


def rm_file(filename):
    """Remove ``filename`` by renaming it aside and sending it to the trash.

    The file is first moved to ``<stem>_temp<ext>`` so that ``filename`` itself
    is vacated, and the renamed file is then passed to ``send2trash``. This is
    a best-effort cleanup: any failure is logged, never raised.

    Args:
        filename: Path of the file to remove; a leading ``~`` is expanded.
    """
    path = os.path.expanduser(filename)
    if not os.path.exists(path):
        logger.warning(f"{path} does not exist")
        return

    logger.info(f"Removing existing file: {path}")
    try:
        # Rename the file to a temporary name. os.replace is used instead of
        # os.rename so that a leftover "_temp" file from an earlier failed run
        # is overwritten on every platform (os.rename raises on Windows when
        # the destination already exists).
        stem, extension = os.path.splitext(path)
        temp_path = stem + "_temp" + extension
        os.replace(path, temp_path)

        # Delete the renamed file
        send2trash.send2trash(temp_path)
    except Exception as e:
        logger.error(f"Error removing file {path}: {e}")
# check if data is available
def get_available_data_years(timeout=30):
    """Return the recent years for which the ACS 5-year endpoint answers.

    Sends one state-level query per year, from the current year back through
    four years earlier, and keeps each year whose response has HTTP status 200.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        list[int]: The available years, newest first.

    Raises:
        requests.RequestException: On a network failure or timeout. This is
            deliberately not swallowed: a transport error says nothing about
            whether the year's data exists, so treating it as "unavailable"
            would silently drop a year.
    """
    current_year = datetime.now().year
    available_years = []
    # current_year, current_year - 1, ..., current_year - 4
    for year in range(current_year, current_year - 5, -1):
        response = requests.get(
            f"https://api.census.gov/data/{year}/acs/acs5?get=B01001_001E,B19013_001E&for=state:*&key={api_key}",
            timeout=timeout,
        )
        if response.status_code == 200:
            available_years.append(year)
    return available_years

# Function to create sub-batches
def create_sub_batches(group, max_batch_size):
    """Lazily split ``group`` into consecutive positional slices.

    Args:
        group: Object supporting ``len()`` and ``.iloc`` slicing (a DataFrame
            in this notebook).
        max_batch_size: Maximum number of rows per yielded slice.

    Yields:
        ``group.iloc[offset:offset + max_batch_size]`` for each offset
        ``0, max_batch_size, 2 * max_batch_size, ...`` below ``len(group)``;
        the final slice may be shorter.
    """
    row_count = len(group)
    offsets = range(0, row_count, max_batch_size)
    yield from (group.iloc[offset:offset + max_batch_size] for offset in offsets)

def estimate_income(input_csv, output_csv, percentage_increase):
    """Extrapolate median income from the latest data year to the current year.

    Reads ``input_csv``, takes the rows of its most recent ``year`` and, for
    every following year up to and including the current calendar year, emits
    a copy of those rows with ``MedianIncome`` compounded by
    ``percentage_increase`` per elapsed year. The result is appended to
    ``output_csv``; a header row is written only when that file is missing or
    empty.

    When there is nothing to estimate (the input has no rows, or its latest
    year is already the current year or later) no data rows are appended; a
    new output file then receives just the header row.

    Args:
        input_csv: CSV path with at least the columns ``MedianIncome``,
            ``RegionName``, ``year``, ``state_fip`` and ``_last_synced_``.
        output_csv: CSV path the estimates are appended to.
        percentage_increase: Annual growth as a fraction (0.03 means 3%).
    """
    output_columns = ['MedianIncome', 'RegionName', 'year', 'state_fip',
                      '_last_synced_']

    # Load the existing data
    data_df = pd.read_csv(input_csv)

    # Determine the latest year in the data
    latest_year = data_df['year'].max()
    current_year = datetime.now().year

    if pd.isna(latest_year):
        # No usable year (e.g. header-only input): nothing to extrapolate.
        years_to_estimate = range(0)
    else:
        latest_year = int(latest_year)
        years_to_estimate = range(latest_year + 1, current_year + 1)

    base_df = data_df[data_df['year'] == latest_year]
    synced_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Build one frame per estimated year and concatenate once, rather than
    # growing a DataFrame inside the loop.
    yearly_frames = [
        base_df.assign(
            MedianIncome=base_df['MedianIncome']
            * (1 + percentage_increase) ** (year - latest_year),
            year=year,
            _last_synced_=synced_at,
        )
        for year in years_to_estimate
    ]
    # With no years to estimate, fall back to a zero-row slice that still has
    # the input's columns, so the column selection below cannot raise KeyError.
    estimated_df = pd.concat(yearly_frames) if yearly_frames else data_df.iloc[0:0]

    # Append the estimated data to the output CSV file
    header = not (os.path.exists(output_csv) and os.path.getsize(output_csv) > 0)
    estimated_df[output_columns].to_csv(output_csv, mode='a', header=header, index=False)
    

In [124]:
def _request_census(url, timeout, label):
    """GET ``url``; return the response, or None after logging a transport error."""
    try:
        return requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Only the exception type is logged: the exception text can embed the
        # request URL, which carries the API key.
        logger.error(f"Request error for {label}: {type(exc).__name__}")
        return None


def _census_rows_to_frame(rows, year, fib):
    """Turn Census API data rows (header row excluded) into the output frame."""
    # The first two fields are the two requested variables, in request order;
    # the region code is read from the last field.
    # NOTE(review): taking the last field assumes the ZCTA code is the final
    # column even if a response carries an extra geography column - confirm.
    records = [(row[0], row[1], row[-1]) for row in rows]
    df = pd.DataFrame(records, columns=['Population', 'MedianIncome', 'RegionName'])
    df['year'] = year
    df['state_fip'] = fib
    df['_last_synced_'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return df


def get_pop_income_data(years, zip_grouped, timeout=60):
    """Fetch population and median income per ZIP code area and append to CSV.

    For every year and every state group, the group's ZIP codes are split into
    sub-batches of at most ``max_batch_size`` and requested from the ACS 5-year
    endpoint. Each successful response is appended to ``output_pop_income``
    (a header row is written only while that file is missing or empty). When a
    request fails with an "ambiguous geography" message it is retried once
    with the state added to the query. A batch that still fails - by HTTP
    status or by transport error - is logged and skipped.

    Args:
        years: Iterable of years to query.
        zip_grouped: Re-iterable, sized collection of ``(statefips, group)``
            pairs (a ``groupby('statefips')`` result in this notebook); each
            group needs ``statefips`` and ``RegionName`` columns.
        timeout: Seconds to wait on each HTTP request.
    """
    # Number of state groups, used only in the progress log messages.
    total_states = len(zip_grouped)

    for year in years:
        # Process each group
        for state_fips, group in zip_grouped:
            # Create sub-batches for each group
            for sub_batch in create_sub_batches(group, max_batch_size):
                fib = sub_batch['statefips'].iloc[0]
                label = f"state {fib}, year {year}"
                logger.debug(f"queryinig data for year: {year}, State Fib: {fib}/{total_states}")

                # Join all zip codes in the sub-batch into a single string.
                # Codes are left-padded to five digits because a ZIP column
                # parsed as integers loses its leading zeros.
                zip_codes = ','.join(sub_batch['RegionName'].astype(str).str.zfill(5))

                url = f"https://api.census.gov/data/{year}/acs/acs5?get=B01001_001E,B19013_001E&for=zip%20code%20tabulation%20area:{zip_codes}&key={api_key}"
                response = _request_census(url, timeout, label)

                if (
                    response is not None
                    and response.status_code != 200
                    and "ambiguous geography" in response.text
                ):
                    # Retry once, qualifying the request with the state.
                    # NOTE(review): the Census API guide expresses nesting as
                    # `&in=state:XX` with a two-digit code; confirm that this
                    # double `for=` form is accepted.
                    url = f"https://api.census.gov/data/{year}/acs/acs5?get=B01001_001E,B19013_001E&for=state:{fib}&for=zip%20code%20tabulation%20area:{zip_codes}&key={api_key}"
                    response = _request_census(url, timeout, label)

                if response is None:
                    # Transport failure already logged; move on to the next batch.
                    continue

                if response.status_code != 200:
                    logger.error(f"Failed to retrieve data for state {fib}, Status code: {response.status_code}, Response: {response.text}")
                    continue

                logger.info(f"data reterived for: {year}, State Fib: {fib}/{total_states}")
                df = _census_rows_to_frame(response.json()[1:], year, fib)

                # Check if the file exists and is not empty for each batch
                header = not (os.path.exists(output_pop_income) and os.path.getsize(output_pop_income) > 0)
                # Append to the CSV file
                df.to_csv(output_pop_income, mode='a', header=header, index=False)
                logger.info(f"Data for state {fib}, batch appended to CSV")
                    
                
def main():
    """Run the whole pipeline: clean old outputs, fetch data, estimate income.

    Steps:
        1. Remove the previous output files.
        2. Make sure the output directories exist.
        3. Find which years the API serves and fetch population/income data
           for every ZIP code in ``regions.csv``, grouped by state.
        4. Extrapolate median income up to the current year.
    """
    logger.info("Starting process ...")
    rm_file(output_pop_income)
    rm_file(output_income_estimate)

    # DataFrame.to_csv does not create missing parent directories, so create
    # them up front instead of failing after the first batch was downloaded.
    for target in (output_pop_income, output_income_estimate):
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    years = get_available_data_years()
    logger.info(f"fetching data for year: {years}")
    zips_df = pd.read_csv('regions.csv')[['RegionID', 'RegionName', 'statefips']]
    zip_grouped = zips_df.groupby('statefips')
    get_pop_income_data(years, zip_grouped)

    # If no batch succeeded there is no input for the estimation step; report
    # that instead of failing while trying to read a missing file.
    if not os.path.exists(output_pop_income):
        logger.error(f"No data was written to {output_pop_income}; skipping income estimation")
        return

    estimate_income(output_pop_income, output_income_estimate, percentage_increase)
    logger.info("Process compeleted")
    
                    

In [125]:
# --- Run configuration -------------------------------------------------------
# Directory and file paths for the generated CSV files.
data_output_path = './census_data'
output_pop_income = f'{data_output_path}/population_income.csv'
output_income_estimate = f'{data_output_path}/income_estimate.csv'
percentage_increase = 0.03  # 3% annual increase

# The Census API key is a credential and must not live in the notebook: it is
# read from the CENSUS_API_KEY environment variable. Any key that was
# previously committed in this file should be treated as leaked and revoked.
api_key = os.environ.get("CENSUS_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the CENSUS_API_KEY environment variable to your Census API key before running."
    )

# Maximum number of zip codes per batch
max_batch_size = 500
main()