**PRELIMINARY NOTEBOOK SETUP**

In [1]:
# load necessary imports
%pip install google-cloud-bigquery-storage
from google.cloud import bigquery
from google.cloud import bigquery_storage
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os

# Load credentials (adjust path as needed)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/callummagarian/Desktop/Machine-Learning-Final/credentials/acs-dp05-03-analysis-43a26958b715.json"
load_dotenv()

Note: you may need to restart the kernel to use updated packages.


False

In [2]:
# define BigQuery client
client = bigquery.Client(project="acs-dp05-03-analysis")
# query the full ACS census tract table:
query = """
SELECT *
FROM `bigquery-public-data.census_bureau_acs.censustract_2020_5yr`
"""

# run query
query_job = client.query(query)

acs_df = query_job.result().to_dataframe(create_bqstorage_client=False)

print("Loaded ACS dataframe with shape:", acs_df.shape)
acs_df.head()

Loaded ACS dataframe with shape: (85395, 245)


Unnamed: 0,geo_id,aggregate_travel_time_to_work,amerindian_including_hispanic,amerindian_pop,armed_forces,asian_including_hispanic,asian_male_45_54,asian_male_55_64,asian_pop,associates_degree,...,vacant_housing_units,vacant_housing_units_for_rent,vacant_housing_units_for_sale,walked_to_work,white_including_hispanic,white_male_45_54,white_male_55_64,white_pop,worked_at_home,workers_16_and_over
0,1001020200,18595.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,72.0,...,147.0,32.0,0.0,0.0,647.0,89.0,32.0,641.0,10.0,700.0
1,1001020300,47570.0,0.0,0.0,21.0,44.0,0.0,0.0,44.0,243.0,...,113.0,0.0,0.0,34.0,2363.0,143.0,232.0,2363.0,32.0,1959.0
2,1001020400,37660.0,10.0,10.0,10.0,17.0,0.0,0.0,17.0,257.0,...,105.0,23.0,25.0,28.0,3097.0,121.0,258.0,3085.0,45.0,1583.0
3,1001020503,28605.0,0.0,0.0,96.0,18.0,0.0,0.0,18.0,252.0,...,0.0,0.0,0.0,22.0,2397.0,218.0,105.0,2397.0,121.0,1606.0
4,1001020600,23255.0,0.0,0.0,40.0,11.0,0.0,0.0,11.0,145.0,...,134.0,43.0,22.0,6.0,2667.0,193.0,176.0,2551.0,80.0,1212.0


In [3]:
# DP05: Demographic features
dp05_cols = [
    "total_pop", "male_pop", "female_pop", "male_under_5", "male_5_to_9",
    "male_10_to_14", "male_15_to_17", "male_18_to_19", "male_20", "male_21",
    "male_22_to_24", "male_25_to_29", "male_30_to_34", "male_35_to_39",
    "male_40_to_44", "male_45_to_49", "male_50_to_54", "male_55_to_59",
    "male_60_to_61", "male_62_to_64", "male_65_to_66", "male_67_to_69",
    "male_70_to_74", "male_75_to_79", "male_80_to_84", "male_85_and_over",
    "female_under_5", "female_5_to_9", "female_10_to_14", "female_15_to_17",
    "female_18_to_19", "female_20", "female_21", "female_22_to_24", "female_25_to_29",
    "female_30_to_34", "female_35_to_39", "female_40_to_44", "female_45_to_49",
    "female_50_to_54", "female_55_to_59", "female_60_to_61", "female_62_to_64", 
    "female_65_to_66", "female_67_to_69", "female_70_to_74", "female_75_to_79",
    "female_80_to_84", "female_85_and_over", "white_pop", "black_pop", "asian_pop",
    "amerindian_pop", "other_race_pop", "two_or_more_races_pop", "hispanic_pop",
    "white_including_hispanic", "black_including_hispanic","asian_including_hispanic",
]
acs_dp05_df = acs_df[dp05_cols + ["geo_id"]]
print("DP05 dataframe shape:", acs_dp05_df.shape)
acs_dp05_df.head()

DP05 dataframe shape: (85395, 60)


Unnamed: 0,total_pop,male_pop,female_pop,male_under_5,male_5_to_9,male_10_to_14,male_15_to_17,male_18_to_19,male_20,male_21,...,black_pop,asian_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_pop,white_including_hispanic,black_including_hispanic,asian_including_hispanic,geo_id
0,1757.0,1010.0,747.0,81.0,77.0,22.0,52.0,16.0,0.0,5.0,...,1026.0,0.0,0.0,14.0,46.0,30.0,647.0,1026.0,0.0,1001020200
1,3694.0,1839.0,1855.0,58.0,134.0,227.0,74.0,25.0,24.0,29.0,...,1042.0,44.0,0.0,0.0,65.0,180.0,2363.0,1042.0,44.0,1001020300
2,3539.0,1794.0,1745.0,41.0,80.0,96.0,19.0,14.0,48.0,7.0,...,309.0,17.0,10.0,0.0,101.0,17.0,3097.0,309.0,17.0,1001020400
3,3268.0,1612.0,1656.0,39.0,89.0,123.0,129.0,8.0,0.0,12.0,...,659.0,18.0,0.0,0.0,194.0,0.0,2397.0,659.0,18.0,1001020503
4,3536.0,1591.0,1945.0,112.0,206.0,82.0,54.0,40.0,0.0,0.0,...,716.0,11.0,0.0,25.0,47.0,186.0,2667.0,716.0,11.0,1001020600


**CLEAN DATA**

In [None]:
# dimension and shape
acs_dp05_df.shape
acs_dp05_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85395 entries, 0 to 85394
Data columns (total 60 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   total_pop                 85395 non-null  float64
 1   male_pop                  85395 non-null  float64
 2   female_pop                85395 non-null  float64
 3   male_under_5              85395 non-null  float64
 4   male_5_to_9               69440 non-null  float64
 5   male_10_to_14             85395 non-null  float64
 6   male_15_to_17             85395 non-null  float64
 7   male_18_to_19             85395 non-null  float64
 8   male_20                   85395 non-null  float64
 9   male_21                   85395 non-null  float64
 10  male_22_to_24             85395 non-null  float64
 11  male_25_to_29             85395 non-null  float64
 12  male_30_to_34             85395 non-null  float64
 13  male_35_to_39             85395 non-null  float64
 14  male_4

In [5]:
# handlingsentinel codes
sentinel_values = [-666666666, -777777777, -888888888, -999999999]
acs_dp05_df.replace(sentinel_values, pd.NA, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acs_dp05_df.replace(sentinel_values, pd.NA, inplace=True)


In [None]:
# examine data types
acs_dp05_df.dtypes

total_pop                   float64
male_pop                    float64
female_pop                  float64
male_under_5                float64
male_5_to_9                 float64
male_10_to_14               float64
male_15_to_17               float64
male_18_to_19               float64
male_20                     float64
male_21                     float64
male_22_to_24               float64
male_25_to_29               float64
male_30_to_34               float64
male_35_to_39               float64
male_40_to_44               float64
male_45_to_49               float64
male_50_to_54               float64
male_55_to_59               float64
male_60_to_61               float64
male_62_to_64               float64
male_65_to_66               float64
male_67_to_69               float64
male_70_to_74               float64
male_75_to_79               float64
male_80_to_84               float64
male_85_and_over            float64
female_under_5              float64
female_5_to_9               

In [7]:
# check for missing values
acs_dp05_df.isnull().sum()

total_pop                       0
male_pop                        0
female_pop                      0
male_under_5                    0
male_5_to_9                 15955
male_10_to_14                   0
male_15_to_17                   0
male_18_to_19                   0
male_20                         0
male_21                         0
male_22_to_24                   0
male_25_to_29                   0
male_30_to_34                   0
male_35_to_39                   0
male_40_to_44                   0
male_45_to_49                   0
male_50_to_54               29520
male_55_to_59                   0
male_60_to_61                   0
male_62_to_64                   0
male_65_to_66                   0
male_67_to_69               40695
male_70_to_74                   0
male_75_to_79                   0
male_80_to_84                   0
male_85_and_over                0
female_under_5                  0
female_5_to_9                   0
female_10_to_14                 0
female_15_to_1