In [1]:
import pandas as pd
import pathlib
import os
import pandas as pd
import numpy as np
from unidecode import unidecode
import re

In [None]:
current_path = os.getcwd()
data_path = '../data/San_Francisco_Population_and_Demographic_Census_Data_20250729.csv'
full_path = os.path.join(current_path, data_path)

In [3]:
df = pd.read_csv(full_path)


In [4]:
print(f"Original shape: {df.shape}")
print(f"Original unique geographies: {df['geography_id'].nunique()}")

Original shape: (31127, 32)
Original unique geographies: 340


In [5]:
# age_df = df[df['pums_variables'] == 'AGEP'].copy()
age_df = df.copy()
# Also ensure estimate is available
age_df = age_df[age_df['estimate'].notna()].copy()

# Convert geography_id to string (remove .0)
age_df['geography_id'] = age_df['geography_id'].astype(str).str.replace('.0', '', regex=False)

# Create time_key
age_df['time_key'] = age_df['start_year'].astype(str) + '_' + age_df['end_year'].astype(str)

print(f"Rows after filtering to AGEP (age) and valid estimate: {len(age_df)}")
print(f"Geographies after filter: {age_df['geography_id'].nunique()}")


Rows after filtering to AGEP (age) and valid estimate: 31119
Geographies after filter: 341


In [6]:
age_df

Unnamed: 0,source,num_years,start_year,end_year,derived,derived_details,geography,geography_id_name,geography_id,geography_name,...,acs_code,acs_label,acs_concept,pums_variables,overall_segment,reporting_segment,row_identifier,data_as_of,data_loaded_at,time_key
0,PUMS_acs5,5,2014,2019,True,"from public use micro data, se calculated usin...",county,GEOID,6075,"San Francisco County, California",...,,,,AGEP,PUMS_acs5 population count data from 2014 to 2...,COVID-19 vaccine reporting - age groups,PUMS_acs5 population count data from 2014 to 2...,10/20/2022 02:24:33 PM,10/20/2022 02:28:00 PM,2014_2019
1,PUMS_acs5,5,2015,2020,True,"from public use micro data, se calculated usin...",county,GEOID,6075,"San Francisco County, California",...,,,,AGEP,PUMS_acs5 population count data from 2015 to 2...,COVID-19 vaccine reporting - age groups,PUMS_acs5 population count data from 2015 to 2...,10/20/2022 02:24:34 PM,10/20/2022 02:28:00 PM,2015_2020
2,PUMS_acs5,5,2014,2019,True,"from public use micro data, se calculated usin...",county,GEOID,6075,"San Francisco County, California",...,,,,AGEP,PUMS_acs5 population count data from 2014 to 2...,COVID-19 vaccine reporting - age groups,PUMS_acs5 population count data from 2014 to 2...,10/20/2022 02:24:35 PM,10/20/2022 02:28:00 PM,2014_2019
3,PUMS_acs5,5,2015,2020,True,"from public use micro data, se calculated usin...",county,GEOID,6075,"San Francisco County, California",...,,,,AGEP,PUMS_acs5 population count data from 2015 to 2...,COVID-19 vaccine reporting - age groups,PUMS_acs5 population count data from 2015 to 2...,10/20/2022 02:24:36 PM,10/20/2022 02:28:00 PM,2015_2020
4,acs5,5,2016,2021,False,from acs table,county,GEOID,6075,"San Francisco County, California",...,1.0,Estimate!!Total:,SEX BY AGE,,acs5 population count data from 2016 to 2021 a...,from census table B01001,acs5 population count data from 2016 to 2021 a...,03/06/2023 12:09:00 PM,03/06/2023 12:32:00 PM,2016_2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31122,PUMS_acs5,5,2019,2023,True,"from public use micro data, se calculated usin...",county,GEOID,6075,"San Francisco County, California",...,,,,AGEP,PUMS_acs5 population count data from 2019 to 2...,COVID-19 cases/testing reporting - age brackets,PUMS_acs5 population count data from 2019 to 2...,03/27/2025 11:25:13 AM,03/27/2025 11:36:37 AM,2019_2023
31123,PUMS_acs5,5,2019,2023,True,"from public use micro data, se calculated usin...",county,GEOID,6075,"San Francisco County, California",...,,,,AGEP,PUMS_acs5 population count data from 2019 to 2...,COVID-19 cases/testing reporting - age brackets,PUMS_acs5 population count data from 2019 to 2...,03/27/2025 11:25:14 AM,03/27/2025 11:36:37 AM,2019_2023
31124,PUMS_acs5,5,2019,2023,True,"from public use micro data, se calculated usin...",county,GEOID,6075,"San Francisco County, California",...,,,,AGEP,PUMS_acs5 population count data from 2019 to 2...,COVID-19 cases/testing reporting - age brackets,PUMS_acs5 population count data from 2019 to 2...,03/27/2025 11:25:14 AM,03/27/2025 11:36:37 AM,2019_2023
31125,PUMS_acs5,5,2019,2023,True,"from public use micro data, se calculated usin...",county,GEOID,6075,"San Francisco County, California",...,,,,AGEP,PUMS_acs5 population count data from 2019 to 2...,COVID-19 cases/testing reporting - age brackets,PUMS_acs5 population count data from 2019 to 2...,03/27/2025 11:25:14 AM,03/27/2025 11:36:37 AM,2019_2023


In [7]:
def clean_text(x):
    if pd.isna(x):
        return "Unknown"
    x = str(x).strip()
    x = unidecode(x)  # Remove accents
    x = re.sub(r'\s+', ' ', x)  # Normalize whitespace
    return x.lower()

# Apply to key fields
age_df['reporting_segment'] = age_df['reporting_segment'].apply(clean_text)
age_df['overall_segment'] = age_df['overall_segment'].apply(clean_text)
age_df['acs_concept'] = age_df['acs_concept'].apply(clean_text)

In [8]:
# 🔹 dim_geography
dim_geography = age_df[['geography_id', 'geography_name', 'geography']].drop_duplicates().reset_index(drop=True)
dim_geography = dim_geography.rename(columns={'geography': 'geography_type'})
dim_geography['geography_id_key'] = range(1, len(dim_geography) + 1)
dim_geography = dim_geography[[
    'geography_id_key', 'geography_id', 'geography_name', 'geography_type'
]]

In [9]:
# 🔹 dim_demographic
dim_demographic = age_df[[
    'demographic_category', 'demographic_category_label',
    'min_age', 'max_age', 'unit', 'pums_variables'
]].drop_duplicates().reset_index(drop=True)
dim_demographic['min_age'] = pd.to_numeric(dim_demographic['min_age'], errors='coerce')
dim_demographic['max_age'] = pd.to_numeric(dim_demographic['max_age'], errors='coerce')
dim_demographic['unit'] = dim_demographic['unit'].apply(clean_text)
dim_demographic['demographic_id'] = range(1, len(dim_demographic) + 1)

In [10]:
# 🔹 dim_time
dim_time = age_df[['start_year', 'end_year', 'num_years']].drop_duplicates().reset_index(drop=True)
dim_time['start_year'] = pd.to_numeric(dim_time['start_year'])
dim_time['end_year'] = pd.to_numeric(dim_time['end_year'])
dim_time['time_key'] = dim_time['start_year'].astype(str) + '_' + dim_time['end_year'].astype(str)
dim_time['time_id'] = range(1, len(dim_time) + 1)

In [11]:
# 🔹 dim_source
dim_source = age_df[[
    'source', 'derived', 'derived_details', 'acs_concept'
]].drop_duplicates().reset_index(drop=True)
dim_source = dim_source.rename(columns={'source': 'source_name'})
dim_source['source_id'] = range(1, len(dim_source) + 1)

In [12]:
# 🔹 dim_reporting_segment
dim_reporting = age_df[[
    'overall_segment', 'reporting_segment'
]].drop_duplicates().reset_index(drop=True)
dim_reporting['reporting_segment_id'] = range(1, len(dim_reporting) + 1)

In [13]:
# dim_demographic = dim_demographic.rename(columns={'pums_variable': 'pums_variables'})

In [14]:
dim_source['source_name']

0     PUMS_acs5
1          acs5
2          acs5
3          acs5
4          acs5
5          acs5
6          acs5
7          acs5
8          acs5
9          acs1
10         acs1
11         acs1
12    PUMS_acs1
Name: source_name, dtype: object

In [15]:
age_df = age_df.rename(columns={'source': 'source_name'})

In [16]:
age_df.columns

Index(['source_name', 'num_years', 'start_year', 'end_year', 'derived',
       'derived_details', 'geography', 'geography_id_name', 'geography_id',
       'geography_name', 'demographic_category', 'demographic_category_label',
       'min_age', 'max_age', 'unit', 'estimate', 'se', 'moe', 'cv', 'reliable',
       'upper_ci', 'lower_ci', 'acs_table', 'acs_code', 'acs_label',
       'acs_concept', 'pums_variables', 'overall_segment', 'reporting_segment',
       'row_identifier', 'data_as_of', 'data_loaded_at', 'time_key'],
      dtype='object')

In [17]:

# Start fresh
fact_table = age_df.copy()

# Drop any previous keys (to avoid _x/_y)
for col in ['geography_id_key', 'demographic_id', 'time_id', 'source_id', 'reporting_segment_id']:
    if col in fact_table.columns:
        fact_table.drop(columns=[col], inplace=True)

# Merge geography
fact_table = fact_table.merge(
    dim_geography[['geography_id', 'geography_id_key']],
    on='geography_id',
    how='left'
)

# Merge demographic
fact_table = fact_table.merge(
    dim_demographic[[
        'demographic_category', 'demographic_category_label',
        'min_age', 'max_age', 'unit', 'pums_variables', 'demographic_id'
    ]],
    on=['demographic_category', 'demographic_category_label', 'min_age', 'max_age', 'unit', 'pums_variables'],
    how='left'
)

# Merge time
fact_table = fact_table.merge(
    dim_time[['start_year', 'end_year', 'time_id']],
    on=['start_year', 'end_year'],
    how='left'
)

# Merge source
fact_table = fact_table.merge(
    dim_source[['source_name', 'derived', 'derived_details', 'acs_concept', 'source_id']],
    on=['source_name', 'derived', 'derived_details', 'acs_concept'],
    how='left'
)

# Merge reporting
fact_table = fact_table.merge(
    dim_reporting[['overall_segment', 'reporting_segment', 'reporting_segment_id']],
    on=['overall_segment', 'reporting_segment'],
    how='left'
)


In [18]:

# Select final columns
fact_table = fact_table[[
    'geography_id_key',
    'demographic_id',
    'time_id',
    'source_id',
    'reporting_segment_id',
    'estimate',
    'se',
    'moe',
    'cv',
    'reliable',
    'upper_ci',
    'lower_ci'
]].copy()

# Rename geography_id_key to geography_id for fact table
fact_table = fact_table.rename(columns={'geography_id_key': 'geography_id'})

# Drop rows with missing keys
fact_table.dropna(subset=['geography_id', 'demographic_id', 'time_id', 'source_id', 'reporting_segment_id'], inplace=True)

# Convert IDs to int
id_cols = ['geography_id', 'demographic_id', 'time_id', 'source_id', 'reporting_segment_id']
fact_table[id_cols] = fact_table[id_cols].astype(int)

# -------------------------------
# 7. Final Output
# -------------------------------

print("✅ Final Shapes:")
print("fact_table:", fact_table.shape)
print("dim_geography:", dim_geography.shape)
print("dim_demographic:", dim_demographic.shape)
print("dim_time:", dim_time.shape)
print("dim_source:", dim_source.shape)
print("dim_reporting:", dim_reporting.shape)

# Optional: Save to CSV
# fact_table.to_csv("fact_population_estimate.csv", index=False)
# dim_geography.to_csv("dim_geography.csv", index=False)
# dim_demographic.to_csv("dim_demographic.csv", index=False)
# dim_time.to_csv("dim_time.csv", index=False)
# dim_source.to_csv("dim_source.csv", index=False)
# dim_reporting.to_csv("dim_reporting_segment.csv", index=False)

print("✅ Star schema built successfully!")

✅ Final Shapes:
fact_table: (307922, 12)
dim_geography: (920, 4)
dim_demographic: (598, 7)
dim_time: (17, 5)
dim_source: (13, 5)
dim_reporting: (163, 3)
✅ Star schema built successfully!


In [19]:
fact_table

Unnamed: 0,geography_id,demographic_id,time_id,source_id,reporting_segment_id,estimate,se,moe,cv,reliable,upper_ci,lower_ci
0,1,1,1,1,1,757193.0,1148.231183,1888.840296,0.001516,True,759478.497000,754907.503000
1,1,1,2,1,2,757150.0,1552.237063,2553.429969,0.002050,True,760239.650589,754060.349411
2,1,2,1,1,1,292025.0,691.494866,1137.509055,0.002368,True,293401.386102,290648.613898
3,1,2,2,1,2,294363.0,1015.866945,1671.101125,0.003451,True,296385.032574,292340.967426
4,1,3,3,2,3,865933.0,0.000000,0.000000,0.000000,True,865933.000000,865933.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
307977,1,59,11,1,163,114527.0,1428.496692,2349.877059,0.012473,True,117370.351542,111683.648458
307978,1,60,11,1,163,107335.0,1083.991075,1783.165318,0.010099,True,109492.630262,105177.369738
307979,1,61,11,1,163,91619.0,1388.505257,2284.091149,0.015155,True,94382.750582,88855.249418
307980,1,62,11,1,163,62417.0,1253.747642,2062.414871,0.020087,True,64912.522258,59921.477742


In [20]:
print("✅ Final Shapes:")
print("fact_table:", fact_table.shape)
print("dim_geography:", dim_geography.shape)
print("dim_demographic:", dim_demographic.shape)
print("dim_time:", dim_time.shape)
print("dim_source:", dim_source.shape)
print("dim_reporting:", dim_reporting.shape)

✅ Final Shapes:
fact_table: (307922, 12)
dim_geography: (920, 4)
dim_demographic: (598, 7)
dim_time: (17, 5)
dim_source: (13, 5)
dim_reporting: (163, 3)


### ////


In [21]:

print(df.head())


      source  num_years  start_year  end_year  derived  \
0  PUMS_acs5          5        2014      2019     True   
1  PUMS_acs5          5        2015      2020     True   
2  PUMS_acs5          5        2014      2019     True   
3  PUMS_acs5          5        2015      2020     True   
4       acs5          5        2016      2021    False   

                                     derived_details geography  \
0  from public use micro data, se calculated usin...    county   
1  from public use micro data, se calculated usin...    county   
2  from public use micro data, se calculated usin...    county   
3  from public use micro data, se calculated usin...    county   
4                                     from acs table    county   

  geography_id_name  geography_id                    geography_name  ...  \
0             GEOID        6075.0  San Francisco County, California  ...   
1             GEOID        6075.0  San Francisco County, California  ...   
2             GEOID       

In [22]:

print(df.columns)

Index(['source', 'num_years', 'start_year', 'end_year', 'derived',
       'derived_details', 'geography', 'geography_id_name', 'geography_id',
       'geography_name', 'demographic_category', 'demographic_category_label',
       'min_age', 'max_age', 'unit', 'estimate', 'se', 'moe', 'cv', 'reliable',
       'upper_ci', 'lower_ci', 'acs_table', 'acs_code', 'acs_label',
       'acs_concept', 'pums_variables', 'overall_segment', 'reporting_segment',
       'row_identifier', 'data_as_of', 'data_loaded_at'],
      dtype='object')


In [23]:
df.shape

(31127, 32)

In [24]:
df[2:7].tail()

Unnamed: 0,source,num_years,start_year,end_year,derived,derived_details,geography,geography_id_name,geography_id,geography_name,...,acs_table,acs_code,acs_label,acs_concept,pums_variables,overall_segment,reporting_segment,row_identifier,data_as_of,data_loaded_at
2,PUMS_acs5,5,2014,2019,True,"from public use micro data, se calculated usin...",county,GEOID,6075.0,"San Francisco County, California",...,,,,,AGEP,PUMS_acs5 population count data from 2014 to 2...,COVID-19 vaccine reporting - age groups,PUMS_acs5 population count data from 2014 to 2...,10/20/2022 02:24:35 PM,10/20/2022 02:28:00 PM
3,PUMS_acs5,5,2015,2020,True,"from public use micro data, se calculated usin...",county,GEOID,6075.0,"San Francisco County, California",...,,,,,AGEP,PUMS_acs5 population count data from 2015 to 2...,COVID-19 vaccine reporting - age groups,PUMS_acs5 population count data from 2015 to 2...,10/20/2022 02:24:36 PM,10/20/2022 02:28:00 PM
4,acs5,5,2016,2021,False,from acs table,county,GEOID,6075.0,"San Francisco County, California",...,B01001,1.0,Estimate!!Total:,SEX BY AGE,,acs5 population count data from 2016 to 2021 a...,from census table B01001,acs5 population count data from 2016 to 2021 a...,03/06/2023 12:09:00 PM,03/06/2023 12:32:00 PM
5,acs5,5,2016,2021,False,from acs table,county,GEOID,6075.0,"San Francisco County, California",...,B03002,12.0,Estimate!!Total:!!Hispanic or Latino:,HISPANIC OR LATINO ORIGIN BY RACE,,acs5 population count data from 2016 to 2021 a...,from census table B03002,acs5 population count data from 2016 to 2021 a...,03/06/2023 12:09:00 PM,03/06/2023 12:32:00 PM
6,acs5,5,2016,2021,False,from acs table,county,GEOID,6075.0,"San Francisco County, California",...,B03002,3.0,Estimate!!Total:!!Not Hispanic or Latino:!!Whi...,HISPANIC OR LATINO ORIGIN BY RACE,,acs5 population count data from 2016 to 2021 a...,from census table B03002,acs5 population count data from 2016 to 2021 a...,03/06/2023 12:09:01 PM,03/06/2023 12:32:00 PM


In [25]:
null_columns = df.columns[df.isna().any()]
print(null_columns)

Index(['geography_id_name', 'geography_id', 'estimate', 'se', 'moe', 'cv',
       'upper_ci', 'lower_ci', 'acs_table', 'acs_code', 'acs_label',
       'acs_concept', 'pums_variables'],
      dtype='object')


In [26]:
df[null_columns].isna().sum()

geography_id_name     4402
geography_id          4402
estimate                 8
se                      12
moe                     12
cv                    3438
upper_ci                20
lower_ci                20
acs_table             1945
acs_code              3057
acs_label             3057
acs_concept           3045
pums_variables       29182
dtype: int64

### Data Cleaning and Exploration


In [27]:
clean_df = df[
    (df['geography_id'].notna()) & 
    (df['estimate'].notna())
].copy()

print(f"Rows after filtering to estimate rows: {len(clean_df)}")

# Optional: Also ensure it's PUMS_acs5 and county-level if needed
clean_df = clean_df[
    (clean_df['source'] == 'PUMS_acs5') &
    (clean_df['geography'] == 'county')
].copy()

Rows after filtering to estimate rows: 26717


In [28]:
print("\nNull counts after filtering:")
print(clean_df[[
    'estimate', 'se', 'moe', 'cv', 'upper_ci', 'lower_ci',
    'acs_table', 'acs_code', 'pums_variables'
]].isna().sum())


Null counts after filtering:
estimate             0
se                   0
moe                  0
cv                   0
upper_ci             0
lower_ci             0
acs_table         1325
acs_code          1325
pums_variables       0
dtype: int64


In [29]:
clean_df.columns

Index(['source', 'num_years', 'start_year', 'end_year', 'derived',
       'derived_details', 'geography', 'geography_id_name', 'geography_id',
       'geography_name', 'demographic_category', 'demographic_category_label',
       'min_age', 'max_age', 'unit', 'estimate', 'se', 'moe', 'cv', 'reliable',
       'upper_ci', 'lower_ci', 'acs_table', 'acs_code', 'acs_label',
       'acs_concept', 'pums_variables', 'overall_segment', 'reporting_segment',
       'row_identifier', 'data_as_of', 'data_loaded_at'],
      dtype='object')

In [30]:
clean_df.head()

Unnamed: 0,source,num_years,start_year,end_year,derived,derived_details,geography,geography_id_name,geography_id,geography_name,...,acs_table,acs_code,acs_label,acs_concept,pums_variables,overall_segment,reporting_segment,row_identifier,data_as_of,data_loaded_at
0,PUMS_acs5,5,2014,2019,True,"from public use micro data, se calculated usin...",county,GEOID,6075.0,"San Francisco County, California",...,,,,,AGEP,PUMS_acs5 population count data from 2014 to 2...,COVID-19 vaccine reporting - age groups,PUMS_acs5 population count data from 2014 to 2...,10/20/2022 02:24:33 PM,10/20/2022 02:28:00 PM
1,PUMS_acs5,5,2015,2020,True,"from public use micro data, se calculated usin...",county,GEOID,6075.0,"San Francisco County, California",...,,,,,AGEP,PUMS_acs5 population count data from 2015 to 2...,COVID-19 vaccine reporting - age groups,PUMS_acs5 population count data from 2015 to 2...,10/20/2022 02:24:34 PM,10/20/2022 02:28:00 PM
2,PUMS_acs5,5,2014,2019,True,"from public use micro data, se calculated usin...",county,GEOID,6075.0,"San Francisco County, California",...,,,,,AGEP,PUMS_acs5 population count data from 2014 to 2...,COVID-19 vaccine reporting - age groups,PUMS_acs5 population count data from 2014 to 2...,10/20/2022 02:24:35 PM,10/20/2022 02:28:00 PM
3,PUMS_acs5,5,2015,2020,True,"from public use micro data, se calculated usin...",county,GEOID,6075.0,"San Francisco County, California",...,,,,,AGEP,PUMS_acs5 population count data from 2015 to 2...,COVID-19 vaccine reporting - age groups,PUMS_acs5 population count data from 2015 to 2...,10/20/2022 02:24:36 PM,10/20/2022 02:28:00 PM
25,PUMS_acs5,5,2014,2019,True,"from public use micro data, se calculated usin...",county,GEOID,6075.0,"San Francisco County, California",...,,,,,"RAC1P, HISP, AGEP",PUMS_acs5 population count data from 2014 to 2...,COVID-19 vaccine reporting - primary age brack...,PUMS_acs5 population count data from 2014 to 2...,09/01/2022 12:22:00 PM,09/01/2022 12:52:00 PM


In [31]:
clean_df['geography_id'] = clean_df['geography_id'].astype(str).str.replace('.0', '', regex=False)

In [32]:
clean_df['age_bracket'] = clean_df.apply(
    lambda row: f"{int(row['min_age'])}_{int(row['max_age'])}" 
    if pd.notna(row['min_age']) and pd.notna(row['max_age']) 
    else "Unknown",
    axis=1
)

# Use this as key for demographic dimension
clean_df['demographic_category'] = clean_df['demographic_category'].fillna(clean_df['age_bracket'])

In [33]:
clean_df['time_key'] = clean_df['start_year'].astype(str) + "_" + clean_df['end_year'].astype(str)
# e.g., "2019_2023"

In [34]:
clean_df.columns

Index(['source', 'num_years', 'start_year', 'end_year', 'derived',
       'derived_details', 'geography', 'geography_id_name', 'geography_id',
       'geography_name', 'demographic_category', 'demographic_category_label',
       'min_age', 'max_age', 'unit', 'estimate', 'se', 'moe', 'cv', 'reliable',
       'upper_ci', 'lower_ci', 'acs_table', 'acs_code', 'acs_label',
       'acs_concept', 'pums_variables', 'overall_segment', 'reporting_segment',
       'row_identifier', 'data_as_of', 'data_loaded_at', 'age_bracket',
       'time_key'],
      dtype='object')

In [35]:
fact_columns = [
    'geography_id',
    'geography_name',
    'demographic_category',
    'demographic_category_label',
    'min_age',
    'max_age',
    'unit',
    'estimate',
    'se',
    'moe',
    'cv',
    'reliable',
    'upper_ci',
    'lower_ci',
    'acs_table',
    'acs_code',
    'acs_label',
    'acs_concept',
    'pums_variables',
    'source',
    'num_years',
    'start_year',
    'end_year',
    'time_key',
    'data_as_of',
    'data_loaded_at',
    'reporting_segment',
    'overall_segment'
]

clean_df = clean_df[fact_columns].reset_index(drop=True)

In [36]:
clean_df.to_csv("cleaned_acs_population_estimates.csv", index=False)
print("Cleaned data saved.")

Cleaned data saved.


In [37]:
clean_df.columns

Index(['geography_id', 'geography_name', 'demographic_category',
       'demographic_category_label', 'min_age', 'max_age', 'unit', 'estimate',
       'se', 'moe', 'cv', 'reliable', 'upper_ci', 'lower_ci', 'acs_table',
       'acs_code', 'acs_label', 'acs_concept', 'pums_variables', 'source',
       'num_years', 'start_year', 'end_year', 'time_key', 'data_as_of',
       'data_loaded_at', 'reporting_segment', 'overall_segment'],
      dtype='object')

### Build Fact & Dimension Tables


In [38]:
# 1. dim_geography
dim_geography = clean_df[['geography_id', 'geography_name',]].drop_duplicates()
dim_geography = dim_geography.rename(columns={'geography': 'geography_type'})
dim_geography['geography_id_key'] = range(1, len(dim_geography) + 1)
dim_geography = dim_geography[
    ['geography_id_key', 'geography_id', 'geography_name']
]

In [39]:
# dim_demographic
dim_demographic = clean_df[[
    'demographic_category', 'demographic_category_label',
    'min_age', 'max_age', 'unit', 'pums_variables'
]].drop_duplicates().reset_index(drop=True)
dim_demographic['demographic_id'] = range(1, len(dim_demographic) + 1)

In [40]:
# dim_time
dim_time = clean_df[[
    'start_year', 'end_year', 'num_years', 'time_key'
]].drop_duplicates().reset_index(drop=True)
dim_time['time_id'] = range(1, len(dim_time) + 1)

In [41]:
# dim_source
dim_source = clean_df[[
    'source', 'acs_concept',
    'data_as_of', 'data_loaded_at'
]].drop_duplicates().reset_index(drop=True)
dim_source['source_id'] = range(1, len(dim_source) + 1)

In [42]:
# dim_reporting_segment
dim_reporting = clean_df[[
    'overall_segment', 'reporting_segment'
]].drop_duplicates().reset_index(drop=True)
dim_reporting['reporting_segment_id'] = range(1, len(dim_reporting) + 1)

In [43]:
# Merge in keys
fact_table = clean_df.merge(dim_geography[['geography_id', 'geography_id_key']], on='geography_id') \
                     .merge(dim_demographic, on=[
                         'demographic_category', 'demographic_category_label',
                         'min_age', 'max_age', 'unit', 'pums_variables'
                     ]) \
                     .merge(dim_time[['time_key', 'time_id']], on='time_key') \
                     .merge(dim_source[['source', 'source_id']], on='source') \
                     .merge(dim_reporting[['reporting_segment', 'reporting_segment_id']], on='reporting_segment')

# Final fact table
fact_table = fact_table[[
    'geography_id_key',
    'demographic_id',
    'time_id',
    'source_id',
    'reporting_segment_id',
    'estimate',
    'se',
    'moe',
    'cv',
    'reliable',
    'upper_ci',
    'lower_ci'
]].copy()

fact_table.rename(columns={'geography_id_key': 'geography_id'}, inplace=True)

In [54]:
fact_table.isna().sum()

geography_id            0
demographic_id          0
time_id                 0
source_id               0
reporting_segment_id    0
estimate                0
se                      0
moe                     0
cv                      0
reliable                0
upper_ci                0
lower_ci                0
dtype: int64

In [45]:
dim_geography

Unnamed: 0,geography_id_key,geography_id,geography_name
0,1,6075,"San Francisco County, California"


In [46]:
dim_source

Unnamed: 0,source,acs_concept,data_as_of,data_loaded_at,source_id
0,PUMS_acs5,,10/20/2022 02:24:33 PM,10/20/2022 02:28:00 PM,1
1,PUMS_acs5,,10/20/2022 02:24:34 PM,10/20/2022 02:28:00 PM,2
2,PUMS_acs5,,10/20/2022 02:24:35 PM,10/20/2022 02:28:00 PM,3
3,PUMS_acs5,,10/20/2022 02:24:36 PM,10/20/2022 02:28:00 PM,4
4,PUMS_acs5,,09/01/2022 12:22:00 PM,09/01/2022 12:52:00 PM,5
...,...,...,...,...,...
205,PUMS_acs5,,03/27/2025 11:25:10 AM,03/27/2025 11:36:37 AM,206
206,PUMS_acs5,,03/27/2025 11:25:11 AM,03/27/2025 11:36:37 AM,207
207,PUMS_acs5,,03/27/2025 11:25:12 AM,03/27/2025 11:36:37 AM,208
208,PUMS_acs5,,03/27/2025 11:25:13 AM,03/27/2025 11:36:37 AM,209


In [47]:
dim_demographic

Unnamed: 0,demographic_category,demographic_category_label,min_age,max_age,unit,pums_variables,demographic_id
0,age,18 + years,18,120,population count,AGEP,1
1,age,50 + years,50,120,population count,AGEP,2
2,age / race,55 - 64 years & Native Hawaiian or Other Pacif...,55,64,population count,"RAC1P, HISP, AGEP",3
3,age / race,"55 - 64 years & Some other race alone, Not His...",55,64,population count,"RAC1P, HISP, AGEP",4
4,age / race,"55 - 64 years & White alone, Not Hispanic or L...",55,64,population count,"RAC1P, HISP, AGEP",5
...,...,...,...,...,...,...,...
540,age,60 + years & Hispanic or Latino:,60,120,population count,"RAC1P, HISP, AGEP",541
541,age,"60 + years & Multi-Racial, Not Hispanic or Lat...",60,120,population count,"RAC1P, HISP, AGEP",542
542,age,60 + years & Native Hawaiian or Other Pacific ...,60,120,population count,"RAC1P, HISP, AGEP",543
543,age,"60 + years & Some other race alone, Not Hispan...",60,120,population count,"RAC1P, HISP, AGEP",544


In [48]:
dim_reporting

Unnamed: 0,overall_segment,reporting_segment,reporting_segment_id
0,PUMS_acs5 population count data from 2014 to 2...,COVID-19 vaccine reporting - age groups,1
1,PUMS_acs5 population count data from 2015 to 2...,COVID-19 vaccine reporting - age groups,2
2,PUMS_acs5 population count data from 2014 to 2...,COVID-19 vaccine reporting - primary age brack...,3
3,PUMS_acs5 population count data from 2017 to 2...,MPX reporting,4
4,PUMS_acs5 population count data from 2014 to 2...,COVID-19 hospitalizations reporting - age brac...,5
5,PUMS_acs5 population count data from 2015 to 2...,COVID-19 vaccine reporting - primary age brackets,6
6,PUMS_acs5 population count data from 2015 to 2...,COVID-19 vaccine reporting - alternate age bra...,7
7,PUMS_acs5 population count data from 2015 to 2...,COVID-19 cases/testing reporting - age brackets,8
8,PUMS_acs5 population count data from 2015 to 2...,COVID-19 cases/testing reporting - age brackets,9
9,PUMS_acs5 population count data from 2015 to 2...,COVID-19 vaccine reporting - primary age brack...,10


In [49]:
print("dim_geography shape:", dim_geography.shape)
print("dim_demographic shape:", dim_demographic.shape)
print("dim_time shape:", dim_time.shape)
print("dim_source shape:", dim_source.shape)
print("dim_reporting shape:", dim_reporting.shape)

dim_geography shape: (1, 3)
dim_demographic shape: (545, 7)
dim_time shape: (6, 5)
dim_source shape: (210, 5)
dim_reporting shape: (56, 3)


In [50]:
fact_table

Unnamed: 0,geography_id,demographic_id,time_id,source_id,reporting_segment_id,estimate,se,moe,cv,reliable,upper_ci,lower_ci
0,1,1,1,1,1,757193.0,1148.231183,1888.840296,0.001516,True,759478.497000,754907.503000
1,1,1,1,1,2,757193.0,1148.231183,1888.840296,0.001516,True,759478.497000,754907.503000
2,1,1,1,1,25,757193.0,1148.231183,1888.840296,0.001516,True,759478.497000,754907.503000
3,1,1,1,1,42,757193.0,1148.231183,1888.840296,0.001516,True,759478.497000,754907.503000
4,1,1,1,2,1,757193.0,1148.231183,1888.840296,0.001516,True,759478.497000,754907.503000
...,...,...,...,...,...,...,...,...,...,...,...,...
1053775,1,41,6,210,26,38228.0,826.197706,1359.095227,0.021612,True,39872.505398,36583.494602
1053776,1,41,6,210,27,38228.0,826.197706,1359.095227,0.021612,True,39872.505398,36583.494602
1053777,1,41,6,210,43,38228.0,826.197706,1359.095227,0.021612,True,39872.505398,36583.494602
1053778,1,41,6,210,44,38228.0,826.197706,1359.095227,0.021612,True,39872.505398,36583.494602


In [53]:
dim_demographic['pums_variables'].value_counts()

pums_variables
RAC1P, HISP, AGEP         366
RAC1P, HISP, AGEP, SEX    141
AGEP                       38
Name: count, dtype: int64