In [1]:
import censusdis.data as ced
import pandas as pd
import string

In [2]:
pd.set_option('display.max_colwidth', None)

## Census categories

Here are some common census categories or classifications:

Demographic Characteristics:

* Age
* Sex
* Race
* Ethnicity (Hispanic or Latino origin)
* Marital status
* Household relationship

Economic Characteristics:

* Employment status
* Industry and occupation
* Income and earnings
* Poverty status
* Employment status

Social Characteristics:

* Educational attainment
* School enrollment
* Language spoken at home
* Ancestry
* Disability status
* Migration/Residence one year ago

Housing Characteristics:

* Housing occupancy (owned or rented)
* Housing value
* Rent
* Number of rooms and bedrooms
* Year structure built
* Plumbing and kitchen facilities

Geographical Characteristics:

* Urban and rural classification
* Geographic regions and divisions
* States, counties, and cities
* Census tracts and blocks

Family and Household Characteristics:

* Family composition
* Household size and type
* Presence of children
* Presence of elderly

In [3]:
def validate_xyear(var_variable, var_group, var_dataset, var_years):
    var_df = list() 
    for year in var_years:
        # identify whether group is a dictionary or string
        if isinstance(var_group, dict):
            group = var_group[year]
        elif isinstance(var_group, str):
            group = var_group
        # identify whether dataset is a dictionary or string
        if isinstance(var_dataset, dict):
            dataset = var_dataset[year]
        elif isinstance(var_dataset, str):
            dataset = var_dataset
        # identify whether variable is a dictionary or string
        if isinstance(var_variable, dict):
            variable = var_variable[year]
        elif isinstance(var_variable, str):
            variable = var_variable
        # make request
        var = ced.variables.all_variables(dataset, year, group)
        # make sure variable is in dataset
        if var[var.VARIABLE == variable].shape[0] == 0:
            print(f'{variable} not found in {dataset} for {year}')
            return []
        var = var[var.VARIABLE == variable]
        var_df.append(var)
    var_df = pd.concat(var_df)
    label = var_df.LABEL
    for token in string.punctuation:
        label = label.str.replace(token, ' ')
    # Remove multiple spaces
    label = label.str.replace(' +', ' ', regex=True)
    # Remove leading and trailing spaces
    label = label.str.strip()
    # Convert to small caps
    label = label.str.lower()
    return label.unique().tolist()

## ACS5 Datasets (year 2009)

In [4]:
df_datasets = ced.variables.all_data_sets(year=2009)
df_datasets[df_datasets['DATASET'].str.contains('acs5')][['SYMBOL', 'DATASET', 'TITLE']]

Unnamed: 0,SYMBOL,DATASET,TITLE
8,ACS5,acs/acs5,American Community Survey: 5-Year Estimates: Detailed Tables 5-Year
9,ACS5_PROFILE,acs/acs5/profile,American Community Survey: 5-Year Estimates: Data Profiles 5-Year
10,ACS5_PUMS,acs/acs5/pums,2005-2009 American Community Survey: 5-Year Estimates - Public Use Microdata Sample
11,ACS5_PUMSPR,acs/acs5/pumspr,2005-2009 American Community Survey: 5-Year Estimates - Puerto Rico Public Use Microdata Sample
12,,acs5,2005-2009 American Community Survey 5-Year Estimates


## ACS1 Datasets (year 2009)

In [5]:
df_datasets[df_datasets['DATASET'].str.contains('acs1')][['SYMBOL', 'DATASET', 'TITLE']]

Unnamed: 0,SYMBOL,DATASET,TITLE
0,ACS1,acs/acs1,American Community Survey: 1-Year Estimates: Detailed Tables 1-Year
1,ACS1_PROFILE,acs/acs1/profile,American Community Survey: 1-Year Estimates: Data Profiles 1-Year
2,ACS1_PUMS,acs/acs1/pums,2009 American Community Survey: 1-Year Estimates - Public Use Microdata Sample
3,ACS1_PUMSPR,acs/acs1/pumspr,2009 American Community Survey: 1-Year Estimates - Puerto Rico Public Use Microdata Sample
4,ACS1_SPP,acs/acs1/spp,American Community Survey: 1-Year Estimates: Selected Population Profiles 1-Year


## Decennial Datasets (year 2000)

In [6]:
df_datasets = ced.variables.all_data_sets(year=2000)
df_datasets[df_datasets['DATASET'].str.contains('dec')][['SYMBOL', 'DATASET', 'TITLE']]

Unnamed: 0,SYMBOL,DATASET,TITLE
4,CPS_BASIC_DEC,cps/basic/dec,Dec 2000 Current Population Survey: Basic Monthly
20,DECENNIAL_AIAN,dec/aian,Decennial Census: American Indian and Alaska Native Summary File
21,DECENNIAL_AIANPROFILE,dec/aianprofile,Decennial Census: American Indian and Alaska Native Demographic Profile
22,DECENNIAL_AS,dec/as,Decennial Census: American Samoa Summary File
23,DECENNIAL_CD110H,dec/cd110h,Decennial Census: 110th Congressional District Summary File (100-Percent)
24,DECENNIAL_CD110HPROFILE,dec/cd110hprofile,Decennial Census: 110th Congressional District Demographic Profile (100-Percent)
25,DECENNIAL_CD110S,dec/cd110s,Decennial Census: 110th Congressional District Summary File (Sample)
26,DECENNIAL_CD110SPROFILE,dec/cd110sprofile,Decennial Census: 110th Congressional District Demographic Profile (Sample)
27,DECENNIAL_CQR,dec/cqr,Decennial Census: Count Question Resolution
28,DECENNIAL_GU,dec/gu,Decennial Census: Guam Summary File


## Decennial Datasets (year 2010)

In [7]:
df_datasets = ced.variables.all_data_sets(year=2010)
df_datasets[df_datasets['DATASET'].str.contains('dec')][['SYMBOL', 'DATASET', 'TITLE']]

Unnamed: 0,SYMBOL,DATASET,TITLE
24,CPS_BASIC_DEC,cps/basic/dec,Dec 2010 Current Population Survey: Basic Monthly
37,CPS_FOODSEC_DEC,cps/foodsec/dec,Dec 2010 Current Population Survey: Food Security Supplement
45,DECENNIAL_AIAN,dec/aian,Decennial Census: American Indian and Alaska Native Summary File
46,DECENNIAL_AS,dec/as,Decennial Census: American Samoa Summary File
47,DECENNIAL_ASYOE,dec/asyoe,Decennial Census: American Samoa Year of Entry File
48,DECENNIAL_CD113,dec/cd113,Decennial CD113
49,DECENNIAL_CD113PROFILE,dec/cd113profile,Decennial Census: 113th Congressional District Demographic Profile
50,DECENNIAL_CD115,dec/cd115,Decennial CD115
51,DECENNIAL_CD115PROFILE,dec/cd115profile,Decennial Census: 115th Congressional District Demographic Profile
52,DECENNIAL_CD116,dec/cd116,Decennial Congressional District 116


## Decennial Datasets (year 2020)

In [8]:
df_datasets = ced.variables.all_data_sets(year=2020)
df_datasets[df_datasets['DATASET'].str.contains('dec')][['SYMBOL', 'DATASET', 'TITLE']]

Unnamed: 0,SYMBOL,DATASET,TITLE
18,CPS_BASIC_DEC,cps/basic/dec,Current Population Survey: Basic Monthly
30,CPS_FOODSEC_DEC,cps/foodsec/dec,Current Population Survey: Food Security Supplement
34,DECENNIAL_CD118,dec/cd118,Decennial Census: 118th Congressional District Summary File
35,DECENNIAL_CD119,dec/cd119,2020 Census 119th Congressional District
36,DECENNIAL_CROSSTABAS,dec/crosstabas,Decennial Census of Island Areas: American Samoa Detailed Crosstabulations
37,DECENNIAL_CROSSTABGU,dec/crosstabgu,Decennial Census of Island Areas: Guam Detailed Crosstabulations
38,DECENNIAL_CROSSTABMP,dec/crosstabmp,Decennial Census of Island Areas: Commonwealth of the Northern Mariana Islands Detailed Crosstabulations
39,DECENNIAL_CROSSTABVI,dec/crosstabvi,Decennial Census of Island Areas: U.S. Virgin Islands Detailed Crosstabulations
40,DECENNIAL_DDHCA,dec/ddhca,Decennial Census: Detailed Demographic and Housing Characteristics File A
41,DECENNIAL_DDHCB,dec/ddhcb,2020 Detailed Demographic and Housing Characteristics File B


## ACS5 Groups (year 2009)

In [9]:
acs_acs5_groups_2009 = ced.variables.all_groups('acs/acs5', year=2009)[['GROUP', 'DESCRIPTION']]
acs_acs5_groups_2009.to_csv('acs_acs5_groups_2009.csv', index=False)
acs_acs5_groups_2009

Unnamed: 0,GROUP,DESCRIPTION
0,B00001,UNWEIGHTED SAMPLE COUNT OF THE POPULATION
1,B00002,UNWEIGHTED SAMPLE HOUSING UNITS
2,B01001,SEX BY AGE
3,B01001A,SEX BY AGE (WHITE ALONE)
4,B01001B,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)
...,...,...
629,C23002H,"SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER (WHITE ALONE, NOT HISPANIC OR LATINO)"
630,C23002I,SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER (HISPANIC OR LATINO)
631,C24010,SEX BY OCCUPATION FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER
632,C24030,SEX BY INDUSTRY FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER


In [10]:
ced.variables.all_groups('acs/acs5/pums', year=2009)[['GROUP', 'DESCRIPTION']]

Unnamed: 0,GROUP,DESCRIPTION


In [11]:
ced.variables.all_groups('acs5', year=2009)[['GROUP', 'DESCRIPTION']]

Unnamed: 0,GROUP,DESCRIPTION


## ACS1 Groups (year 2007)

In [12]:
acs_acs1_groups_2007 = ced.variables.all_groups('acs/acs1', year=2007)[['GROUP', 'DESCRIPTION']]
acs_acs1_groups_2007.to_csv('acs_acs1_groups_2007.csv', index=False)
acs_acs1_groups_2007

Unnamed: 0,GROUP,DESCRIPTION
0,B00001,UNWEIGHTED SAMPLE COUNT OF THE POPULATION
1,B00002,UNWEIGHTED SAMPLE HOUSING UNITS
2,B01001,SEX BY AGE
3,B01001A,SEX BY AGE (WHITE ALONE)
4,B01001B,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)
...,...,...
1360,C25116,TENURE BY HOUSEHOLD SIZE BY AGE OF HOUSEHOLDER
1361,C25117,TENURE BY HOUSE HEATING FUEL
1362,C25121,HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2007 INFLATION-ADJUSTED DOLLARS) BY VALUE
1363,C25122,HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2007 INFLATION-ADJUSTED DOLLARS) BY GROSS RENT


In [13]:
ced.variables.all_groups('acs/acs1/profile', year=2007)[['GROUP', 'DESCRIPTION']]

Unnamed: 0,GROUP,DESCRIPTION
0,DP02,Selected Social Characteristics in the United States: 2007
1,DP02PR,Selected Social Characteristics in Puerto Rico: 2007
2,DP03,Selected Economic Characteristics: 2007
3,DP04,Selected Housing Characteristics: 2007
4,DP05,ACS Demographic and Housing Estimates: 2007


In [14]:
ced.variables.all_groups('acs/acs1/pums', year=2007)[['GROUP', 'DESCRIPTION']]

Unnamed: 0,GROUP,DESCRIPTION


In [15]:
# acs/acs1/spp is not available for 2007
ced.variables.all_groups('acs/acs1/spp', year=2008)[['GROUP', 'DESCRIPTION']]

Unnamed: 0,GROUP,DESCRIPTION
0,S0201,Selected Population Profile in the United States
1,S0201PR,Selected Population Profile in Puerto Rico


## Dec groups

In [16]:
datasets = {
    2000: ['sf1', 'sf2', 'sf3', 'sf4'],
    2010: ['sf1', 'sf2'],
    2020: ['dhc']
}
for year in [2000, 2010, 2020]:
    for dataset in datasets[year]:
        print(f'{dataset} {year}')
        df = ced.variables.all_groups(f'dec/{dataset}', year=year)[['GROUP', 'DESCRIPTION']]
        filename = f'dec_{dataset}_groups_{year}.csv'
        df.to_csv(filename, index=False)

sf1 2000
sf2 2000
sf3 2000
sf4 2000
sf1 2010
sf2 2010
dhc 2020


In [17]:
dec_sf1_groups_2000 = ced.variables.all_groups('dec/sf1', year=2000)[['GROUP', 'DESCRIPTION']]

### valid years

In [23]:
acs1_years = list(i for i in range(2007, 2022 + 1) if i != 2020)

# Census variables

### `population`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B01003'])
ced.variables.all_variables('acs/acs5', 2009, 'B01003')

In [None]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B01003'])
ced.variables.all_variables('acs/acs1', 2007, 'B01003')

In [None]:
var_group = {}
var_group[2000] = 'P001'
var_group[2010] = 'P1'
var_group[2020] = 'P1'

var_dataset = {}
var_dataset[2000] = 'dec/sf1'
var_dataset[2010] = 'dec/sf1'
var_dataset[2020] = 'dec/dhc'

var_df = []
for year in [2000, 2010, 2020]:
    dataset = var_dataset[year]
    group = var_group[year]
    print(f'{dataset} {year} {group}')
    var = ced.variables.all_variables(dataset, year, group)
    var_df.append(var)
var_df = pd.concat(var_df)
var_df.to_csv('dec_population.csv', index=False)

## `median_age`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B01002'])
ced.variables.all_variables('acs/acs5', 2010, 'B01002')

In [None]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B01002'])
ced.variables.all_variables('acs/acs1', 2005, 'B01002')

In [None]:
var_group = {}
var_group[2000] = 'P013'
var_group[2010] = 'P13'
var_group[2020] = 'P13'

var_dataset = {}
var_dataset[2000] = 'dec/sf1'
var_dataset[2010] = 'dec/sf1'
var_dataset[2020] = 'dec/dhc'

var_df = []
for year in [2000, 2010, 2020]:
    dataset = var_dataset[year]
    group = var_group[year]
    print(f'{dataset} {year} {group}')
    var = ced.variables.all_variables(dataset, year, group)
    var_df.append(var)
var_df = pd.concat(var_df)
var_df.to_csv('dec_median_age.csv', index=False)

## `pct_under_20_years`, `pct_20_35_years`, `pct_35_50_years`, `pct_50_65_years`, `pct_more_65_years`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B01001'])
ced.variables.all_variables('acs/acs5', 2009, 'B01001')[['VARIABLE', 'LABEL']]

In [None]:
valid_codes = [len(validate_xyear(f'B01001_{i:03}E', 'B01001', 'acs/acs5', range(2009, 2022 + 1))) == 1 for i in range(1, 50)]
print(valid_codes)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

In [None]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B01001'])
df = ced.variables.all_variables('acs/acs1', 2007, 'B01001')
df = df[df.VARIABLE.str.contains('B01001_')]
df[['VARIABLE', 'LABEL']]

In [None]:
valid_codes = [len(validate_xyear(f'B01001_{i:03}E', 'B01001', 'acs/acs1', acs1_years)) == 1 for i in range(1, 50)]
print(valid_codes)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

In [None]:
var_group = {}
var_group[2000] = 'P012'
var_group[2010] = 'P12'
var_group[2020] = 'P12'

var_dataset = {}
var_dataset[2000] = 'dec/sf1'
var_dataset[2010] = 'dec/sf1'
var_dataset[2020] = 'dec/dhc'

var_df = []
for year in [2000, 2010, 2020]:
    dataset = var_dataset[year]
    group = var_group[year]
    print(f'{dataset} {year} {group}')
    var = ced.variables.all_variables(dataset, year, group)
    var_df.append(var)
var_df = pd.concat(var_df)
var_df.to_csv('dec_sex_age.csv', index=False)


In [None]:
var_variable_prefix = {}
var_variable_prefix[2000] = 'P012'
var_variable_prefix[2010] = 'P012'
var_variable_prefix[2020] = 'P12_'

var_variable_suffix = {}
var_variable_suffix[2000] = ''
var_variable_suffix[2010] = ''
var_variable_suffix[2020] = 'N'

var_variable_list = []
for i in range(1, 49+1):
    var_variable = {}
    for year in [2000, 2010, 2020]:
        var_variable[year] = f'{var_variable_prefix[year]}{i:03d}{var_variable_suffix[year]}'
    var_variable_list.append(var_variable)
var_variable_list[-5:]

In [None]:
valid_codes = []
for i in range(49):
    val = validate_xyear(var_variable_list[i], var_group, var_dataset, [2000,2010,2020])
    valid_codes.append(len(val) == 1)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

In [33]:
# i = 1
# var_df = list()
# for year in [2000, 2010, 2020]:
#     dataset = var_dataset[year]
#     group = var_group[year]
#     variable_prefix = var_variable_prefix[year]
#     variable_suffix = var_variable_suffix[year]
#     variable = f'{variable_prefix}{i:03d}{variable_suffix}'
#     var = ced.variables.all_variables(dataset, year, group)
#     var = var[var.VARIABLE == variable]
#     if var.shape[0] == 0:
#         print(f'{variable} not found in {dataset} for {year}')
#     var_df.append(var)
# var_df = pd.concat(var_df)
# var_df

### `pct_white`, `pct_black`, `pct_asian`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B02001'])
ced.variables.all_variables('acs/acs5', 2009, 'B02001')[['VARIABLE', 'LABEL']]

In [None]:
valid_codes = [len(validate_xyear(f'B02001_{i:03}E', 'B02001', 'acs/acs5', range(2009, 2022+1))) == 1 for i in range(1, 11)]
print(valid_codes)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

In [None]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B02001'])
ced.variables.all_variables('acs/acs1', 2007, 'B02001')[['VARIABLE', 'LABEL']]

In [None]:
valid_codes = [len(validate_xyear(f'B02001_{i:03}E', 'B02001', 'acs/acs1', acs1_years)) == 1 for i in range(1, 11)]
print(valid_codes)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

In [None]:
ced.variables.all_variables('dec/sf1', 2010, 'P6')

In [None]:
var_group = {}
var_group[2000] = 'P007'
var_group[2010] = 'P8'
var_group[2020] = 'P8'

var_dataset = {}
var_dataset[2000] = 'dec/sf1'
var_dataset[2010] = 'dec/sf1'
var_dataset[2020] = 'dec/dhc'

var_df = []
for year in [2000, 2010, 2020]:
    dataset = var_dataset[year]
    group = var_group[year]
    print(f'{dataset} {year} {group}')
    var = ced.variables.all_variables(dataset, year, group)
    var_df.append(var)
var_df = pd.concat(var_df)
var_df.to_csv('dec_race.csv', index=False)

In [None]:
var_variable_prefix = {}
var_variable_prefix[2000] = 'P007'
var_variable_prefix[2010] = 'P008'
var_variable_prefix[2020] = 'P8_'

var_variable_suffix = {}
var_variable_suffix[2000] = ''
var_variable_suffix[2010] = ''
var_variable_suffix[2020] = 'N'

var_variable_list = []
for i in range(1, 8+1):
    var_variable = {}
    for year in [2000, 2010, 2020]:
        var_variable[year] = f'{var_variable_prefix[year]}{i:03d}{var_variable_suffix[year]}'
    var_variable_list.append(var_variable)
var_variable_list[-5:]

In [None]:
var_df = list()
for i in range(1, 9):
    for year in [2000, 2010, 2020]:
        dataset = var_dataset[year]
        group = var_group[year]
        variable_prefix = var_variable_prefix[year]
        variable_suffix = var_variable_suffix[year]
        variable = f'{variable_prefix}{i:03d}{variable_suffix}'
        var = ced.variables.all_variables(dataset, year, group)
        var = var[var.VARIABLE == variable]
        if var.shape[0] == 0:
            print(f'{variable} not found in {dataset} for {year}')
        label = var.LABEL
        for token in string.punctuation:
            label = label.str.replace(token, ' ')
        # Remove multiple spaces
        label = label.str.replace(' +', ' ', regex=True)
        # Remove leading and trailing spaces
        label = label.str.strip()
        # Convert to small caps
        label = label.str.lower()
        var['LABEL'] = label
        var_df.append(var[['YEAR', 'DATASET', 'GROUP', 'VARIABLE', 'LABEL']])
var_df = pd.concat(var_df)
var_df

### `pct_hispanic`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B03003'])
ced.variables.all_variables('acs/acs5', 2010, 'B03003')[['VARIABLE', 'LABEL']]

In [None]:
valid_codes = [len(validate_xyear(f'B03003_{i:03}E', 'B03003', 'acs/acs5', range(2010, 2022+1))) == 1 for i in range(1, 4)]
print(valid_codes)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

In [None]:
# hispanic is not available until 2009
ced.variables.all_variables('acs/acs1', 2009, 'B03003')[['VARIABLE', 'LABEL']]

In [None]:
hispanic_acs1_years = list(i for i in range(2009, 2022 + 1) if i != 2020)
valid_codes = [len(validate_xyear(f'B03003_{i:03}E', 'B03003', 'acs/acs5', hispanic_acs1_years)) == 1 for i in range(1, 4)]
print(valid_codes)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

In [None]:
var_group = {}
var_group[2000] = 'P011'
var_group[2010] = 'P4'
var_group[2020] = 'P4'

var_dataset = {}
var_dataset[2000] = 'dec/sf1'
var_dataset[2010] = 'dec/sf1'
var_dataset[2020] = 'dec/dhc'

var_df = []
for year in [2000, 2010, 2020]:
    dataset = var_dataset[year]
    group = var_group[year]
    print(f'{dataset} {year} {group}')
    var = ced.variables.all_variables(dataset, year, group)
    var_df.append(var)
var_df = pd.concat(var_df)
var_df.to_csv('dec_hispanic.csv', index=False)

### `pct_non_us_citizen`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B05001'])
ced.variables.all_variables('acs/acs5', 2009, 'B05001')[['VARIABLE', 'LABEL']]

In [None]:
valid_codes = [len(validate_xyear(f'B05001_{i:03}E', 'B05001', 'acs/acs5', range(2009, 2022+1))) == 1 for i in range(1, 7)]
print(valid_codes)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

In [None]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B05001'])
ced.variables.all_variables('acs/acs1', 2007, 'B05001')[['VARIABLE', 'LABEL']]

In [None]:
valid_codes = [len(validate_xyear(f'B05001_{i:03}E', 'B05001', 'acs/acs1', acs1_years)) == 1 for i in range(1, 7)]
print(valid_codes)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

In [None]:
# In most recent censuses, 
# citizenship data is Not collected through the Decennial Census short form, 
# which is the primary form sent to all households.

ced.variables.all_variables('dec/sf3', 2000, 'P021')

### `pct_higher_education`

In [18]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B14001'])
ced.variables.all_variables('acs/acs5', 2010, 'B14001')

      GROUP  \
173  B14001   

                                                                  DESCRIPTION  
173  SCHOOL ENROLLMENT BY LEVEL OF SCHOOL FOR THE POPULATION 3 YEARS AND OVER  


Unnamed: 0,YEAR,DATASET,GROUP,VARIABLE,LABEL,SUGGESTED_WEIGHT,VALUES
0,2010,acs/acs5,B14001,B14001_001E,Estimate!!Total,,
1,2010,acs/acs5,B14001,B14001_002E,Estimate!!Total!!Enrolled in school,,
2,2010,acs/acs5,B14001,B14001_003E,"Estimate!!Total!!Enrolled in school!!Enrolled in nursery school, preschool",,
3,2010,acs/acs5,B14001,B14001_004E,Estimate!!Total!!Enrolled in school!!Enrolled in kindergarten,,
4,2010,acs/acs5,B14001,B14001_005E,Estimate!!Total!!Enrolled in school!!Enrolled in grade 1 to grade 4,,
5,2010,acs/acs5,B14001,B14001_006E,Estimate!!Total!!Enrolled in school!!Enrolled in grade 5 to grade 8,,
6,2010,acs/acs5,B14001,B14001_007E,Estimate!!Total!!Enrolled in school!!Enrolled in grade 9 to grade 12,,
7,2010,acs/acs5,B14001,B14001_008E,"Estimate!!Total!!Enrolled in school!!Enrolled in college, undergraduate years",,
8,2010,acs/acs5,B14001,B14001_009E,Estimate!!Total!!Enrolled in school!!Graduate or professional school,,
9,2010,acs/acs5,B14001,B14001_010E,Estimate!!Total!!Not enrolled in school,,


In [19]:
valid_codes = [len(validate_xyear(f'B14001_{i:03}E', 'B14001', 'acs/acs5', range(2010, 2022+1))) == 1 for i in range(1, 11)]
print(valid_codes)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

[True, True, True, True, True, True, True, True, True, True]
total codes: 10
total valid codes: 10


In [20]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B14001'])
ced.variables.all_variables('acs/acs1', 2007, 'B14001')[['VARIABLE', 'LABEL']]

      GROUP  \
360  B14001   

                                                                  DESCRIPTION  
360  SCHOOL ENROLLMENT BY LEVEL OF SCHOOL FOR THE POPULATION 3 YEARS AND OVER  


Unnamed: 0,VARIABLE,LABEL
0,B14001_001E,Estimate!!Total
1,B14001_002E,Estimate!!Total!!Enrolled in school
2,B14001_003E,"Estimate!!Total!!Enrolled in school!!Enrolled in nursery school, preschool"
3,B14001_004E,Estimate!!Total!!Enrolled in school!!Enrolled in kindergarten
4,B14001_005E,Estimate!!Total!!Enrolled in school!!Enrolled in grade 1 to grade 4
5,B14001_006E,Estimate!!Total!!Enrolled in school!!Enrolled in grade 5 to grade 8
6,B14001_007E,Estimate!!Total!!Enrolled in school!!Enrolled in grade 9 to grade 12
7,B14001_008E,"Estimate!!Total!!Enrolled in school!!Enrolled in college, undergraduate years"
8,B14001_009E,Estimate!!Total!!Enrolled in school!!Graduate or professional school
9,B14001_010E,Estimate!!Total!!Not enrolled in school


In [24]:
valid_codes = [len(validate_xyear(f'B14001_{i:03}E', 'B14001', 'acs/acs1', acs1_years)) == 1 for i in range(1, 11)]
print(valid_codes)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

[True, True, True, True, True, True, True, True, True, True]
total codes: 10
total valid codes: 10


In [22]:
ced.variables.all_variables('dec/sf3', 2000, 'P036')

Unnamed: 0,YEAR,DATASET,GROUP,VARIABLE,LABEL,SUGGESTED_WEIGHT,VALUES
0,2000,dec/sf3,P036,GEO_ID,Geography,,
1,2000,dec/sf3,P036,NAME,Label for GEO_ID,,
2,2000,dec/sf3,P036,P036001,Total,,
3,2000,dec/sf3,P036,P036002,Total!!Male,,
4,2000,dec/sf3,P036,P036003,"Total!!Male!!Enrolled in nursery school, preschool",,
5,2000,dec/sf3,P036,P036004,"Total!!Male!!Enrolled in nursery school, preschool!!Public school",,
6,2000,dec/sf3,P036,P036005,"Total!!Male!!Enrolled in nursery school, preschool!!Private school",,
7,2000,dec/sf3,P036,P036006,Total!!Male!!Enrolled in kindergarten,,
8,2000,dec/sf3,P036,P036007,Total!!Male!!Enrolled in kindergarten!!Public school,,
9,2000,dec/sf3,P036,P036008,Total!!Male!!Enrolled in kindergarten!!Private school,,


## pop_higher_education

In [26]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B15002'])
df = ced.variables.all_variables('acs/acs5', 2009, 'B15002')
df.to_csv('acs_acs5_education.csv', index=False)

      GROUP  \
179  B15002   

                                                            DESCRIPTION  
179  SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER  


In [27]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B15002'])
#ced.variables.all_variables('acs/acs1', 2007, 'B15001')[['VARIABLE', 'LABEL']]

      GROUP  \
376  B15002   

                                                            DESCRIPTION  
376  SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER  


In [28]:
ced.variables.all_variables('dec/sf3', 2000, 'P037')

Unnamed: 0,YEAR,DATASET,GROUP,VARIABLE,LABEL,SUGGESTED_WEIGHT,VALUES
0,2000,dec/sf3,P037,GEO_ID,Geography,,
1,2000,dec/sf3,P037,NAME,Label for GEO_ID,,
2,2000,dec/sf3,P037,P037001,Total,,
3,2000,dec/sf3,P037,P037002,Total!!Male,,
4,2000,dec/sf3,P037,P037003,Total!!Male!!No schooling completed,,
5,2000,dec/sf3,P037,P037004,Total!!Male!!Nursery to 4th grade,,
6,2000,dec/sf3,P037,P037005,Total!!Male!!5th and 6th grade,,
7,2000,dec/sf3,P037,P037006,Total!!Male!!7th and 8th grade,,
8,2000,dec/sf3,P037,P037007,Total!!Male!!9th grade,,
9,2000,dec/sf3,P037,P037008,Total!!Male!!10th grade,,


### `pct_poverty`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B17001'])
ced.variables.all_variables('acs/acs5', 2010, 'B17001')[['VARIABLE', 'LABEL']]

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B17025'])
ced.variables.all_variables('acs/acs5', 2010, 'B17025')[['VARIABLE', 'LABEL']]

In [None]:
# The variable B17025_002E has a unique label across years
validate_xyear(f'B17025_002E', 'B17025', 'acs/acs5', range(2010, 2022+1))

In [None]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B17025'])
ced.variables.all_variables('acs/acs1', 2007, 'B17025')[['VARIABLE', 'LABEL']]

In [None]:
# The variable B17025_002E has a unique label across years
validate_xyear(f'B17025_002E', 'B17025', 'acs/acs1', acs1_years)

In [None]:
#P087,POVERTY STATUS IN 1999 BY AGE [17]
ced.variables.all_variables('dec/sf3', 2000, 'P087')[['VARIABLE', 'LABEL']]

### `pct_high_income`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B19001'])
ced.variables.all_variables('acs/acs5', 2010, 'B19001')[['VARIABLE', 'LABEL']]

In [None]:
valid_codes = [len(validate_xyear(f'B19001_{i:03}E', 'B19001', 'acs/acs5', range(2010, 2022+1))) == 1 for i in range(1, 18)]
print(valid_codes)
print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

### `median_household_income`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B19013'])
ced.variables.all_variables('acs/acs5', 2010, 'B19013')[['VARIABLE', 'LABEL']]

In [None]:
# the variable name changes because income is not discounted for inflation
validate_xyear(f'B19013_001E', 'B19013', 'acs/acs5', range(2010, 2022+1))

In [None]:
# In most recent censuses, 
# income data is Not collected through the Decennial Census short form, 
# which is the primary form sent to all households.

ced.variables.all_variables('dec/sf3', 2000, 'P053')

### `pct_family_high_income`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B19101'])
ced.variables.all_variables('acs/acs5', 2009, 'B19101')

In [None]:
# In most recent censuses, 
# income data is Not collected through the Decennial Census short form, 
# which is the primary form sent to all households.

ced.variables.all_variables('dec/sf3', 2000, 'P052')

### `median_family_income`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B19113'])
ced.variables.all_variables('acs/acs5', 2009, 'B19113')[['VARIABLE', 'LABEL']]

### `elder_footstamp`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B22001'])
ced.variables.all_variables('acs/acs5', 2009, 'B22001')[['VARIABLE', 'LABEL']]

In [None]:
# The variable B22001_002E has a unique label across years
validate_xyear(f'B22001_002E', 'B22001', 'acs/acs5', range(2009, 2022+1))

In [None]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B22001'])
ced.variables.all_variables('acs/acs1', 2007, 'B22001')[['VARIABLE', 'LABEL']]

In [None]:
# The variable B22001_002E has a label change, but not representative of a different concept
validate_xyear(f'B22001_002E', 'B22001', 'acs/acs1', acs1_years)

### `median_household_income`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B19013'])
ced.variables.all_variables('acs/acs5', 2009, 'B19013')[['VARIABLE', 'LABEL']]

In [None]:
validate_xyear(f'B19013_001E', 'B19013', 'acs/acs5', range(2009, 2022+1))

In [None]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B19013'])
ced.variables.all_variables('acs/acs1', 2007, 'B19013')[['VARIABLE', 'LABEL']]


In [None]:
validate_xyear(f'B19013_001E', 'B19013', 'acs/acs1', acs1_years)

In [None]:
ced.variables.all_variables('dec/sf3', 2000, 'P053')
# Starting with the 2010 Census, the long-form survey was discontinued, 
# and the data previously collected under SF3 became part of the American Community Survey (ACS).

### `median_home_value`

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B25077'])
ced.variables.all_variables('acs/acs5', 2009, 'B25077')[['VARIABLE', 'LABEL']]

In [None]:
validate_xyear('B25077_001E', 'B25077', 'acs/acs5', range(2009, 2022+1))

In [None]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B25077'])
ced.variables.all_variables('acs/acs1', 2007, 'B25077')[['VARIABLE', 'LABEL']]

In [None]:
validate_xyear(f'B25077_001E', 'B25077', 'acs/acs1', acs1_years)

In [None]:
ced.variables.all_variables('dec/sf3', 2000, 'H085')

# Starting with the 2010 Census, the long-form survey was discontinued, 
# and the data previously collected under SF3 became part of the American Community Survey (ACS).

In [None]:
ced.variables.all_variables('dec/sf1', 2010, 'H1')

In [None]:
ced.variables.all_variables('dec/dhc', 2020, 'H1')

## Unweighted tables

The unweighted tables in the American Community Survey (ACS) 

* Provide context for other ACS estimates by showing the base sample size. For instance, when presenting data on economic characteristics from the ACS, you can reference the unweighted sample count to give users a sense of the underlying sample from which estimates are derived. 
* Comparing the unweighted sample counts across different geographic areas can help in understanding the distribution and density of the survey samples.
* They are particularly useful in the following situations:

    - Understanding Sample Size: Use this table to determine the sample size of the population surveyed in a specific area. This can help assess the reliability and precision of the estimates in other ACS tables.
    - Evaluating Data Quality: When analyzing ACS data, it’s important to understand the sample size because areas with small sample sizes may have less reliable estimates. This table helps in identifying such areas.
    - Weighting and Estimation: Use this table to understand the raw, unweighted sample counts before weights are applied to produce population estimates. This is crucial for researchers performing advanced statistical analyses.



In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B00001'])
ced.variables.all_variables('acs/acs5', 2010, 'B00001')[['VARIABLE', 'LABEL']]

In [None]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B00002'])
ced.variables.all_variables('acs/acs5', 2010, 'B00002')