In [1]:
import censusdis.data as ced
import pandas as pd
import string

In [34]:
pd.set_option('display.max_colwidth', None)

# Foreign Born Explorer

In [2]:
def validate_xyear(var_variable, var_group, var_dataset, var_years):
    """Check that a variable exists in every requested year and collect its labels.

    For each year the variable catalogue of the group is fetched with
    ``ced.variables.all_variables(dataset, year, group)`` and filtered to the
    requested variable ID. The matching ``LABEL`` values are then normalised
    (punctuation replaced by spaces, runs of spaces collapsed, trimmed,
    lowercased) and de-duplicated.

    Parameters
    ----------
    var_variable, var_group, var_dataset : str or dict
        Either a single value used for every year, or a dict keyed by year
        giving the value to use for that year. Any other non-dict value is
        passed through unchanged to the catalogue lookup.
    var_years : iterable of int
        Years to check.

    Returns
    -------
    list of str
        The distinct normalised labels seen across ``var_years``. A single
        element means the label is the same in every year. An empty list is
        returned when the variable is missing in some year (a message is
        printed for the first such year) or when ``var_years`` is empty.

    Raises
    ------
    KeyError
        If a dict argument has no entry for one of the years.
    """
    def _for_year(spec, year):
        # A dict is a per-year lookup; anything else applies to every year.
        return spec[year] if isinstance(spec, dict) else spec

    matches = []
    for year in var_years:
        group = _for_year(var_group, year)
        dataset = _for_year(var_dataset, year)
        variable = _for_year(var_variable, year)
        # make request
        catalog = ced.variables.all_variables(dataset, year, group)
        # make sure variable is in dataset (filter once, reuse the result)
        found = catalog[catalog.VARIABLE == variable]
        if found.empty:
            print(f'{variable} not found in {dataset} for {year}')
            return []
        matches.append(found)

    # pd.concat raises ValueError on an empty list, so handle "no years" here.
    if not matches:
        return []

    label = pd.concat(matches).LABEL
    # Replace every punctuation character with a space. str.translate treats
    # each character literally, so the result does not depend on the pandas
    # version's default for the regex argument of str.replace.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    label = label.str.translate(punct_to_space)
    # Remove multiple spaces
    label = label.str.replace(' +', ' ', regex=True)
    # Remove leading and trailing spaces
    label = label.str.strip()
    # Convert to lowercase
    label = label.str.lower()
    return label.unique().tolist()

In [3]:
acs_acs5_groups_2009 = ced.variables.all_groups('acs/acs5', year=2009)[['GROUP', 'DESCRIPTION']]
acs_acs5_groups_2009.to_csv('notes/acs_acs5_groups_2009.csv', index=False)
acs_acs5_groups_2009.head()

Unnamed: 0,GROUP,DESCRIPTION
0,B00001,UNWEIGHTED SAMPLE COUNT OF THE POPULATION
1,B00002,UNWEIGHTED SAMPLE HOUSING UNITS
2,B01001,SEX BY AGE
3,B01001A,SEX BY AGE (WHITE ALONE)
4,B01001B,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)


In [4]:
acs_acs5_groups_2016 = ced.variables.all_groups('acs/acs5', year=2016)[['GROUP', 'DESCRIPTION']]
acs_acs5_groups_2016.head()

Unnamed: 0,GROUP,DESCRIPTION
0,B00001,UNWEIGHTED SAMPLE COUNT OF THE POPULATION
1,B00002,UNWEIGHTED SAMPLE HOUSING UNITS
2,B01001,SEX BY AGE
3,B01001A,SEX BY AGE (WHITE ALONE)
4,B01001B,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)


In [5]:
acs_acs1_groups_2007 = ced.variables.all_groups('acs/acs1', year=2007)[['GROUP', 'DESCRIPTION']]
acs_acs1_groups_2007.to_csv('notes/acs_acs1_groups_2007.csv', index=False)
acs_acs1_groups_2007.head()

Unnamed: 0,GROUP,DESCRIPTION
0,B00001,UNWEIGHTED SAMPLE COUNT OF THE POPULATION
1,B00002,UNWEIGHTED SAMPLE HOUSING UNITS
2,B01001,SEX BY AGE
3,B01001A,SEX BY AGE (WHITE ALONE)
4,B01001B,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)


In [6]:
acs_acs1_groups_2016 = ced.variables.all_groups('acs/acs1', year=2016)[['GROUP', 'DESCRIPTION']]
acs_acs1_groups_2016.head()

Unnamed: 0,GROUP,DESCRIPTION
0,B00001,UNWEIGHTED SAMPLE COUNT OF THE POPULATION
1,B00002,UNWEIGHTED SAMPLE HOUSING UNITS
2,B01001,SEX BY AGE
3,B01001A,SEX BY AGE (WHITE ALONE)
4,B01001B,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)


## Total foreign born (Place of birth)

In [7]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B05006'])
birth = ced.variables.all_variables('acs/acs5', 2009, 'B05006')[['VARIABLE', 'LABEL']]
birth.to_csv('notes/B05006_variables.csv', index=False)
birth.head()

     GROUP                                     DESCRIPTION
47  B05006  PLACE OF BIRTH FOR THE FOREIGN-BORN POPULATION


Unnamed: 0,VARIABLE,LABEL
0,B05006_001E,Estimate!!Total
1,B05006_002E,Estimate!!Total!!Europe
2,B05006_003E,Estimate!!Total!!Europe!!Northern Europe
3,B05006_004E,Estimate!!Total!!Europe!!Northern Europe!!Unit...
4,B05006_005E,Estimate!!Total!!Europe!!Northern Europe!!Unit...


In [8]:
# Confirm B05006_001E keeps one label across every ACS 5-year vintage 2009-2022
validate_xyear('B05006_001E', 'B05006', 'acs/acs5', range(2009, 2022+1))

['estimate total']

In [9]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B05006'])
ced.variables.all_variables('acs/acs1', 2007, 'B05006')[['VARIABLE', 'LABEL']].head()

     GROUP                                     DESCRIPTION
59  B05006  PLACE OF BIRTH FOR THE FOREIGN-BORN POPULATION


Unnamed: 0,VARIABLE,LABEL
0,B05006_001E,Estimate!!Total
1,B05006_002E,Estimate!!Total!!Europe
2,B05006_003E,Estimate!!Total!!Europe!!Northern Europe
3,B05006_004E,Estimate!!Total!!Europe!!Northern Europe!!Unit...
4,B05006_005E,Estimate!!Total!!Europe!!Northern Europe!!Unit...


In [10]:
# Years used for the ACS 1-year checks: 2009-2022 with 2020 left out
# (presumably no standard acs1 release for 2020 - confirm).
years_acs1_list = [year for year in range(2009, 2022+1) if year != 2020]
validate_xyear('B05006_001E', 'B05006', 'acs/acs1', years_acs1_list)

['estimate total']

## English "less than very well" (language spoken at home)

In [11]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B16001'])
language_acs5_2009 = ced.variables.all_variables('acs/acs5', 2009, 'B16001')[['VARIABLE', 'LABEL']]
language_acs5_2009.to_csv('notes/B16001_2009_variables.csv', index=False)
language_acs5_2009.head()

      GROUP                                        DESCRIPTION
180  B16001  LANGUAGE SPOKEN AT HOME BY ABILITY TO SPEAK EN...


Unnamed: 0,VARIABLE,LABEL
0,B16001_001E,Estimate!!Total
1,B16001_002E,Estimate!!Total!!Speak only English
2,B16001_003E,Estimate!!Total!!Spanish or Spanish Creole
3,B16001_004E,Estimate!!Total!!Spanish or Spanish Creole!!Sp...
4,B16001_005E,Estimate!!Total!!Spanish or Spanish Creole!!Sp...


In [12]:
# Identify the "less than very well" variables in the 2009 ACS 5-year B16001 table.

# Normalise the labels: punctuation -> space, collapse repeated spaces,
# trim, lowercase. str.translate handles every punctuation character
# literally, so this does not rely on the pandas-version-dependent default
# of the regex argument of str.replace.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
label = (
    language_acs5_2009.LABEL
    .str.translate(punct_to_space)
    .str.replace(' +', ' ', regex=True)
    .str.strip()
    .str.lower()
)

# Rebind to a new frame carrying the normalised labels; this avoids assigning
# into a frame that was produced by a column selection.
language_acs5_2009 = language_acs5_2009.assign(LABEL=label)

# Literal substring match; rows with a missing label are treated as non-matching.
less_than_list = language_acs5_2009.VARIABLE[
    label.str.contains('less than very well', regex=False, na=False)
].to_list()
less_than_list[:5]

['B16001_005E', 'B16001_008E', 'B16001_011E', 'B16001_014E', 'B16001_017E']

In [13]:
# Years 2009 to 2015 are expected to share the same variable IDs: a code
# passes when validate_xyear yields exactly one normalised label for the span.
valid_codes = []
for var_id in less_than_list:
    labels_found = validate_xyear(var_id, 'B16001', 'acs/acs5', range(2009, 2015+1))
    valid_codes.append(len(labels_found) == 1)

print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

total codes: 39
total valid codes: 39


In [14]:
# Identify the codes used from 2016 onwards (ACS 5-year B16001 table).
print(acs_acs5_groups_2016[acs_acs5_groups_2016['GROUP'] == 'B16001'])
language_acs5_2016 = ced.variables.all_variables('acs/acs5', 2016, 'B16001')[['VARIABLE', 'LABEL']]
# Written before normalisation, so the CSV keeps the labels as returned.
language_acs5_2016.to_csv('notes/B16001_2016_variables.csv', index=False)

# Normalise the labels: punctuation -> space, collapse repeated spaces,
# trim, lowercase. str.translate handles every punctuation character
# literally, so this does not rely on the pandas-version-dependent default
# of the regex argument of str.replace.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
label = (
    language_acs5_2016.LABEL
    .str.translate(punct_to_space)
    .str.replace(' +', ' ', regex=True)
    .str.strip()
    .str.lower()
)

# Rebind to a new frame carrying the normalised labels; this avoids assigning
# into a frame that was produced by a column selection.
language_acs5_2016 = language_acs5_2016.assign(LABEL=label)

# Literal substring match; rows with a missing label are treated as non-matching.
less_than_list = language_acs5_2016.VARIABLE[
    label.str.contains('less than very well', regex=False, na=False)
].to_list()
less_than_list[:5]

      GROUP                                        DESCRIPTION
396  B16001  LANGUAGE SPOKEN AT HOME BY ABILITY TO SPEAK EN...


['B16001_005E', 'B16001_008E', 'B16001_011E', 'B16001_014E', 'B16001_017E']

In [15]:
# Years 2016 onwards are expected to share the same variable IDs: a code
# passes when validate_xyear yields exactly one normalised label for the span.
valid_codes = []
for var_id in less_than_list:
    labels_found = validate_xyear(var_id, 'B16001', 'acs/acs5', range(2016, 2022+1))
    valid_codes.append(len(labels_found) == 1)

print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

total codes: 42
total valid codes: 42


In [16]:
print(acs_acs1_groups_2007[acs_acs1_groups_2007['GROUP'] == 'B16001'])
language_acs1_2007 = ced.variables.all_variables('acs/acs1', 2007, 'B16001')[['VARIABLE', 'LABEL']]

# Identify the "less than very well" variables in the 2007 ACS 1-year table.
# Normalise the labels: punctuation -> space, collapse repeated spaces,
# trim, lowercase. str.translate handles every punctuation character
# literally, so this does not rely on the pandas-version-dependent default
# of the regex argument of str.replace.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
label = (
    language_acs1_2007.LABEL
    .str.translate(punct_to_space)
    .str.replace(' +', ' ', regex=True)
    .str.strip()
    .str.lower()
)

# Rebind to a new frame carrying the normalised labels; this avoids assigning
# into a frame that was produced by a column selection.
language_acs1_2007 = language_acs1_2007.assign(LABEL=label)

# Literal substring match; rows with a missing label are treated as non-matching.
less_than_list = language_acs1_2007.VARIABLE[
    label.str.contains('less than very well', regex=False, na=False)
].to_list()

# Check each code across the ACS 1-year vintages 2007 to 2015: a code is
# valid when validate_xyear yields exactly one normalised label for the span.
valid_codes = [len(validate_xyear(var_id, 'B16001', 'acs/acs1', range(2007, 2015+1))) == 1 for var_id in less_than_list]

print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

      GROUP                                        DESCRIPTION
387  B16001  LANGUAGE SPOKEN AT HOME BY ABILITY TO SPEAK EN...
total codes: 39
total valid codes: 38


In [17]:
print(acs_acs1_groups_2016[acs_acs1_groups_2016['GROUP'] == 'B16001'])
language_acs1_2016 = ced.variables.all_variables('acs/acs1', 2016, 'B16001')[['VARIABLE', 'LABEL']]

# Identify the "less than very well" variables in the 2016 ACS 1-year table.
# Normalise the labels: punctuation -> space, collapse repeated spaces,
# trim, lowercase. str.translate handles every punctuation character
# literally, so this does not rely on the pandas-version-dependent default
# of the regex argument of str.replace.
punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
label = (
    language_acs1_2016.LABEL
    .str.translate(punct_to_space)
    .str.replace(' +', ' ', regex=True)
    .str.strip()
    .str.lower()
)

# Rebind to a new frame carrying the normalised labels; this avoids assigning
# into a frame that was produced by a column selection.
language_acs1_2016 = language_acs1_2016.assign(LABEL=label)

# Literal substring match; rows with a missing label are treated as non-matching.
less_than_list = language_acs1_2016.VARIABLE[
    label.str.contains('less than very well', regex=False, na=False)
].to_list()

# Check each code across the ACS 1-year vintages 2016 to 2022, skipping 2020:
# a code is valid when validate_xyear yields exactly one normalised label.
years = [year for year in range(2016, 2022+1) if year != 2020]
valid_codes = [len(validate_xyear(var_id, 'B16001', 'acs/acs1', years)) == 1 for var_id in less_than_list]

print("total codes:", len(valid_codes))
print("total valid codes:", sum(valid_codes))

      GROUP                                        DESCRIPTION
400  B16001  LANGUAGE SPOKEN AT HOME BY ABILITY TO SPEAK EN...
total codes: 42
total valid codes: 42


## Spanish speaking (language spoken at home)

In [18]:
print(language_acs5_2009[language_acs5_2009.VARIABLE == 'B16001_003E'])
print(language_acs5_2016[language_acs5_2016.VARIABLE == 'B16001_003E'])

      VARIABLE                                     LABEL
2  B16001_003E  estimate total spanish or spanish creole
      VARIABLE                   LABEL
2  B16001_003E  estimate total spanish


In [19]:
print(language_acs1_2007[language_acs1_2007.VARIABLE == 'B16001_003E'])
print(language_acs1_2016[language_acs1_2016.VARIABLE == 'B16001_003E'])

      VARIABLE                                     LABEL
2  B16001_003E  estimate total spanish or spanish creole
      VARIABLE                   LABEL
2  B16001_003E  estimate total spanish


## Spanish and speaks English less than very well (language spoken at home)

In [20]:
print(language_acs5_2009.LABEL[language_acs5_2009.VARIABLE == 'B16001_005E'].values)
print(language_acs5_2016.LABEL[language_acs5_2016.VARIABLE == 'B16001_005E'].values)

['estimate total spanish or spanish creole speak english less than very well']
['estimate total spanish speak english less than very well']


In [21]:
print(language_acs1_2007.LABEL[language_acs1_2007.VARIABLE == 'B16001_005E'].values)
print(language_acs1_2016.LABEL[language_acs1_2016.VARIABLE == 'B16001_005E'].values)

['estimate total spanish or spanish creole speak english less than very well']
['estimate total spanish speak english less than very well']


## Asian and Pacific (language spoken at home by nativity)

In [22]:
language_acs5_2009 = ced.variables.all_variables('acs/acs5', 2009, 'B16005')[['VARIABLE', 'LABEL']]
language_acs5_2009.to_csv('notes/B16005_2009_variables.csv', index=False)
language_acs5_2016 = ced.variables.all_variables('acs/acs5', 2016, 'B16005')[['VARIABLE', 'LABEL']]
language_acs5_2016.to_csv('notes/B16005_2016_variables.csv', index=False)

In [23]:
print(language_acs5_2009.LABEL[language_acs5_2009.VARIABLE == 'B16005_014E'].values)
print(language_acs5_2009.LABEL[language_acs5_2009.VARIABLE == 'B16005_036E'].values)
print(language_acs5_2016.LABEL[language_acs5_2016.VARIABLE == 'B16005_014E'].values)
print(language_acs5_2016.LABEL[language_acs5_2016.VARIABLE == 'B16005_036E'].values)

['Estimate!!Total!!Native!!Speak Asian and Pacific Island languages']
['Estimate!!Total!!Foreign born!!Speak Asian and Pacific Island languages']
['Estimate!!Total!!Native!!Speak Asian and Pacific Island languages']
['Estimate!!Total!!Foreign born!!Speak Asian and Pacific Island languages']


In [24]:
validate_xyear('B16005_014E', 'B16005', 'acs/acs5', range(2009, 2022+1))

['estimate total native speak asian and pacific island languages']

In [25]:
validate_xyear('B16005_036E', 'B16005', 'acs/acs5', range(2009, 2022+1))

['estimate total foreign born speak asian and pacific island languages']

In [28]:
validate_xyear('B16005_014E', 'B16005', 'acs/acs1', years_acs1_list)

['estimate total native speak asian and pacific island languages']

In [29]:
# ACS 1-year check of the foreign-born variable: the dataset must be acs/acs1
# to match the ACS 1-year year list being passed.
validate_xyear('B16005_036E', 'B16005', 'acs/acs1', years_acs1_list)

['estimate total foreign born speak asian and pacific island languages']

## Asian and Pacific and speaks English less than very well (language spoken at home by nativity)

In [30]:
print(language_acs5_2009.LABEL[language_acs5_2009.VARIABLE == 'B16005_016E'].values)
print(language_acs5_2009.LABEL[language_acs5_2009.VARIABLE == 'B16005_017E'].values)
print(language_acs5_2009.LABEL[language_acs5_2009.VARIABLE == 'B16005_018E'].values)
print(language_acs5_2009.LABEL[language_acs5_2009.VARIABLE == 'B16005_038E'].values)
print(language_acs5_2009.LABEL[language_acs5_2009.VARIABLE == 'B16005_039E'].values)
print(language_acs5_2009.LABEL[language_acs5_2009.VARIABLE == 'B16005_040E'].values)

['Estimate!!Total!!Native!!Speak Asian and Pacific Island languages!!Speak English "well"']
['Estimate!!Total!!Native!!Speak Asian and Pacific Island languages!!Speak English "not well"']
['Estimate!!Total!!Native!!Speak Asian and Pacific Island languages!!Speak English "not at all"']
['Estimate!!Total!!Foreign born!!Speak Asian and Pacific Island languages!!Speak English "well"']
['Estimate!!Total!!Foreign born!!Speak Asian and Pacific Island languages!!Speak English "not well"']
['Estimate!!Total!!Foreign born!!Speak Asian and Pacific Island languages!!Speak English "not at all"']


In [31]:
print(validate_xyear('B16005_016E', 'B16005', 'acs/acs5', range(2009, 2022+1)))
print(validate_xyear('B16005_017E', 'B16005', 'acs/acs5', range(2009, 2022+1)))
print(validate_xyear('B16005_018E', 'B16005', 'acs/acs5', range(2009, 2022+1)))
print(validate_xyear('B16005_038E', 'B16005', 'acs/acs5', range(2009, 2022+1)))
print(validate_xyear('B16005_039E', 'B16005', 'acs/acs5', range(2009, 2022+1)))
print(validate_xyear('B16005_040E', 'B16005', 'acs/acs5', range(2009, 2022+1)))

['estimate total native speak asian and pacific island languages speak english well']
['estimate total native speak asian and pacific island languages speak english not well']
['estimate total native speak asian and pacific island languages speak english not at all']
['estimate total foreign born speak asian and pacific island languages speak english well']
['estimate total foreign born speak asian and pacific island languages speak english not well']
['estimate total foreign born speak asian and pacific island languages speak english not at all']


In [32]:
print(validate_xyear('B16005_016E', 'B16005', 'acs/acs1', years_acs1_list))
print(validate_xyear('B16005_017E', 'B16005', 'acs/acs1', years_acs1_list))
print(validate_xyear('B16005_018E', 'B16005', 'acs/acs1', years_acs1_list))
print(validate_xyear('B16005_038E', 'B16005', 'acs/acs1', years_acs1_list))
print(validate_xyear('B16005_039E', 'B16005', 'acs/acs1', years_acs1_list))
print(validate_xyear('B16005_040E', 'B16005', 'acs/acs1', years_acs1_list))

['estimate total native speak asian and pacific island languages speak english well']
['estimate total native speak asian and pacific island languages speak english not well']
['estimate total native speak asian and pacific island languages speak english not at all']
['estimate total foreign born speak asian and pacific island languages speak english well']
['estimate total foreign born speak asian and pacific island languages speak english not well']
['estimate total foreign born speak asian and pacific island languages speak english not at all']


## Not Hispanic Asian alone

In [35]:
print(acs_acs5_groups_2009[acs_acs5_groups_2009['GROUP'] == 'B03002'])
ced.variables.all_variables('acs/acs5', 2009, 'B03002')[['VARIABLE', 'LABEL']]


     GROUP                        DESCRIPTION
34  B03002  HISPANIC OR LATINO ORIGIN BY RACE


Unnamed: 0,VARIABLE,LABEL
0,B03002_001E,Estimate!!Total
1,B03002_002E,Estimate!!Total!!Not Hispanic or Latino
2,B03002_003E,Estimate!!Total!!Not Hispanic or Latino!!White alone
3,B03002_004E,Estimate!!Total!!Not Hispanic or Latino!!Black or African American alone
4,B03002_005E,Estimate!!Total!!Not Hispanic or Latino!!American Indian and Alaska Native alone
5,B03002_006E,Estimate!!Total!!Not Hispanic or Latino!!Asian alone
6,B03002_007E,Estimate!!Total!!Not Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone
7,B03002_008E,Estimate!!Total!!Not Hispanic or Latino!!Some other race alone
8,B03002_009E,Estimate!!Total!!Not Hispanic or Latino!!Two or more races
9,B03002_010E,Estimate!!Total!!Not Hispanic or Latino!!Two or more races!!Two races including Some other race


In [36]:
validate_xyear('B03002_006E', 'B03002', 'acs/acs5', range(2009, 2022+1))

['estimate total not hispanic or latino asian alone']

In [37]:
validate_xyear('B03002_006E', 'B03002', 'acs/acs1', years_acs1_list)

['estimate total not hispanic or latino asian alone']