### 2021 Census Data (ACS 5-year) for Selected Variables - Baltimore City



In [1]:
# Dependencies
import cenpy
import censusdata
import numpy as np
import pandas as pd
import plotly.express as px
import requests
from census import Census

# Currently disabled imports, kept for reference
#import censusgeocode as cg
#import matplotlib.pyplot as plt
#from scipy.stats import linregress

# Census & gmaps API keys are read from the local config module
from config import api_key, gkey

# Census API client.
# The latest year available is the default year, so no year needs to be specified.
c = Census(api_key)

# gmaps configuration (currently disabled)
#gmaps.configure(api_key=gkey)

In [2]:
#import os

In [3]:
#os.getcwd()

In [4]:
# Show up to 200 rows when a DataFrame is displayed
pd.options.display.max_rows = 200

In [5]:
# Let text columns (e.g. the long geography labels) show up to 250 characters.
# The fully qualified option name is used because abbreviated names are resolved
# by pattern matching and can become ambiguous if pandas adds similar options.
pd.set_option("display.max_colwidth", 250)

In [6]:
# American Community Survey 5-Year Data

# ACS variables to request (labels match the column names assigned later in the notebook)
ACS_VARIABLES = [
    "B01001_001E",  # Population
    "B02001_002E",  # Pop. white
    "B02001_003E",  # Pop. Black
    "B02001_005E",  # Pop. Asian
    "B02001_008E",  # Pop. two or more races
    "B03001_003E",  # Pop. Hispanic origin
    "B05002_013E",  # Foreign-born
    "B08301_001E",  # Workers 16 yrs and over
    "B08301_010E",  # Commute to work by public transportation
    "B15003_001E",  # Population 25 yrs and over
    "B15003_016E",  # 12th grade, no diploma
    "B15003_017E",  # Graduated high school
    "B15003_021E",  # Associate's degree
    "B15003_022E",  # Bachelor's degree
]

# Geography: state 24, county 510 (returned as "Baltimore city, Maryland")
baltimore_city = censusdata.censusgeo([("state", "24"), ("county", "510")])

# 2021 ACS 5-year estimates for the geography above
data = censusdata.download("acs5", 2021, baltimore_city, ACS_VARIABLES)

census_df = pd.DataFrame(data)
census_df.head()

Unnamed: 0,B01001_001E,B02001_002E,B02001_003E,B02001_005E,B02001_008E,B03001_003E,B05002_013E,B08301_001E,B08301_010E,B15003_001E,B15003_016E,B15003_017E,B15003_021E,B15003_022E
"Baltimore city, Maryland: Summary level: 050, state:24> county:510",592211,173079,364879,14887,23091,33246,48142,271624,39122,414928,9620,95925,21425,71550


In [7]:
# Move the geography labels out of the index into a regular column named "index"
census_df = census_df.reset_index(drop=False)

In [8]:
# Give the former index column a descriptive name
geography_label = {"index": "Geography"}
census_df = census_df.rename(columns=geography_label)
census_df.head()

Unnamed: 0,Geography,B01001_001E,B02001_002E,B02001_003E,B02001_005E,B02001_008E,B03001_003E,B05002_013E,B08301_001E,B08301_010E,B15003_001E,B15003_016E,B15003_017E,B15003_021E,B15003_022E
0,"Baltimore city, Maryland: Summary level: 050, state:24> county:510",592211,173079,364879,14887,23091,33246,48142,271624,39122,414928,9620,95925,21425,71550


In [9]:
# Sanity check on the frame's dimensions (rows, columns)
census_df.shape

(1, 15)

In [10]:
# Inspect the dtype of every column
census_df.dtypes

Geography      object
B01001_001E     int64
B02001_002E     int64
B02001_003E     int64
B02001_005E     int64
B02001_008E     int64
B03001_003E     int64
B05002_013E     int64
B08301_001E     int64
B08301_010E     int64
B15003_001E     int64
B15003_016E     int64
B15003_017E     int64
B15003_021E     int64
B15003_022E     int64
dtype: object

In [11]:
# Convert the "Geography" values to plain strings and replace each comma with a space,
# in preparation for trimming the label down to the county code in the next step.
# NOTE(review): presumably the values start out as censusdata geography objects rather
# than strings, so the .astype(str) cast (not the comma removal) is what allows the
# .str methods used below to work — TODO confirm.
# regex=False makes the comma a literal match regardless of the pandas version's default.
census_df["Geography"] = (
    census_df["Geography"].astype(str).str.replace(",", " ", regex=False)
)
census_df

Unnamed: 0,Geography,B01001_001E,B02001_002E,B02001_003E,B02001_005E,B02001_008E,B03001_003E,B05002_013E,B08301_001E,B08301_010E,B15003_001E,B15003_016E,B15003_017E,B15003_021E,B15003_022E
0,Baltimore city Maryland: Summary level: 050 state:24> county:510,592211,173079,364879,14887,23091,33246,48142,271624,39122,414928,9620,95925,21425,71550


In [12]:
# Author's note: this step only worked after the previous cell had been run on the "Geography" column.

# Keep only the last 3 characters of each "Geography" value (the county code, e.g. "510").
# A negative start index counts from the END of the string, so .str[-3:] (equivalent to
# .str.slice(-3)) begins three characters before the end and runs to the end.
# see https://medium.com/geekculture/how-do-you-use-slice-method-in-the-pandas-dataframe-on-string-data-type-columns-6a8fd02c15eb
census_df["Geography"] = census_df["Geography"].str[-3:]
census_df

Unnamed: 0,Geography,B01001_001E,B02001_002E,B02001_003E,B02001_005E,B02001_008E,B03001_003E,B05002_013E,B08301_001E,B08301_010E,B15003_001E,B15003_016E,B15003_017E,B15003_021E,B15003_022E
0,510,592211,173079,364879,14887,23091,33246,48142,271624,39122,414928,9620,95925,21425,71550


In [13]:
# Prefix the census state code (24) to the city/county code (510) already held in the
# "Geography" column, giving "24510".
# This way, we can more easily merge with dataframes from other census dataframes for commercial corridors
STATE_FIPS = "24"
census_df["Geography"] = STATE_FIPS + census_df["Geography"]
census_df.head()

Unnamed: 0,Geography,B01001_001E,B02001_002E,B02001_003E,B02001_005E,B02001_008E,B03001_003E,B05002_013E,B08301_001E,B08301_010E,B15003_001E,B15003_016E,B15003_017E,B15003_021E,B15003_022E
0,24510,592211,173079,364879,14887,23091,33246,48142,271624,39122,414928,9620,95925,21425,71550


In [14]:
# List the column labels currently in the frame
census_df.columns

Index(['Geography', 'B01001_001E', 'B02001_002E', 'B02001_003E', 'B02001_005E',
       'B02001_008E', 'B03001_003E', 'B05002_013E', 'B08301_001E',
       'B08301_010E', 'B15003_001E', 'B15003_016E', 'B15003_017E',
       'B15003_021E', 'B15003_022E'],
      dtype='object')

In [15]:
# Create Geographic Identifier ("GEOID") for each census tract by adding state fips code + county fips code + census tract code
# see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html#:~:text=The%20full%20GEOID%20for%20many,codes%2C%20in%20which%20they%20nest.
#census_df["GEOID"] = census_df['state'] + census_df['county'] + census_df['tract']
#census_df

In [16]:
# Replace the census variable codes (such as "B01001_001E") with readable column names.
# rename() skips any key that is not a column of the frame, so the mapping may list
# codes that were not downloaded in this notebook.
COLUMN_LABELS = {
    "B19001_001E": "Total households",
    "B01001_001E": "Population",
    "tract": "Census Tract",
    # commuting
    "B08301_001E": "Workers_16_yrs_and_over",
    "B08301_010E": "Commute_to_work_public_transportation",
    # race / origin
    "B02001_002E": "Pop. white",
    "B02001_003E": "Pop. Black",
    "B02001_005E": "Pop. Asian",
    "B02001_008E": "Pop. two or more races",
    "B03001_003E": "Pop. Hispanic origin",
    "B05002_013E": "# Foreign-born",
    "B11002_001E": "Household population",
    # educational attainment
    "B15003_001E": "Population_25_yrs_and_over",
    "B15003_016E": "# persons 12th grade, no diploma",
    "B15003_017E": "# persons graduated high school",
    "B15003_021E": "# persons Associate's degree",
    "B15003_022E": "# persons Bachelor's degree",
}

census_df = census_df.rename(columns=COLUMN_LABELS)
census_df.head()



Unnamed: 0,Geography,Population,Pop. white,Pop. Black,Pop. Asian,Pop. two or more races,Pop. Hispanic origin,# Foreign-born,Workers_16_yrs_and_over,Commute_to_work_public_transportation,Population_25_yrs_and_over,"# persons 12th grade, no diploma",# persons graduated high school,# persons Associate's degree,# persons Bachelor's degree
0,24510,592211,173079,364879,14887,23091,33246,48142,271624,39122,414928,9620,95925,21425,71550


In [28]:
# Use .groupby and .agg to sum the amounts by corridor.
# NOTE(review): corridors_df is not created in any cell shown before this one — confirm
# the cell(s) that build it still exist so the notebook survives Restart & Run All.

# Count columns that are totalled within each corridor
COUNT_COLUMNS = [
    "Population",
    "Pop. white",
    "Pop. Black",
    "Pop. Asian",
    "Pop. two or more races",
    "Pop. Hispanic origin",
    "# Foreign-born",
    "Workers_16_yrs_and_over",
    "Commute_to_work_public_transportation",
    "Population_25_yrs_and_over",
    "# persons 12th grade, no diploma",
    "# persons graduated high school",
    "# persons Associate's degree",
    "# persons Bachelor's degree",
]

# The aggregation is requested by name ("sum"): passing the Python builtin `sum`
# to .agg is deprecated in recent pandas releases and emits a FutureWarning.
corridors_sum = corridors_df.groupby(["Corridor"], as_index=False).agg(
    {column: "sum" for column in COUNT_COLUMNS}
)
corridors_sum

Unnamed: 0,Corridor,Population,Pop. white,Pop. Black,Pop. Asian,Pop. two or more races,Pop. Hispanic origin,# Foreign-born,Workers_16_yrs_and_over,Commute_to_work_public_transportation,Population_25_yrs_and_over,"# persons 12th grade, no diploma",# persons graduated high school,# persons Associate's degree,# persons Bachelor's degree


In [29]:
def percent_of(frame: pd.DataFrame, part_col: str, whole_col: str) -> pd.Series:
    """Return ``part_col`` as a percentage of ``whole_col``, rounded to one decimal.

    Parameters
    ----------
    frame : DataFrame holding both count columns.
    part_col : name of the numerator column (the sub-population count).
    whole_col : name of the denominator column (the base population count).

    Returns
    -------
    Float Series aligned with ``frame``'s index.
    """
    # Both columns are cast to int first, exactly as the per-column cells did
    share = 100 * frame[part_col].astype(int) / frame[whole_col].astype(int)
    return share.astype(float).round(1)


# New percent column -> (numerator column, denominator column).
# Race/origin and foreign-born shares use total population; education shares use
# the population aged 25 and over. Order here is the order the columns are added.
PERCENT_COLUMNS = {
    "Percent Black": ("Pop. Black", "Population"),
    "Percent Hispanic origin": ("Pop. Hispanic origin", "Population"),
    "Percent White": ("Pop. white", "Population"),
    "Percent two or more races": ("Pop. two or more races", "Population"),
    "Percent 12th grade, no diploma": ("# persons 12th grade, no diploma", "Population_25_yrs_and_over"),
    "Percent graduated high school": ("# persons graduated high school", "Population_25_yrs_and_over"),
    "Percent earned Associate's degree": ("# persons Associate's degree", "Population_25_yrs_and_over"),
    "Percent earned Bachelor's degree": ("# persons Bachelor's degree", "Population_25_yrs_and_over"),
    "Percent Foreign-born": ("# Foreign-born", "Population"),
}

for percent_col, (part_col, whole_col) in PERCENT_COLUMNS.items():
    corridors_sum[percent_col] = percent_of(corridors_sum, part_col, whole_col)

corridors_sum

Unnamed: 0,Corridor,Population,Pop. white,Pop. Black,Pop. Asian,Pop. two or more races,Pop. Hispanic origin,# Foreign-born,Workers_16_yrs_and_over,Commute_to_work_public_transportation,...,# persons Bachelor's degree,Percent Black,Percent Hispanic origin,Percent White,Percent two or more races,"Percent 12th grade, no diploma",Percent graduated high school,Percent earned Associate's degree,Percent earned Bachelor's degree,Percent Foreign-born


In [38]:
# Work on an independent (deep) copy so later edits do not alter corridors_sum
corridors2021 = corridors_sum.copy(deep=True)
corridors2021

Unnamed: 0,Corridor,Population,Pop. white,Pop. Black,Pop. Asian,Pop. two or more races,Pop. Hispanic origin,# Foreign-born,Workers_16_yrs_and_over,Commute_to_work_public_transportation,...,# persons Bachelor's degree,Percent Black,Percent Hispanic origin,Percent White,Percent two or more races,"Percent 12th grade, no diploma",Percent graduated high school,Percent earned Associate's degree,Percent earned Bachelor's degree,Percent Foreign-born


In [None]:
# Bar chart (plotly express) of the ten corridors with the highest percentage of Black residents.
# corridors2021 is a copy of a frame that was already grouped by "Corridor", so the two
# columns are selected directly rather than grouped and summed again (a sum of
# percentages would not be a meaningful percentage).
Black = corridors2021[["Corridor", "Percent Black"]]
top_Black = Black.sort_values(by="Percent Black", ascending=False).head(10)

fig = px.bar(top_Black, x="Corridor", y="Percent Black")
fig.update_layout(
    {
        # transparent plot and paper backgrounds
        "plot_bgcolor": "rgba(0, 0, 0, 0)",
        "paper_bgcolor": "rgba(0, 0, 0, 0)",
        "title": "Corridors with highest percentage Black residents",
    }
)

fig.show()

In [None]:
# how to use Dash to share your plotly visualizations: https://towardsdatascience.com/dash-for-beginners-create-interactive-python-dashboards-338bfcb6ffa4

In [None]:
# Export df as an Excel file
#corridors_2021_final.to_excel("CommCorr_demographics_2021.xlsx", index = False)

In [None]:
# import csv file of Baltimore City (as a whole) demographics
# first save the data from the csv in variable, "baltcity_2021" 

#baltcity_2021 = "BaltCity_demographics_2021.csv"

In [None]:
# Create dataframes by reading the variable data from the code block above
#balt_df2 = pd.read_csv(baltcity_2021)
#balt_df2

In [None]:
#balt_df = balt_df2.reset_index(drop=True)

In [None]:
# stack the Baltimore City rows beneath the corridor rows (pd.concat with axis=0 appends rows; it does not merge on the "Corridor" column)
#corridors_balt_merge = pd.concat([corridors_2021_final, balt_df], axis=0)
#corridors_balt_merge = corridors_balt_merge.reset_index(drop=True)
#corridors_balt_merge.columns

In [None]:
# import csv file of Maryland demographics
# first save the data from the csv in variable, "MD_2021" 

#MD_2021 = "MD_demographics_2021.csv"

In [None]:
# Create dataframe by reading the variable data from the code block above
#MD_df = pd.read_csv(MD_2021)
#MD_df

In [None]:
#MD_df = MD_df.reset_index(drop=True)

In [None]:
# stack the Maryland rows beneath the corridor + Baltimore City rows (pd.concat with axis=0 appends rows; it does not merge on the "Corridor" column)
#Demog = pd.concat([corridors_balt_merge, MD_df], axis=0)
#Demog = Demog.reset_index(drop=True)
#Demog

In [None]:
# Change names of corridors (replace string using apply() function with lambda)
#Demographics_Master = Demog.apply(lambda x: x.replace({"Baltimore city, Maryland":"Baltimore City", "Penn Ave": "Pennsylvania Avenue",
#                                                       "Belair Rd": "Belair Road", "North Ave": "North Avenue"}, regex=True))
#Demographics_Master                   

In [None]:
#Demographics_Master.to_excel("CommCorr_Tableau-2021/CommCorr_Tableau_2021rev.xlsx")