# Visualization of the U.S. Race and Ethnicity Distribution over the Years (2016 - 2060)


#### Mengfei Lan
#### IS445, Spring 2021

In [41]:
import pandas as pd
import numpy as np

import bqplot
import numpy as np
import ipywidgets

#### The dataset I am using is from the United States Census Bureau (https://www2.census.gov/programs-surveys/popproj/datasets/2017/2017-popproj/np2017_d1_mid.csv). The dataset is named "The U.S. National Dataset: Population by Sex, Race, and Hispanic Origin for the United States: 2016 to 2060". From this dataset, we can learn the race, sex, and Hispanic-origin population distribution in the United States from 2016 to 2060. The data after 2017 is projected. We can also create a population pyramid based on the dataset.

#### The data contained in the dataset is clean and standard. There is no need to do further data cleaning. 


In [92]:
# Source CSV published by the United States Census Bureau; it is read directly
# from the web, so this cell needs network access.
CENSUS_PROJECTIONS_URL = "https://www2.census.gov/programs-surveys/popproj/datasets/2017/2017-popproj/np2017_d1_mid.csv"
dataset = pd.read_csv(CENSUS_PROJECTIONS_URL)

#### Let's have an overview of the dataset. 

In [93]:
dataset

Unnamed: 0,SEX,ORIGIN,RACE,YEAR,TOTAL_POP,POP_0,POP_1,POP_2,POP_3,POP_4,...,POP_91,POP_92,POP_93,POP_94,POP_95,POP_96,POP_97,POP_98,POP_99,POP_100
0,0,0,0,2016,323127513,3970145,3995008,3992154,3982074,3987656,...,449986,372625,300000,239313,186408,135797,94311,68972,44895,81896
1,0,0,0,2017,325511184,4054035,3982964,4008116,4003478,3992207,...,449945,382669,311525,246219,192531,146801,104540,70840,50486,83574
2,0,0,0,2018,327891911,4075563,4068172,3995888,4019345,4013649,...,462335,382993,320285,256011,198354,151848,113165,78659,51938,86221
3,0,0,0,2019,330268840,4095614,4089881,4082231,4006967,4029427,...,467488,393919,320884,263533,206526,156654,117240,85265,57778,87671
4,0,0,0,2020,332639102,4113164,4110117,4104058,4094281,4016919,...,464985,398712,330389,264318,212880,163348,121128,88491,62724,92064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4855,2,2,11,2056,350520,6293,6217,6146,6086,6022,...,651,571,477,410,318,273,200,159,118,214
4856,2,2,11,2057,355471,6378,6294,6219,6151,6091,...,638,586,508,413,355,271,228,166,126,227
4857,2,2,11,2058,360483,6467,6377,6296,6224,6156,...,667,572,521,442,357,299,226,187,133,247
4858,2,2,11,2059,365543,6566,6467,6379,6301,6229,...,698,598,511,453,381,302,249,187,153,269


#### ORIGIN : 0 = Total   1 = Not Hispanic   2 = Hispanic
#### YEAR: Year of projection (July 1, 2016 to July 1, 2060)
#### TOTAL_POP: Population total (all ages combined) in each year
#### (POP_0, POP_1, ...POP_99, POP_100): Population age x as of July 1 (columns for ages 0, 1, 2, ...98, 99, 100 or more years old)
#### RACE:
####       0 = All races (codes 1 through 6)
####       1 = White alone
####       2 = Black alone
####       3 = AIAN alone
####       4 = Asian alone
####       5 = NHPI alone
####       6 = Two or More Races
####       7 = White alone or in combination
####       8 = Black alone or in combination
####       9 = AIAN alone or in combination
####       10 = Asian alone or in combination
####       11 = NHPI alone or in combination
#### SEX : 0 = Both sexes  1 = Male  2 = Female
#### NOTE: Hispanic origin is considered an ethnicity, not a race. Hispanics may be of any race.
#### All data are integers. 


### Data Visualization 1: Projected Population by race over the years

#### The following interactive chart shows the population distribution of races over the years. The slider below selects the year from 2016 to 2060, and the drop-down list selects the origin (Hispanic or not). With this interactive chart, we can easily see the race and origin distribution in different years.


In [94]:
@ipywidgets.interact(x=ipywidgets.IntSlider(min=2016, max=2060, step=1, value=2016), origin = ["Both Hispanic Origins", "Non-Hispanic", "Hispanic"])
def position(x, origin):
    """Draw a bar chart of the projected population of each race group.

    Parameters
    ----------
    x : int
        Projection year chosen with the slider (2016-2060).
    origin : str
        Drop-down label selecting the Hispanic-origin filter; one of
        "Both Hispanic Origins", "Non-Hispanic" or "Hispanic".

    Raises
    ------
    KeyError
        If ``origin`` is not one of the three drop-down labels.
    """
    # Drop-down label -> ORIGIN code used in the dataset.
    origin_codes = {"Both Hispanic Origins": 0, "Non-Hispanic": 1, "Hispanic": 2}
    origin_data = origin_codes[origin]

    # Keep only the "both sexes" rows (SEX == 0). The table also carries
    # separate male (1) and female (2) rows, so summing TOTAL_POP across every
    # SEX value would count each person twice.
    new = dataset[(dataset["YEAR"] == x)
                  & (dataset["ORIGIN"] == origin_data)
                  & (dataset["SEX"] == 0)]
    new = new.groupby(['RACE'], as_index=False)['TOTAL_POP'].sum()

    ax = new.plot.bar(x = "RACE", y = "TOTAL_POP", figsize=(15,5))
    ax.set_xlabel("Race code")
    ax.set_ylabel("Population")
    ax.set_title("Projected population by race, {} ({})".format(x, origin))

interactive(children=(IntSlider(value=2016, description='x', max=2060, min=2016), Dropdown(description='origin…

### Data Visualization 2: Change of Race-Sex Population over the years 

#### The following chart is a heat map showing the race-sex population trend over the years.



In [95]:
# Population per (SEX, RACE) cell, summed over all projection years; this
# table colors the heat map. Only the ORIGIN == 0 ("Total") rows are used:
# ORIGIN 1 and 2 split that same population into Not Hispanic / Hispanic, so
# including them would count everyone twice. The string "sum" replaces np.sum,
# which recent pandas versions warn about when passed as aggfunc.
data = pd.pivot_table(dataset[dataset["ORIGIN"] == 0], values='TOTAL_POP',
                      index=['SEX'], columns=['RACE'], aggfunc="sum")


In [96]:
# Text label whose value is set by the heat-map click handler (on_selected).
mySelectedLabel = ipywidgets.Label()

In [97]:
# Initial trend line: total U.S. population per projection year.
# Only the aggregate rows (both sexes, total origin, all races) are used; the
# other rows are breakdowns of that same population, so summing every row of
# a year would count each person many times over.
total_rows = dataset[(dataset["SEX"] == 0)
                     & (dataset["ORIGIN"] == 0)
                     & (dataset["RACE"] == 0)]
linedata = total_rows.groupby("YEAR")["TOTAL_POP"].sum()
xl = linedata.index  # projection year
yl = linedata.values # total population in that year
x_scl = bqplot.LinearScale()
y_scl = bqplot.LinearScale()


x_axl = bqplot.Axis(scale = x_scl, 
                    label = 'Year')
y_axl = bqplot.Axis(scale = y_scl, 
                    orientation = 'vertical', 
                    label='Total Population')

lines = bqplot.Lines(x = xl, y = yl, scales = {'x': x_scl, 'y': y_scl})

figl = bqplot.Figure(marks = [lines], axes = [x_axl, y_axl])

In [98]:
def on_selected(change):
    """Redraw the trend line for the heat-map cell the user clicked.

    Registered as an observer of the heat map's ``selected`` trait. When
    exactly one cell is selected, the cell's row/column position is translated
    into SEX and RACE codes through the pivot table ``data``, the label is set
    to that cell's value, and the global ``lines`` mark is updated with the
    yearly population of that race-sex combination.

    Parameters
    ----------
    change : dict
        Change notification; ``change['owner'].selected`` is read as a
        sequence of ``[row, column]`` index pairs.
    """
    selected = change['owner'].selected
    # Nothing to draw when the selection is cleared or spans several cells.
    if selected is None or len(selected) != 1:
        return

    row, col = (int(i) for i in selected[0])
    # Map grid positions to the actual codes rather than assuming that the
    # row position happens to equal the SEX code.
    sex = int(data.index[row])
    race = int(data.columns[col])
    mySelectedLabel.value = 'Data Value = ' + str(data.iloc[row, col]) # set our label

    # ORIGIN == 0 is the "Total" row; ORIGIN 1 and 2 split the same people into
    # Not Hispanic / Hispanic, so summing all three would double the counts.
    mask = ((dataset["RACE"] == race)
            & (dataset["SEX"] == sex)
            & (dataset["ORIGIN"] == 0))
    new_df = dataset.loc[mask].groupby("YEAR")["TOTAL_POP"].sum().reset_index()
    lines.x = new_df['YEAR']
    lines.y = new_df['TOTAL_POP']

In [99]:
# --- Heat map (left panel): one cell per (SEX, RACE) entry of `data` ---
col_sc = bqplot.ColorScale(scheme = "Reds")
x_sc = bqplot.OrdinalScale()
y_sc = bqplot.OrdinalScale()

c_ax = bqplot.ColorAxis(scale = col_sc, 
                        orientation = 'vertical', 
                        side = 'right')
x_ax = bqplot.Axis(scale = x_sc, label = "Race")
y_ax = bqplot.Axis(scale = y_sc, 
                   orientation = 'vertical',
                   label = "Sex")

heat_map = bqplot.GridHeatMap(color = data,
                              scales = {'color': col_sc,
                                        'row': y_sc,
                                        'column': x_sc},
                              interactions = {'click': 'select'}, # make interactive on click of each box
                              anchor_style = {'fill':'blue'}, # to make our selection blue
                              selected_style = {'opacity': 1.0}, # make 100% opaque if box is selected
                              unselected_style = {'opacity': 0.8}) # make a little see-through if not

# Call on_selected whenever the selected cell changes.
heat_map.observe(on_selected, 'selected')

# --- Trend line (right panel) ---
x_scl = bqplot.LinearScale()
y_scl = bqplot.LinearScale()

x_axl = bqplot.Axis(scale = x_scl, 
                    label = 'Year')
y_axl = bqplot.Axis(scale = y_scl, 
                    orientation = 'vertical', 
                    label='Total Population')


fig = bqplot.Figure(marks = [heat_map], 
                    axes = [c_ax, y_ax, x_ax])
# Rebind `lines` to a new mark on the fresh scales, seeded with the data of
# the line created earlier. on_selected assigns to the global name `lines`,
# so this is the mark it updates on a click.
lines = bqplot.Lines(x = lines.x, y = lines.y, scales = {'x': x_scl, 'y': y_scl})
fig_dur = bqplot.Figure(marks = [lines], axes = [x_axl, y_axl])
fig.layout.min_width = '500px' # feel free to change for your screen
fig_dur.layout.min_width = '500px'

# Put the label above the two figures; without it in the layout, the text
# that on_selected writes to mySelectedLabel is never shown.
figures = ipywidgets.VBox([mySelectedLabel, ipywidgets.HBox([fig, fig_dur])])

In [100]:
figures

HBox(children=(Figure(axes=[ColorAxis(orientation='vertical', scale=ColorScale(scheme='Reds'), side='right'), …

#### "Race" is set as the x-axis of the heat map and "Sex" as the y-axis. Clicking a block in the heat map on the left displays, on the right, a line chart of the population trend of that race-sex combination over the next forty years. From the trend lines, we can see that the population of every race-sex combination will continue growing for the next several years. For some combinations, such as race 1 (White alone) and sex 1 (male), growth will slow in the 2030s. Other combinations, such as race 6 (Two or More Races) and sex 1 (male), will grow faster over the next few years.

### Data Visualization 3: Population Pyramid by Race and Origin over the Years

#### The following chart is an interactive population pyramid. Users can choose the race, year, and origin according to their needs. The plotly package is used to create the population pyramids.

In [101]:
import sys
# Install plotly with conda into the prefix of the interpreter running this
# kernel; IPython interpolates {sys.prefix} into the shell command.
!conda install --yes --prefix {sys.prefix} plotly

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.9.0
  latest version: 4.10.1

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.



In [102]:
import plotly.graph_objects as gp

In [103]:
@ipywidgets.interact(race=ipywidgets.IntSlider(min=0, max=11, step=1), year = ipywidgets.IntSlider(min=2016, max=2060, step=1, value=2016), origin = ["Both Hispanic Origins", "Non-Hispanic", "Hispanic"])
def population_pyramid(race, year, origin):
    """Show a population pyramid for one race, year and Hispanic origin.

    Male counts are drawn as positive horizontal bars and female counts as
    negative ones, so the two sexes mirror each other around zero.

    Parameters
    ----------
    race : int
        RACE code chosen with the slider (0-11).
    year : int
        Projection year chosen with the slider (2016-2060).
    origin : str
        Drop-down label selecting the Hispanic-origin filter; one of
        "Both Hispanic Origins", "Non-Hispanic" or "Hispanic".

    Raises
    ------
    KeyError
        If ``origin`` is not one of the three drop-down labels.
    """
    # Drop-down label -> ORIGIN code used in the dataset.
    origin_codes = {"Both Hispanic Origins": 0, "Non-Hispanic": 1, "Hispanic": 2}
    origin_data = origin_codes[origin]

    def age_counts(sex_code):
        """Return the per-age population columns of the first matching row for one sex."""
        rows = dataset[(dataset["RACE"] == race)
                       & (dataset["YEAR"] == year)
                       & (dataset["SEX"] == sex_code)
                       & (dataset["ORIGIN"] == origin_data)]
        # Drop the key/total columns so only the per-age columns remain.
        return rows.drop(['SEX', 'ORIGIN', 'RACE', 'YEAR', 'TOTAL_POP'], axis=1).iloc[0]

    new_m = age_counts(1)
    new_f = age_counts(2) * -1  # negate so female bars extend to the left
    y_age = new_m.index.tolist()

    fig = gp.Figure()
    # Adding Male data to the figure
    fig.add_trace(gp.Bar(y = y_age, x = new_m,
                         name = 'Male',
                         orientation = 'h'))

    # Adding Female data to the figure
    fig.add_trace(gp.Bar(y = y_age, x = new_f,
                         name = 'Female',
                         orientation = 'h'))

    # The default barmode ('group') places the two bars of one age side by
    # side; 'overlay' draws them on the same row so they form a pyramid.
    fig.update_layout(barmode = 'overlay',
                      title = "Population pyramid: race {}, {} ({})".format(race, year, origin),
                      xaxis_title = "Population (female shown as negative values)",
                      yaxis_title = "Age column")
    fig.show()


interactive(children=(IntSlider(value=0, description='race', max=11), IntSlider(value=2016, description='year'…

#### The population pyramid is shown above. The population at each age can be read from the pyramid (for example, POP_40 represents people aged 40).