In [100]:
import pandas as pd
import numpy as np

## 1. Import and clean employment data

I downloaded 12 datasets from the US Census website: one per year (2015-2020) for total US workforce numbers by sex, and one per year (2015-2020) for median earnings by sex. Link: https://data.census.gov/cedsci/table?q=b241%20&tid=ACSDT5Y2018.B24116

In [102]:
# Load the raw Census extracts: one workforce-total table and one
# median-earnings table per year, read newest year first.

(total_2020, total_2019, total_2018,
 total_2017, total_2016, total_2015) = map(pd.read_csv, (
    'total_2020.csv', 'total_2019.csv', 'total_2018.csv',
    'total_2017.csv', 'total_2016.csv', 'total_2015.csv',
))

(median_2020, median_2019, median_2018,
 median_2017, median_2016, median_2015) = map(pd.read_csv, (
    'median_2020.csv', 'median_2019.csv', 'median_2018.csv',
    'median_2017.csv', 'median_2016.csv', 'median_2015.csv',
))

In [105]:
# Inspect a single median-earnings table; the author notes that all of the
# tables look the same, so one is used as a representative.

# Print the (rows, columns) dimensions of the table.
print(median_2020.shape)

# Preview the first rows; as the cell's last expression it is rendered as the cell output.
median_2020.head()

(36, 4)


Unnamed: 0,Label (Grouping),United States!!Median earnings (dollars) for male!!Estimate,United States!!Median earnings (dollars) for female!!Estimate,United States!!Women's earnings as a percentage of men's earning!!Estimate
0,Civilian employed population 16 years and over...,46020,33108,71.9%
1,"Management, business, science, and arts oc...",75473,52221,69.2%
2,"Management, business, and financial oc...",80974,59748,73.8%
3,Management occupations,83297,61298,73.6%
4,Business and financial operations ...,75819,57439,75.8%


In [106]:
# Inspect one workforce-total table: its dimensions and its first rows.

# Print the (rows, columns) dimensions of the table.
print(total_2020.shape)

# Preview the first rows; as the cell's last expression it is rendered as the cell output.
total_2020.head()

(36, 5)


Unnamed: 0,Label (Grouping),United States!!Male!!Estimate,United States!!Percent Male!!Estimate,United States!!Female!!Estimate,United States!!Percent Female!!Estimate
0,Civilian employed population 16 years and over,81715497,52.4%,74173483,47.6%
1,"Management, business, science, and arts oc...",29118390,47.3%,32408516,52.7%
2,"Management, business, and financial oc...",13616574,54.5%,11366116,45.5%
3,Management occupations,9677998,59.1%,6695759,40.9%
4,Business and financial operations ...,3938576,45.7%,4670357,54.3%






The two sets of data (total workforce and median earnings) are the same length and have 36 occupations listed in the same order. We are only interested in the STEM occupations.




In [109]:
# Keep only the rows labelled 5 through 8 of every yearly table, i.e. the
# STEM-related occupation groups. `.loc` slices by label, so both ends are
# included and each table ends up with four rows.

(total_2020, total_2019, total_2018,
 total_2017, total_2016, total_2015) = (
    yearly.loc[5:8]
    for yearly in (total_2020, total_2019, total_2018,
                   total_2017, total_2016, total_2015)
)

(median_2020, median_2019, median_2018,
 median_2017, median_2016, median_2015) = (
    yearly.loc[5:8]
    for yearly in (median_2020, median_2019, median_2018,
                   median_2017, median_2016, median_2015)
)



Each dataset corresponds to one year, so I will insert a new column in each df and fill it with the year of that dataset.


In [110]:
# Tag every yearly table with the year it describes.

def _with_year_column(frame, year):
    """Return a copy of `frame` whose first column, "Year", holds `year` in every row.

    Any existing "Year" column is dropped first, so re-running this cell
    replaces the column instead of raising ValueError from `DataFrame.insert`.
    """
    tagged = frame.drop(columns="Year", errors="ignore")
    # A scalar is broadcast to every row, so the result no longer depends on
    # the rows carrying the index labels 5-8.
    tagged.insert(0, "Year", year)
    return tagged


total_2020 = _with_year_column(total_2020, 2020)
total_2019 = _with_year_column(total_2019, 2019)
total_2018 = _with_year_column(total_2018, 2018)
total_2017 = _with_year_column(total_2017, 2017)
total_2016 = _with_year_column(total_2016, 2016)
total_2015 = _with_year_column(total_2015, 2015)

median_2020 = _with_year_column(median_2020, 2020)
median_2019 = _with_year_column(median_2019, 2019)
median_2018 = _with_year_column(median_2018, 2018)
median_2017 = _with_year_column(median_2017, 2017)
median_2016 = _with_year_column(median_2016, 2016)
median_2015 = _with_year_column(median_2015, 2015)




The next set of code blocks will merge the datasets into one DataFrame.

In [111]:
# Stack the six yearly workforce tables row-wise, newest year first.
frames = [
    total_2020,
    total_2019,
    total_2018,
    total_2017,
    total_2016,
    total_2015,
]

totals = pd.concat(frames, axis=0)

In [8]:
# Stack the six yearly median-earnings tables row-wise, newest year first.
frames = [
    median_2020,
    median_2019,
    median_2018,
    median_2017,
    median_2016,
    median_2015,
]

medians = pd.concat(frames, axis=0)

In [112]:
# Attach the median-earnings columns to the workforce totals. Rows are matched
# on year and occupation label; the left join keeps every row of `totals`.
employment = totals.merge(
    medians,
    how="left",
    on=["Year", "Label (Grouping)"],
)

In [113]:
# Show the column labels of the merged frame (rendered as the cell's output)
employment.columns

Index(['Year', 'Label (Grouping)', 'United States!!Male!!Estimate',
       'United States!!Percent Male!!Estimate',
       'United States!!Female!!Estimate',
       'United States!!Percent Female!!Estimate',
       'United States!!Median earnings (dollars) for male!!Estimate',
       'United States!!Median earnings (dollars) for female!!Estimate',
       'United States!!Women's earnings as a percentage of men's earning!!Estimate'],
      dtype='object')






The column names are long and unclear, so I will rename them to make them more manageable.

In [114]:
# Map each verbose Census column label to a short, readable name and store the
# renamed table in a new frame; `employment` itself is left untouched.

column_renames = {
    "Label (Grouping)": "Field",
    "United States!!Male!!Estimate": "Male employed total",
    "United States!!Percent Male!!Estimate": "Male employed percent",
    "United States!!Female!!Estimate": "Female employed total",
    "United States!!Percent Female!!Estimate": "Female employed percent",
    "United States!!Median earnings (dollars) for male!!Estimate": "Male median earnings",
    "United States!!Median earnings (dollars) for female!!Estimate": "Female median earnings",
    "United States!!Women's earnings as a percentage of men's earning!!Estimate": "Women's earnings as % of men's",
}

employment_clean = employment.rename(columns=column_renames)

In [115]:
# Print the (rows, columns) dimensions of the renamed frame.
print(employment_clean.shape)
# Display the frame itself; as the cell's last expression it is rendered as the cell output.
employment_clean

(24, 9)


Unnamed: 0,Year,Field,Male employed total,Male employed percent,Female employed total,Female employed percent,Male median earnings,Female median earnings,Women's earnings as % of men's
0,2020,"Computer, engineering, and science occ...",7153628,73.7%,2556942,26.3%,83947,67458,80.4%
1,2020,Computer and mathematical occupations,3746788,73.9%,1326042,26.1%,86511,72285,83.6%
2,2020,Architecture and engineering occup...,2594744,84.0%,493864,16.0%,84637,71127,84.0%
3,2020,"Life, physical, and social science...",812096,52.4%,737036,47.6%,66958,57517,85.9%
4,2019,"Computer, engineering, and science occ...",6848123,73.9%,2420403,26.1%,82433,65595,79.6%
5,2019,Computer and mathematical occupations,3577083,73.9%,1264236,26.1%,85264,70853,83.1%
6,2019,Architecture and engineering occup...,2486105,84.4%,460457,15.6%,83334,69487,83.4%
7,2019,"Life, physical, and social science...",784935,53.0%,695710,47.0%,65688,55510,84.5%
8,2018,"Computer, engineering, and science occ...",6570508,74.1%,2301695,25.9%,84916,71530,84.2%
9,2018,Computer and mathematical occupations,3426408,73.9%,1207452,26.1%,87431,74720,85.5%


In [116]:
# Show the dtype pandas assigned to each column of the renamed frame
employment_clean.dtypes

Year                               int64
Field                             object
Male employed total               object
Male employed percent             object
Female employed total             object
Female employed percent           object
Male median earnings              object
Female median earnings            object
Women's earnings as % of men's    object
dtype: object

The numeric columns are incorrectly stored as object. This is because the cells that contain numbers also contain thousands separators (,) and percent signs (%). I will remove these characters and then set the new data types.

In [117]:
# Strip the characters that stop the numeric columns from being parsed:
# commas from the count and earnings columns, percent signs from the
# percentage columns.

comma_cols = ['Male employed total', 'Female employed total',
              'Male median earnings', 'Female median earnings']
percent_cols = ['Male employed percent', 'Female employed percent',
                "Women's earnings as % of men's"]

employment_clean[comma_cols] = employment_clean[comma_cols].replace(
    to_replace=r'(,)', value='', regex=True)

employment_clean[percent_cols] = employment_clean[percent_cols].replace(
    to_replace=r'(%)', value='', regex=True)


In [118]:
# Cast the cleaned text columns to numeric dtypes: head counts and
# percentages become floats, median earnings become integers.
float_cols = ['Male employed total', 'Male employed percent',
              'Female employed total', 'Female employed percent',
              "Women's earnings as % of men's"]
int_cols = ['Male median earnings', 'Female median earnings']

convert_dict = dict.fromkeys(float_cols, float)
convert_dict.update(dict.fromkeys(int_cols, int))

employment_clean = employment_clean.astype(convert_dict)

# confirm the resulting dtypes
print(employment_clean.dtypes)

Year                                int64
Field                              object
Male employed total               float64
Male employed percent             float64
Female employed total             float64
Female employed percent           float64
Male median earnings                int64
Female median earnings              int64
Women's earnings as % of men's    float64
dtype: object


In [33]:
# Write the cleaned employment frame to an Excel workbook, leaving out the
# row index.

employment_clean.to_excel(
    excel_writer='US_STEM_Employment_2015_2020.xlsx',
    sheet_name='sheet1',
    index=False,
)



## 2. Import and clean degree data

The US Census website contains data for degrees awarded by gender and field of study. The data is collected in a survey. link: https://data.census.gov/cedsci/table?q=enrolment%20degree&t=Education

The cleaning process will follow the same steps as for the workforce data


In [57]:
# Load the six yearly degree tables (2015-2020), newest year first.

(degree_2020, degree_2019, degree_2018,
 degree_2017, degree_2016, degree_2015) = map(pd.read_csv, (
    'degree_2020.csv', 'degree_2019.csv', 'degree_2018.csv',
    'degree_2017.csv', 'degree_2016.csv', 'degree_2015.csv',
))

In [58]:
# Keep only the rows labelled 1 and 2 of every yearly degree table.
# `.loc` slices by label, so both ends of 1:2 are included.

(degree_2020, degree_2019, degree_2018,
 degree_2017, degree_2016, degree_2015) = (
    yearly.loc[1:2]
    for yearly in (degree_2020, degree_2019, degree_2018,
                   degree_2017, degree_2016, degree_2015)
)

In [59]:
# Tag every yearly degree table with the year it describes.

def _with_degree_year(frame, year):
    """Return a copy of `frame` whose first column, "Year", holds `year` in every row.

    Any existing "Year" column is dropped first, so re-running this cell
    replaces the column instead of raising ValueError from `DataFrame.insert`.
    """
    tagged = frame.drop(columns="Year", errors="ignore")
    # A scalar is broadcast to every row, so the result no longer depends on
    # the rows carrying the index labels 1 and 2.
    tagged.insert(0, "Year", year)
    return tagged


degree_2020 = _with_degree_year(degree_2020, 2020)
degree_2019 = _with_degree_year(degree_2019, 2019)
degree_2018 = _with_degree_year(degree_2018, 2018)
degree_2017 = _with_degree_year(degree_2017, 2017)
degree_2016 = _with_degree_year(degree_2016, 2016)
degree_2015 = _with_degree_year(degree_2015, 2015)

In [None]:
# Stack the yearly degree tables row-wise into two groups:
# 2020-2017 in one frame and 2016-2015 in another.

degree_20_2017 = pd.concat(
    [degree_2020, degree_2019, degree_2018, degree_2017],
    axis=0,
)

degree_15_16 = pd.concat(
    [degree_2016, degree_2015],
    axis=0,
)

In [67]:
# Write both degree frames to Excel workbooks (row index omitted) so the
# remaining cleaning and wrangling can be done in Excel.

for frame, workbook in (
    (degree_20_2017, 'degree_clean_20-17.xlsx'),
    (degree_15_16, 'degree_clean_16_15.xlsx'),
):
    frame.to_excel(workbook, sheet_name='sheet1', index=False)

In [121]:
# Load the cleaned and merged degree table.
# NOTE(review): no visible cell writes 'degree_clean.xlsx'; presumably it is the
# result of the Excel clean-up of the two exported workbooks - confirm.
Degree_clean = pd.read_excel(io='degree_clean.xlsx')

In [125]:
# Show the dtype pandas assigned to each column of the degree table

Degree_clean.dtypes

Year                 int64
Field of Degree     object
Male Total           int64
Male Percent       float64
Female Total         int64
Female Percent     float64
dtype: object

In [126]:
# Print the (rows, columns) dimensions of the degree table.
print(Degree_clean.shape)
# Display the table itself; as the cell's last expression it is rendered as the cell output.
Degree_clean

(12, 6)


Unnamed: 0,Year,Field of Degree,Male Total,Male Percent,Female Total,Female Percent
0,2020,Science and Engineering,15307579,0.441,10535634,0.272
1,2020,Science and Engineering Related Fields,1987903,0.057,5102639,0.132
2,2019,Science and Engineering,14846326,0.441,10052918,0.27
3,2019,Science and Engineering Related Fields,1913361,0.057,4868815,0.131
4,2018,Science and Engineering,14440302,0.44,9637921,0.267
5,2018,Science and Engineering Related Fields,1849503,0.056,4657264,0.129
6,2017,Science and Engineering,14024494,0.439,9273481,0.266
7,2017,Science and Engineering Related Fields,1797426,0.056,4452275,0.128
8,2016,Science and Engineering,14290685,0.44,9514192,0.266
9,2016,Science and Engineering Related Fields,1844637,0.057,4643226,0.13
