# US college graduate employment analysis: summarizing data

## Data preview

In [11]:
import pandas as pd
import numpy as np

# load both employment tables from CSV files in the working directory
all_ages, recent_grads = (
    pd.read_csv(csv_name) for csv_name in ('all-ages.csv', 'recent-grads.csv')
)

# show the column names of each table under a one-line description
print('Employment data by major for all ages\n', list(all_ages.columns))
print('Employment data by major for recent college graduates only\n', list(recent_grads.columns))

Employment data by major for all ages
 ['Major_code', 'Major', 'Major_category', 'Total', 'Employed', 'Employed_full_time_year_round', 'Unemployed', 'Unemployment_rate', 'Median', 'P25th', 'P75th']
Employment data by major for recent college graduates only
 ['Rank', 'Major_code', 'Major', 'Total', 'Men', 'Women', 'Major_category', 'ShareWomen', 'Sample_size', 'Employed', 'Full_time', 'Part_time', 'Full_time_year_round', 'Unemployed', 'Unemployment_rate', 'Median', 'P25th', 'P75th', 'College_jobs', 'Non_college_jobs', 'Low_wage_jobs']


## Mission 1: inspect the number of students grouped by major category

In [9]:
# Total number of students per major category, as flat {category: count} dicts.
#
# groupby().sum() aggregates every category in a single pass, replacing both
# the discarded pivot_table result (whose DataFrame.to_dict() would have been
# nested under a 'Total' key) and the per-category boolean-mask loops.
# sum() skips missing 'Total' values, so the result may be float; the cast
# makes both dicts hold integer counts.
aa_cat_counts = all_ages.groupby('Major_category')['Total'].sum().astype(int).to_dict()
rg_cat_counts = recent_grads.groupby('Major_category')['Total'].sum().astype(int).to_dict()

# reader-friendly display: one category per line, sorted by category name
print(pd.Series(aa_cat_counts))
print(pd.Series(rg_cat_counts))

Agriculture & Natural Resources         632437
Arts                                   1805865
Biology & Life Science                 1338186
Business                               9858741
Communications & Journalism            1803822
Computers & Mathematics                1781378
Education                              4700118
Engineering                            3576013
Health                                 2950859
Humanities & Liberal Arts              3738335
Industrial Arts & Consumer Services    1033798
Interdisciplinary                        45199
Law & Public Policy                     902926
Physical Sciences                      1025318
Psychology & Social Work               1987278
Social Science                         2654125
Name: 0, dtype: int64
Agriculture & Natural Resources          75620
Arts                                    357130
Biology & Life Science                  453862
Business                               1302376
Communications & Journalism           

## Mission 2: test the claim that graduates are 'likely' to get low-pay jobs

In [17]:
# proportion of recent graduates in low-wage jobs, pooled over every major
low_wage_proportion = (
    recent_grads.loc[:, 'Low_wage_jobs'].sum()
    / recent_grads.loc[:, 'Total'].sum()
)
# report the proportion as a percentage
print('low wage percentage:', str(100 * low_wage_proportion) + '%')

low wage percentage: 9.858891195563151%


The data shows that only about 10% of recent graduates hold low-wage jobs. 
<br>Let's also compare the unemployment rates of recent graduates with those of graduates of all ages.

In [20]:
# Count the majors in which recent graduates have a lower unemployment rate
# than graduates of all ages.
majors = recent_grads['Major'].unique()

# Align the two tables once on the major name instead of filtering both
# frames with a boolean mask for every major. The inner join keeps only
# majors present in both tables, so a major missing from one side is
# excluded rather than compared against 0 or raising IndexError.
unemp_rates = recent_grads[['Major', 'Unemployment_rate']].merge(
    all_ages[['Major', 'Unemployment_rate']],
    on='Major',
    suffixes=('_recent', '_all_ages'),
)

# one solution: vectorized comparison of the two aligned columns
rg_lower_count = int(
    (unemp_rates['Unemployment_rate_recent'] < unemp_rates['Unemployment_rate_all_ages']).sum()
)
print(rg_lower_count, 'out of', len(unemp_rates))

# another solution: explicit loop over the aligned pairs of rates
rg_lower_count = 0
for rg_unemp_rate, aa_unemp_rate in zip(
    unemp_rates['Unemployment_rate_recent'],
    unemp_rates['Unemployment_rate_all_ages'],
):
    if rg_unemp_rate < aa_unemp_rate:
        rg_lower_count += 1
print(rg_lower_count, 'out of', len(unemp_rates))

44 out of 173
44 out of 173


In 44 of the 173 majors, recent graduates have a lower unemployment rate than graduates of all ages, which indicates that recent graduates fare better in only about a quarter of the majors. 