In [173]:
import pandas as pd

In [166]:
from pandas_profiling import ProfileReport

In [167]:
# Load the recent-graduates dataset from 'recent-grads.csv' (path is relative to
# the notebook's working directory) into the DataFrame used by every cell below.
df = pd.read_csv('recent-grads.csv')
# A bare last expression makes the notebook render the frame (pandas truncates long output).
df

Unnamed: 0,Rank,Major_code,Major,Total,Men,Women,Major_category,ShareWomen,Sample_size,Employed,...,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,1,2419,PETROLEUM ENGINEERING,2339.0,2057.0,282.0,Engineering,0.120564,36,1976,...,270,1207,37,0.018381,110000,95000,125000,1534,364,193
1,2,2416,MINING AND MINERAL ENGINEERING,756.0,679.0,77.0,Engineering,0.101852,7,640,...,170,388,85,0.117241,75000,55000,90000,350,257,50
2,3,2415,METALLURGICAL ENGINEERING,856.0,725.0,131.0,Engineering,0.153037,3,648,...,133,340,16,0.024096,73000,50000,105000,456,176,0
3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,1258.0,1123.0,135.0,Engineering,0.107313,16,758,...,150,692,40,0.050125,70000,43000,80000,529,102,0
4,5,2405,CHEMICAL ENGINEERING,32260.0,21239.0,11021.0,Engineering,0.341631,289,25694,...,5180,16697,1672,0.061098,65000,50000,75000,18314,4440,972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,169,3609,ZOOLOGY,8409.0,3050.0,5359.0,Biology & Life Science,0.637293,47,6259,...,2190,3602,304,0.046320,26000,20000,39000,2771,2947,743
169,170,5201,EDUCATIONAL PSYCHOLOGY,2854.0,522.0,2332.0,Psychology & Social Work,0.817099,7,2125,...,572,1211,148,0.065112,25000,24000,34000,1488,615,82
170,171,5202,CLINICAL PSYCHOLOGY,2838.0,568.0,2270.0,Psychology & Social Work,0.799859,13,2101,...,648,1293,368,0.149048,25000,25000,40000,986,870,622
171,172,5203,COUNSELING PSYCHOLOGY,4626.0,931.0,3695.0,Psychology & Social Work,0.798746,21,3777,...,965,2738,214,0.053621,23400,19200,26000,2403,1245,308


In [168]:
# Build a pandas_profiling report for the full DataFrame and write it to
# 'profile-recent-grads.html' (relative to the notebook's working directory).
ProfileReport(df).to_file(output_file='profile-recent-grads.html')

Summarize dataset:   0%|          | 0/35 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# How many majors are in each major category?

#### Applying the groupby() and count() methods to our dataframe gives the number of majors in each major category across the entire dataframe. We can then sort these counts in descending order with sort_values(ascending = False), so the largest category is at the top and the smallest is at the bottom.

In [169]:
# Count the rows (majors) in each Major_category, then list the largest category first.
(
    df.groupby('Major_category')['Major_category']
    .count()
    .sort_values(ascending=False)
)

Major_category
Engineering                            29
Education                              16
Humanities & Liberal Arts              15
Biology & Life Science                 14
Business                               13
Health                                 12
Computers & Mathematics                11
Agriculture & Natural Resources        10
Physical Sciences                      10
Psychology & Social Work                9
Social Science                          9
Arts                                    8
Industrial Arts & Consumer Services     7
Law & Public Policy                     5
Communications & Journalism             4
Interdisciplinary                       1
Name: Major_category, dtype: int64

# Which majors are most dominated by men?

#### Using the sort_values(by = 'ShareWomen') method, I first sorted the majors by the share of women in ascending order, so that the majors with the smallest share of women (that is, the majors most dominated by men) are at the top. Next, I calculated the share of men per major by subtracting the share of women from one. To do this, I took the ShareWomen values in the same ascending order so that they line up with the list of majors. Since the new values represent the share of men rather than women, I also renamed the series to ShareMen. Finally, I joined the two series together to create a smaller dataframe containing just this information.

In [219]:
# Sort once by the share of women (ascending), so the most male-dominated majors
# come first, and derive both series from that single sorted frame instead of
# repeating the same sort_values call.
by_share_women = df.sort_values(by='ShareWomen', ascending=True)

# Major names in male-dominated-first order.
men_majors = by_share_women['Major']

# Share of men is the complement of the share of women; rows where ShareWomen is
# missing stay missing.
share_men_per_major = (1 - by_share_women['ShareWomen']).rename('ShareMen')

# join() aligns the two series on the shared row index, giving a two-column
# frame of Major and ShareMen.
men_majors.to_frame().join(share_men_per_major)

Unnamed: 0,Major,ShareMen
73,MILITARY TECHNOLOGIES,1.000000
66,MECHANICAL ENGINEERING RELATED TECHNOLOGIES,0.922547
26,CONSTRUCTION SERVICES,0.909287
1,MINING AND MINERAL ENGINEERING,0.898148
3,NAVAL ARCHITECTURE AND MARINE ENGINEERING,0.892687
...,...,...
138,ELEMENTARY EDUCATION,0.076255
51,MEDICAL ASSISTING SERVICES,0.072193
163,COMMUNICATION DISORDERS SCIENCES AND SERVICES,0.032002
164,EARLY CHILDHOOD EDUCATION,0.031046


# Which majors in the Computers & Mathematics major category have the highest median salary?

#### I selected all the rows that have a Major_category of 'Computers & Mathematics' using the loc indexer. I then sorted those rows by descending median salary using sort_values(by='Median', ascending = False). Next, I took the top 10 of those majors using the head() method. Finally, I displayed that information by selecting all of the rows from the resulting data set with only the Major and corresponding Median (median salary) columns.

In [175]:
# Keep only the 'Computers & Mathematics' rows, order them by median salary
# (highest first), and retain the first ten.
top_median_comp_and_math = (
    df.loc[df['Major_category'] == 'Computers & Mathematics']
    .sort_values(by='Median', ascending=False)
    .head(10)
)
# Show just the major name alongside its median salary.
top_median_comp_and_math[['Major', 'Median']]


Unnamed: 0,Major,Median
20,COMPUTER SCIENCE,53000
41,MATHEMATICS,45000
42,COMPUTER AND INFORMATION SYSTEMS,45000
45,INFORMATION SCIENCES,45000
46,STATISTICS AND DECISION SCIENCE,45000
47,APPLIED MATHEMATICS,45000
52,MATHEMATICS AND COMPUTER SCIENCE,42000
53,COMPUTER PROGRAMMING AND DATA PROCESSING,41300
81,COMPUTER ADMINISTRATION MANAGEMENT AND SECURITY,37500
84,COMPUTER NETWORKING AND TELECOMMUNICATIONS,36400


# What is the most popular Computers & Mathematics major?

#### As in the previous question, I began by selecting all the rows that have a major category of 'Computers & Mathematics'. Next, I sorted those majors by the total number of people enrolled in the major (its popularity). I did this in descending order using the ascending = False argument, so that the major with the largest number of students enrolled is at the top of the list. I then chose the top 10 rows of this data set using the head() method. Finally, I selected all of those rows with only the columns 'Major' and 'Total' to display the information.

In [176]:
# Keep only the 'Computers & Mathematics' rows, order them by total enrolment
# (largest first), and retain the first ten.
top_10_comp_and_math_majors = (
    df.loc[df['Major_category'] == 'Computers & Mathematics']
    .sort_values(by='Total', ascending=False)
    .head(10)
)
# Show just the major name alongside its total enrolment.
top_10_comp_and_math_majors[['Major', 'Total']]


Unnamed: 0,Major,Total
20,COMPUTER SCIENCE,128319.0
41,MATHEMATICS,72397.0
42,COMPUTER AND INFORMATION SYSTEMS,36698.0
105,COMMUNICATION TECHNOLOGIES,18035.0
45,INFORMATION SCIENCES,11913.0
81,COMPUTER ADMINISTRATION MANAGEMENT AND SECURITY,8066.0
84,COMPUTER NETWORKING AND TELECOMMUNICATIONS,7613.0
46,STATISTICS AND DECISION SCIENCE,6251.0
47,APPLIED MATHEMATICS,4939.0
53,COMPUTER PROGRAMMING AND DATA PROCESSING,4168.0


# Which majors have the highest correlation with a college job in the 75th percentile of earnings?

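#### Explanation of Code: I first computed the pairwise correlation matrix of the dataframe's numeric columns with corr(). The max_corr(item) function then takes the column of correlations for item, drops item's correlation with itself (which is always 1), and uses idxmax() to return the name of the column that has the highest correlation with item.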

In [217]:
# Pairwise correlation matrix of the numeric columns only. The frame also holds
# string columns (Major, Major_category); calling DataFrame.corr() on them raises
# in pandas >= 2.0, where numeric_only defaults to False. Selecting numeric
# dtypes first works on every pandas version.
df_corr = df.select_dtypes(include='number').corr()


def max_corr(item):
    """Return the name of the column most positively correlated with `item`.

    Looks up `item`'s column in the module-level correlation matrix `df_corr`,
    removes `item`'s correlation with itself (always 1), and returns the label
    of the largest remaining value. The comparison is on the signed
    correlation, so a strong negative correlation is never selected.

    Parameters
    ----------
    item : str
        Name of a numeric column present in `df_corr`.

    Returns
    -------
    The label of the column whose correlation with `item` is highest.

    Raises
    ------
    KeyError
        If `item` is not a column of `df_corr`.
    """
    strongest = df_corr[item].drop(item).idxmax()
    return strongest

