In [455]:
import pandas as pd

In [456]:
# Load the salary-by-major dataset and preview the first rows.
DATA_PATH = "salaries_by_college_major.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
0,Accounting,46000.0,77100.0,42200.0,152000.0,Business
1,Aerospace Engineering,57700.0,101000.0,64300.0,161000.0,STEM
2,Agriculture,42600.0,71900.0,36300.0,150000.0,Business
3,Anthropology,36800.0,61500.0,33800.0,138000.0,HASS
4,Architecture,41600.0,76800.0,50600.0,136000.0,Business


In [457]:
# .shape is (rows, columns)
df.shape

(51, 6)

In [458]:
# List the column labels to see which salary measures are available.
df.columns

Index(['Undergraduate Major', 'Starting Median Salary',
       'Mid-Career Median Salary', 'Mid-Career 10th Percentile Salary',
       'Mid-Career 90th Percentile Salary', 'Group'],
      dtype='object')

In [459]:
# dropna() returns a new DataFrame without the rows that contain a NaN value
# (df.isna() can be used beforehand to see where the missing values are).
# The original df is left untouched.
clear_df = df.dropna()
clear_df.tail()

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
45,Political Science,40800.0,78200.0,41200.0,168000.0,HASS
46,Psychology,35900.0,60400.0,31600.0,127000.0,HASS
47,Religion,34100.0,52000.0,29700.0,96400.0,HASS
48,Sociology,36500.0,58200.0,30700.0,118000.0,HASS
49,Spanish,34000.0,53100.0,31000.0,96400.0,HASS


#### Find College Major with Highest Starting Salaries

In [460]:
# Highest starting median salary across all majors.
clear_df["Starting Median Salary"].max()

74300.0

In [461]:
# .idxmax() returns the index label of the row holding the highest value;
# .loc then returns that whole row.
clear_df.loc[clear_df["Starting Median Salary"].idxmax()]

Undergraduate Major                  Physician Assistant
Starting Median Salary                           74300.0
Mid-Career Median Salary                         91700.0
Mid-Career 10th Percentile Salary                66400.0
Mid-Career 90th Percentile Salary               124000.0
Group                                               STEM
Name: 43, dtype: object

#### Find College Major with Highest Mid-Career Salaries

In [462]:
# Find the index label of the row with the highest mid-career median salary,
# then read that row's major with a single label-based .loc lookup
# (avoids chained df[col][label] indexing).
top_mid_career_idx = clear_df["Mid-Career Median Salary"].idxmax()
clear_df.loc[top_mid_career_idx, "Undergraduate Major"]

'Chemical Engineering'

#### Find College Major with Lowest Starting Salaries

In [463]:
# Find the index label of the row with the lowest starting median salary,
# then read that row's major with a single label-based .loc lookup
# (avoids chained df[col][label] indexing).
lowest_starting_idx = clear_df["Starting Median Salary"].idxmin()
clear_df.loc[lowest_starting_idx, "Undergraduate Major"]

'Spanish'

In [464]:
# Lowest starting median salary across all majors.
clear_df["Starting Median Salary"].min()

34000.0

#### Majors with the Most Potential vs Lowest Risk

A low-risk major is a degree where there is a small difference between the lowest and highest salaries.

In other words, if the difference between the 10th percentile and the 90th percentile earnings of your major is small, then you can be more certain about your salary after you graduate.

In [465]:
# Spread = 90th percentile minus 10th percentile mid-career salary.
spread_column = (
    clear_df["Mid-Career 90th Percentile Salary"]
    - clear_df["Mid-Career 10th Percentile Salary"]
)
# DataFrame.insert mutates clear_df in place and raises ValueError when the
# column already exists, so guard it to keep this cell safe to re-run.
if "Spread" not in clear_df.columns:
    clear_df.insert(5, "Spread", spread_column)
clear_df.head()

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Spread,Group
0,Accounting,46000.0,77100.0,42200.0,152000.0,109800.0,Business
1,Aerospace Engineering,57700.0,101000.0,64300.0,161000.0,96700.0,STEM
2,Agriculture,42600.0,71900.0,36300.0,150000.0,113700.0,Business
3,Anthropology,36800.0,61500.0,33800.0,138000.0,104200.0,HASS
4,Architecture,41600.0,76800.0,50600.0,136000.0,85400.0,Business


#### Sorting by the Lowest Spread

To see which degrees have the smallest spread, we can use the .sort_values() method.

In [466]:
# Ascending order (the sort_values default) puts the smallest spread first.
low_risk = clear_df.sort_values(by="Spread")
low_risk.loc[:, ["Undergraduate Major", "Spread"]].head()

Unnamed: 0,Undergraduate Major,Spread
40,Nursing,50700.0
43,Physician Assistant,57600.0
41,Nutrition,65300.0
49,Spanish,65400.0
27,Health Care Administration,66400.0


In [467]:
# Degrees with the highest potential: rank by the 90th percentile
# mid-career salary, highest first, and show the top five.
potential_column = "Mid-Career 90th Percentile Salary"
largest_five = clear_df.sort_values(by=potential_column, ascending=False)
largest_five.loc[:, ["Undergraduate Major", potential_column]].head()

Unnamed: 0,Undergraduate Major,Mid-Career 90th Percentile Salary
17,Economics,210000.0
22,Finance,195000.0
8,Chemical Engineering,194000.0
37,Math,183000.0
44,Physics,178000.0


In [468]:
# Grouping: count the non-null entries of every column within each Group.
clear_df.groupby("Group").count()

Unnamed: 0_level_0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Spread
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Business,12,12,12,12,12,12
HASS,22,22,22,22,22,22
STEM,16,16,16,16,16,16


In [469]:
# Average salary per group.
# The salary columns are selected explicitly: "Undergraduate Major" is text
# and cannot be averaged. All four salary measures are included so the cell
# reproduces the four-column table recorded as its output.
salary_columns = [
    "Starting Median Salary",
    "Mid-Career Median Salary",
    "Mid-Career 10th Percentile Salary",
    "Mid-Career 90th Percentile Salary",
]
clear_df.groupby("Group")[salary_columns].mean()

Unnamed: 0_level_0,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business,44633.333333,75083.333333,43566.666667,147525.0
HASS,37186.363636,62968.181818,34145.454545,129363.636364
STEM,53862.5,90812.5,56025.0,157625.0
