# **Importing All the Necessary Packages**

---



In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

# **Scraping the Website Using BeautifulSoup**

---



In [3]:
# Scrape every page of the PayScale "majors that pay you back" table and save
# the result to salaries_by_college_major_updated.csv (one row per table row).
records = []

first_page_endpoint = "https://www.payscale.com/college-salary-report/majors-that-pay-you-back/bachelors"
next_page_endpoint = "https://www.payscale.com/college-salary-report/majors-that-pay-you-back/bachelors/page/"

# Seconds to wait for each HTTP request. requests.get() has no default
# timeout, so without this an unresponsive server would hang the cell forever.
request_timeout = 30

response = requests.get(first_page_endpoint, timeout=request_timeout)
response.raise_for_status()
website_data = response.text

soup = BeautifulSoup(website_data, "html.parser")


# ############################# Finding the Total Page Numbers ############################## #
inner_buttons = soup.find_all("div", {"class": "pagination__btn--inner"})
page_numbers = [inner_button.getText() for inner_button in inner_buttons if inner_button.getText().isnumeric()]
# Compare the labels as integers, not strings: max() on strings is
# lexicographic, so e.g. max(["1", "5", "34"]) would be "5" and the scrape
# would silently stop after page 5.
total_pages = max(int(page_number) for page_number in page_numbers)

# ############################ Getting Data From Table ####################################### #
for page_number in range(1, total_pages + 1):
    if page_number == 1:
        # The first page was already downloaded above; reuse it.
        data = website_data
    else:
        response = requests.get(next_page_endpoint + str(page_number), timeout=request_timeout)
        response.raise_for_status()
        data = response.text

    soup = BeautifulSoup(data, "html.parser")

    rows = soup.select("table.data-table tbody tr")
    for row in rows:
        cells = row.select("span.data-table__value")
        # Only cells 1, 3, 4 and 5 are used; cells 0 and 2 are skipped
        # (presumably rank and degree type -- verify against the live page).
        record = {
            "Major": cells[1].getText(),
            # Pay values look like "$92,300": drop the "$" and the thousands
            # separators before converting to float.
            "Early Career Pay": float(cells[3].getText().strip("$").replace(",", "")),
            "Mid-Career Pay": float(cells[4].getText().strip("$").replace(",", "")),
            "% High Meaning": cells[5].getText(),
        }
        records.append(record)

# Build the frame once from the list of dicts; index=False keeps the row
# index out of the CSV.
pd.DataFrame(records).to_csv("salaries_by_college_major_updated.csv", index=False)


# **Reading the CSV File Generated by the Scraper Above**

---



In [4]:
# Load the scraped salary table back from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='salaries_by_college_major_updated.csv')
df.head(n=5)

Unnamed: 0,Major,Early Career Pay,Mid-Career Pay,% High Meaning
0,Petroleum Engineering,92300.0,182000.0,69%
1,Electrical Engineering & Computer Science (EECS),101200.0,152300.0,46%
2,Applied Economics and Management,60900.0,139600.0,67%
3,Operations Research,78400.0,139600.0,52%
4,Public Accounting,60000.0,138800.0,49%


# **Data Exploration and Data Cleaning**

In [5]:
# (number of rows, number of columns) of the loaded table.
df.shape

(834, 4)

In [6]:
# Column labels of the loaded table.
df.columns

Index(['Major', 'Early Career Pay', 'Mid-Career Pay', '% High Meaning'], dtype='object')

In [7]:
# Element-wise missing-value mask: True wherever a cell is NaN.
# NOTE(review): this displays a full boolean frame, which is hard to scan;
# `df.isna().sum()` would give a per-column count of missing values instead.
df.isna()

Unnamed: 0,Major,Early Career Pay,Mid-Career Pay,% High Meaning
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
829,False,False,False,False
830,False,False,False,False
831,False,False,False,False
832,False,False,False,False


# **Minimum and Maximum Salary in Early Career**

---


In [None]:
# Lowest early-career pay: order rows from the smallest salary upwards,
# then show the major and salary of the first five.
min_early_pay = df.sort_values(by='Early Career Pay', ascending=True)
min_early_pay.loc[:, ['Major', 'Early Career Pay']].head(n=5)

Unnamed: 0,Major,Early Career Pay
707,Developmental Psychology,31000.0
571,Painting & Printmaking,32800.0
814,Voice & Opera,32900.0
829,Early Childhood Education,34100.0
828,Child & Family Studies,34100.0


In [None]:
# Highest early-career pay: order rows from the largest salary downwards,
# then show the major and salary of the first five.
max_early_pay = df.sort_values(by='Early Career Pay', ascending=False)
max_early_pay.loc[:, ['Major', 'Early Career Pay']].head(n=5)

Unnamed: 0,Major,Early Career Pay
1,Electrical Engineering & Computer Science (EECS),101200.0
74,Physician Assistant Studies,92900.0
0,Petroleum Engineering,92300.0
49,Metallurgical Engineering,79100.0
3,Operations Research,78400.0


In [None]:
# Alternative to sorting: ask pandas directly for the five rows with the
# highest early-career pay.
df.nlargest(n=5, columns='Early Career Pay')

Unnamed: 0,Major,Early Career Pay,Mid-Career Pay,% High Meaning
1,Electrical Engineering & Computer Science (EECS),101200.0,152300.0,46%
74,Physician Assistant Studies,92900.0,112200.0,86%
0,Petroleum Engineering,92300.0,182000.0,69%
49,Metallurgical Engineering,79100.0,117700.0,55%
3,Operations Research,78400.0,139600.0,52%


In [None]:
# Five rows with the lowest early-career pay.
df.nsmallest(n=5, columns='Early Career Pay')

Unnamed: 0,Major,Early Career Pay,Mid-Career Pay,% High Meaning
707,Developmental Psychology,31000.0,62000.0,-
571,Painting & Printmaking,32800.0,71100.0,41%
814,Voice & Opera,32900.0,50800.0,61%
785,Rehabilitation Services,34100.0,55700.0,79%
828,Child & Family Studies,34100.0,43600.0,74%


# **Maximum and Minimum Salary in Mid Career**

---



In [None]:
# Five rows with the highest mid-career pay.
df.nlargest(n=5, columns='Mid-Career Pay')

Unnamed: 0,Major,Early Career Pay,Mid-Career Pay,% High Meaning
0,Petroleum Engineering,92300.0,182000.0,69%
1,Electrical Engineering & Computer Science (EECS),101200.0,152300.0,46%
2,Applied Economics and Management,60900.0,139600.0,67%
3,Operations Research,78400.0,139600.0,52%
4,Public Accounting,60000.0,138800.0,49%


In [None]:
# Five rows with the lowest mid-career pay.
df.nsmallest(n=5, columns='Mid-Career Pay')

Unnamed: 0,Major,Early Career Pay,Mid-Career Pay,% High Meaning
833,Metalsmithing,38300.0,38400.0,32%
832,Addictions Counseling,38800.0,42200.0,-
831,Medical Assisting,35100.0,42300.0,-
830,Mental Health,35200.0,42500.0,-
829,Early Childhood Education,34100.0,43300.0,78%
