### Data collecting: Web scraping using Selenium

In this part of the project, I scrape job-listing data from Glassdoor.com.

As a starting point, I used a Jupyter notebook from GitHub (https://github.com/arapfaik/scraping-glassdoor-selenium/blob/master/glassdoor%20scraping.ipynb). I changed several parts of it and rewrote all the XPaths, because the HTML of the Glassdoor web page has changed a lot since the original code was written.

**You can see at the end of this code what the desired dataframe will look like**

In [1]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import time
import pandas as pd

In [12]:
# Upper bound on how many times the job-detail pane is re-read before a
# listing is skipped; without a bound a missing element would loop forever.
_MAX_DETAIL_ATTEMPTS = 5


def _dismiss_signup_prompt(driver):
    '''Click the X of the "Sign Up" modal if it is present (best effort).'''
    try:
        driver.find_element(By.XPATH, '//span[@class="SVGInline modal_closeIcon"]').click()
    except (NoSuchElementException, ElementClickInterceptedException):
        pass


def _text_or_default(driver, xpath, default=-1):
    '''Return the text of the first element matching xpath, or default if none.'''
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


def _parse_founded(value):
    '''Convert the "Founded" overview value to int, or -1 if it is not numeric.'''
    try:
        return int(value)
    except (TypeError, ValueError):
        return -1


def get_jobs_dataframe(keyword, num_jobs, verbose, path, slp_time):
    '''Scraping "Glassdoor" to gather jobs as a dataframe

    Parameters:
        keyword: search term; it is percent-encoded before being put in the URL.
        num_jobs: number of job postings to collect before stopping.
        verbose: when true, print every collected field for each posting.
        path: location of the chromedriver executable handed to webdriver.Chrome.
        slp_time: seconds to sleep after the first page load and at the start
            of every result page.

    Returns:
        A pandas DataFrame with one row per posting and the columns
        "Job Title", "Salary Estimate", "Job Description", "Rating",
        "Company Name", "Location", "Size", "Founded", "Type of ownership",
        "Industry", "Sector" and "Revenue". A value that could not be found
        is stored as -1. Fewer than num_jobs rows are returned when no
        "next page" button can be clicked.

    The browser is always closed before returning, also when an error occurs.
    '''
    # Local imports: these names are not part of the file's import cell.
    from urllib.parse import quote
    from selenium.common.exceptions import WebDriverException

    #Initializing the webdriver
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(executable_path=path, options=options)

    jobs = []
    j = 0

    try:
        driver.set_window_size(1000, 1000)
        # Encode the keyword so spaces, "&", "#", ... cannot corrupt the query string.
        encoded_keyword = quote(keyword, safe='')
        url = "https://www.glassdoor.com/Job/jobs.htm?context=Jobs&suggestCount=0&suggestChosen=false&clickSource=searchBox&typedKeyword=" + encoded_keyword + "&sc.keyword=" + encoded_keyword
        driver.get(url)

        time.sleep(slp_time)

        # Accept the cookie banner when it is shown.
        cookie_buttons = driver.find_elements(By.ID, 'onetrust-accept-btn-handler')
        if cookie_buttons:
            print("element is present")
            cookie_buttons[0].click()
        else:
            print("element is not present")

        # implicitly_wait is a session-wide lookup timeout, not a pause, so it
        # is configured once instead of being re-issued around every action.
        driver.implicitly_wait(5)

        while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.

            #Wait until the webpage is loaded
            time.sleep(slp_time)

            #Test for the "Sign Up" prompt and get rid of it.
            _dismiss_signup_prompt(driver)

            #Going through each job in this page
            job_buttons = driver.find_elements(By.XPATH, '//li[@data-test="jobListing"]')

            print("looking for job list")

            for job_button in job_buttons:
                general_info_dic = {'Company name':-1, 'Location':-1, 'Job title':-1, 'Job description':-1, 'Salary estimate':-1, 'Rating':-1}
                print("Progress: {}/{}".format(len(jobs), num_jobs))

                if len(jobs) >= num_jobs:
                    break

                print(job_button.text)

                # Hover the listing card and click it to open its detail pane.
                ActionChains(driver).move_to_element(job_button).click().perform()

                _dismiss_signup_prompt(driver)

                j += 1
                print(j)

                # The detail pane may not be rendered yet: retry a bounded
                # number of times, then give up on this listing.
                details_loaded = False
                for _ in range(_MAX_DETAIL_ATTEMPTS):
                    try:
                        general_info_dic['Company name'] = driver.find_element(By.XPATH, '//div[@class="css-87uc0g e1tk4kwz1"]').text.split("\n")[0]
                        general_info_dic['Location'] = driver.find_element(By.XPATH, '//div[@class="css-56kyx5 e1tk4kwz5"]').text
                        general_info_dic['Job title'] = driver.find_element(By.XPATH, '//div[@class="css-1vg6q84 e1tk4kwz4"]').text
                        general_info_dic['Job description'] = driver.find_element(By.XPATH, '//div[@class="jobDescriptionContent desc"]').text
                        details_loaded = True
                        break
                    except WebDriverException:
                        time.sleep(5)
                if not details_loaded:
                    print("Could not read the job details after {} attempts, skipping this posting".format(_MAX_DETAIL_ATTEMPTS))
                    continue

                # Salary and rating are optional; -1 is the "not found" value.
                # NOTE(review): the sample run stored with this notebook shows
                # salary values that differ from the listing card; presumably
                # this XPath can match an element that has not refreshed yet.
                # Verify against the live page.
                general_info_dic['Salary estimate'] = _text_or_default(driver, '//span[@class="css-56kyx5 css-16kxj2j e1wijj242"]')
                general_info_dic['Rating'] = _text_or_default(driver, '//span[@class="css-1m5m32b e1tk4kwz2"]')

                #Printing for debugging
                if verbose:
                    print("Job Title: {}".format(general_info_dic['Job title']))
                    print("Salary Estimate: {}".format(general_info_dic['Salary estimate']))
                    print("Job Description: {}".format(general_info_dic['Job description'][:600]))
                    print("Rating: {}".format(general_info_dic['Rating']))
                    print("Company Name: {}".format(general_info_dic['Company name']))
                    print("Location: {}".format(general_info_dic['Location']))

                company_overview_dic = {'Size':-1, 'Founded':-1, 'Type':-1, 'Industry':-1, 'Sector':-1, 'Revenue':-1}
                #Going to the Company tab
                try:
                    driver.find_element(By.XPATH, '//div[@data-item="tab" and @data-tab-type="overview"]').click()
                except (NoSuchElementException, ElementClickInterceptedException):  #Rarely, some job postings do not have the "Company" tab.
                    print("some job postings do not have the Company tab")
                else:
                    try:
                        provided_titles = driver.find_elements(By.XPATH, '//span[@class="css-1taruhi e1pvx6aw1"]')
                        provided_contents = driver.find_elements(By.XPATH, '//span[@class="css-i9gxme e1pvx6aw2"]')
                        # zip stops at the shorter list, so a title without a
                        # matching content element cannot raise IndexError.
                        for title_element, content_element in zip(provided_titles, provided_contents):
                            company_overview_dic[str(title_element.text)] = content_element.text
                    except WebDriverException:
                        print("could not read the company overview")

                Size = company_overview_dic['Size']
                Founded = _parse_founded(company_overview_dic['Founded'])
                Type = company_overview_dic['Type']
                Industry = company_overview_dic['Industry']
                Sector = company_overview_dic['Sector']
                Revenue = company_overview_dic['Revenue']

                if verbose:
                    print("Size: {}".format(Size))
                    print("Founded: {}".format(Founded))
                    print("Type: {}".format(Type))
                    print("Industry: {}".format(Industry))
                    print("Sector: {}".format(Sector))
                    print("Revenue: {}".format(Revenue))
                    print("//////////////////////////////////////////////")

                jobs.append({"Job Title" : general_info_dic['Job title'],
                "Salary Estimate" : general_info_dic['Salary estimate'],
                "Job Description" : general_info_dic['Job description'],
                "Rating" : general_info_dic['Rating'],
                "Company Name" : general_info_dic['Company name'],
                "Location" : general_info_dic['Location'],
                "Size" : Size,
                "Founded" : Founded,
                "Type of ownership" : Type,
                "Industry" : Industry,
                "Sector" : Sector,
                "Revenue" : Revenue})

                #Test for the "Sign Up" prompt and get rid of it.
                _dismiss_signup_prompt(driver)

            # Target reached: no need to load another result page.
            if len(jobs) >= num_jobs:
                break

            #Clicking on the "next page" button
            try:
                driver.find_element(By.XPATH, '//li[@class="css-1yshuyv e1gri00l3"]').click()
            except (NoSuchElementException, ElementClickInterceptedException):
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break
            time.sleep(3)
            #Test for the "Sign Up" prompt and get rid of it.
            _dismiss_signup_prompt(driver)
    finally:
        # Always release the browser process, also when scraping fails midway.
        driver.quit()

    return pd.DataFrame(jobs)  #This line converts the list of dictionaries into a pandas DataFrame.

In [13]:
# Raw string: the backslashes of the Windows path are taken literally instead
# of being parsed as (invalid) escape sequences such as "\P" and "\c".
path = r"C:\Program Files (x86)\chromedriver"
# Collect 100 "Data Scientist" postings with verbose output and a 10 second page wait.
df = get_jobs_dataframe("Data Scientist", 100, True, path, 10)

element is present
looking for job list
Progress: 0/100
3.5
GEICO
Principal Data Scientist
Chevy Chase, MD
$104K - $170K (Glassdoor est.)
Actively Hiring
6d
1
Job Title: Principal Data Scientist
Salary Estimate: $104K - $170K (Glassdoor est.)
Job Description: Working out of our Chevy Chase, MD/Washington DC office, GEICO's Data Science team uses predictive analytics and innovative machine learning models to create value from data. We solve problems across GEICO, from Marketing to Claims and Underwriting, and are responsible for developing and driving strategic modeling initiatives. We see our projects through the entire data science lifecycle, from problem definition to data exploration, data munging, modeling, analysis, and deployment into production systems. We maintain a close partnership with IT to ensure that our models can be deployed quickly
Rating: 3.5
Company Name: GEICO
Location: Chevy Chase, MD
Size: 10000+ Employees
Founded: 1936
Type: Subsidiary or Business Segment
Industr

Progress: 8/100
4.2
ManTech International Corporation
Data Scientist
Alexandria, VA
$64K - $106K (Glassdoor est.)
Hot
25d
9
Job Title: Data Scientist
Salary Estimate: $64K - $106K (Glassdoor est.)
Job Description: Secure our Nation, Ignite your Future
Overview
Each day U.S. Customs and Border Protection (CBP) oversees the massive flow of people, capital, and products that enter and depart the United States via air, land, sea, and cyberspace. The volume and complexity of both physical and virtual border crossings require the application of solutions to promote efficient trade and travel. Further, effective solutions help CBP ensure the movement of people, capital, and products is legal, safe, and secure.
As a Data Scientist on our team, a proven track record of delivering production ready decision suppor
Rating: 4.2
Company Name: ManTech International Corporation
Location: Alexandria, VA
Size: 5001 to 10000 Employees
Founded: 1968
Type: Company - Public
Industry: Research & Development


Size: 10000+ Employees
Founded: 1970
Type: Nonprofit Organization
Industry: Health Care Services & Hospitals
Sector: Health Care
Revenue: $500 million to $1 billion (USD)
//////////////////////////////////////////////
Progress: 16/100
3.6
Alteryx, Inc.
Data Engineer
Ann Arbor, MI
$53K - $102K (Glassdoor est.)
24h
17
Job Title: Data Engineer
Salary Estimate: $53K - $102K (Glassdoor est.)
Job Description: We’re looking for problem solvers, innovators, and dreamers who are searching for anything but business as usual. Like us, you’re a high performer who’s an expert at your craft, constantly challenging the status quo. You value inclusivity and want to join a culture that empowers you to show up as your authentic self. You know that success hinges on commitment, that our differences make us stronger, and that the finish line is always sweeter when the whole team crosses together.
As a Data Engineer on the Alteryx Data Science team, you will be part of an innovative and groundbreaking team

Size: 51 to 200 Employees
Founded: 1989
Type: Company - Private
Industry: Biotech & Pharmaceuticals
Sector: Biotech & Pharmaceuticals
Revenue: $25 to $50 million (USD)
//////////////////////////////////////////////
Progress: 24/100
3.9
Johns Hopkins Health System
Sr. Data Scientist-Precision Medicine
Baltimore, MD
$95K - $154K (Glassdoor est.)
Hiring Surge
21d
25
Job Title: Sr. Data Scientist-Precision Medicine
Salary Estimate: $95K - $154K (Glassdoor est.)
Job Description: Sr. Data Scientist-Precision Medicine
Requisition #: 333944
Location: Johns Hopkins Health System, Baltimore, MD 21201
Category: Information Technology
Work Shift: Day Shift
Work Week: Full Time (40 or 36 hours)
Weekend Work Required: No
Date Posted: March 23, 2021
Sr Data Scientist - Precision Medicine
Requisition ID 333944
Work Location: This position will be remote and have the option to transition to Johns Hopkins locations around Baltimore, MD
Rating: 3.9
Company Name: Johns Hopkins Health System
Location: Balt

Size: 5001 to 10000 Employees
Founded: 1992
Type: Company - Public
Industry: Lending
Sector: Finance
Revenue: $5 to $10 billion (USD)
//////////////////////////////////////////////
Progress: 32/100
4.1
JLG
Data Scientist Principal
Hagerstown, MD
$95K - $152K (Glassdoor est.)
Actively Hiring
5d
33
Job Title: Data Scientist Principal
Salary Estimate: $109K - $175K (Glassdoor est.)
Job Description: About JLG, an Oshkosh company
JLG began in 1969, when our founder, John L. Grove set out to resolve growing safety concerns in the construction industry. Since then we have been committed to understanding the challenges and delivering innovative solutions to the access market. We partner with customers to provide quality equipment, training opportunities and trusted support within the access industry. We are a global company, and our products—including mobile elevating work platforms, telehandlers, utility vehicles and accessories—can be found all over the world.
JOB SUMMARY:
The Data Scientis


some job postings do not have the Company tab
Size: -1
Founded: -1
Type: -1
Industry: -1
Sector: -1
Revenue: -1
//////////////////////////////////////////////
Progress: 40/100
NEMO'S BAKERY
(FOOD SCIENTIST) Sr. Product Development Technologist
California
Easy Apply
14d
41
Job Title: (FOOD SCIENTIST) Sr. Product Development Technologist
Salary Estimate: $109K - $175K (Glassdoor est.)
Job Description: Horizon Food Groupis a strategic bakery and sweet snack company selling to grocery, convenience store, mass merchant and food service customers throughout the U.S. A leader in the single serve segment products.
Title: Sr. Product Development Technologist
Reports to: Director of Marketing Services
Direct Reports: None
Location: Escondido, California
SalaryRange: Based on Candidate Skills, Experience and Education
Rating: -1
Company Name: NEMO'S BAKERY
Location: California
some job postings do not have the Company tab
Size: -1
Founded: -1
Type: -1
Industry: -1
Sector: -1
Revenue: -1
/////////

Progress: 48/100
Rockland Immunochemicals, Inc.
Associate Scientist (Production & Purification)
Limerick, PA
Easy Apply
7d
49
Job Title: Associate Scientist (Production & Purification)
Salary Estimate: $109K - $175K (Glassdoor est.)
Job Description: Job Summary:
The position plays a critical role in antibody development and supports the Production Department. Important aspects of this role include excellent organizational, multi-tasking, leadership and communication skills, and the ability to thrive in an efficient, highly interactive, and goal-oriented team environment. In addition to laboratory skills, strong project planning and management skills will be developed in this position.
Duties and Responsibilities will include:
Purify antibodies to support in-house and custom projects
Perform laboratory experiments independently or with oth
Rating: -1
Company Name: Rockland Immunochemicals, Inc.
Location: Limerick, PA
some job postings do not have the Company tab
Size: -1
Founded: -1
Typ

Progress: 56/100
4.4
Applied Information Sciences
Machine Learning Engineer
Reston, VA
$69K - $121K (Glassdoor est.)
10d
57
Job Title: Machine Learning Engineer
Salary Estimate: $109K - $175K (Glassdoor est.)
Job Description: Intro (Use Font Arial 12):
As a Machine Learning Engineer, you will be a part of the team responsible for making it easy to deploy, manage, and monitor machine learning models at scale in production. You will join a team that developes the infrastructure to facilitate model orchestration. Collaborate with both data scientists and platform engineers to create intuitive and resilient solutions. You will also be expected to follow the best practices of Agile software development, including high coding quality with testing coverage, and continuous integration and deployment of your solutions.
What
Rating: 4.4
Company Name: Applied Information Sciences
Location: Reston, VA
Size: 501 to 1000 Employees
Founded: 1982
Type: Company - Private
Industry: IT Services
Sector: I

Size: 51 to 200 Employees
Founded: -1
Type: Company - Public
Industry: -1
Sector: -1
Revenue: Less than $1 million (USD)
//////////////////////////////////////////////
Progress: 64/100
Avisyn Pharma
MSc/BSc Synthetic chemistry pharmaceutical research scientists, Groton, CT
Groton, CT
Viewed on 6 avril
Hot
Easy Apply
27d
65
Job Title: MSc/BSc Synthetic chemistry pharmaceutical research scientists, Groton, CT
Salary Estimate: $76K - $127K (Glassdoor est.)
Job Description: SYNTHETIC CHEMIST RESEARCH POSITIONS: Avisyn seeks PhD or highly experienced MSc-level synthetic chemists with experience in heterocycle, organometallic, and related pharmaceutical organic chemistry synthesis.
Required qualifications
· MSc or BSc with experience in synthetic chemistry.
· Experience in traditional and contemporary organic synthesis techniques.
· Familiarity with a wide range of techniques and reaction types such as low temperature reactions, inert atmosphere, metal-catalyzed cross coupling, hydrogenation

some job postings do not have the Company tab
Size: -1
Founded: -1
Type: -1
Industry: -1
Sector: -1
Revenue: -1
//////////////////////////////////////////////
Progress: 72/100
3.9
Lexmark International, Inc.
Data Engineer
Lexington-Fayette, KY
11d
73
Job Title: Data Engineer
Salary Estimate: $76K - $127K (Glassdoor est.)
Job Description: The Data Engineer will work with apart of the Global Analytics Team within Global Professional Services. This role will work with a variety of Lexmark teams including Service Operations, Engineering, R&D, and IT. This role will work directly with Lexmark’s customers as needed to develop new capabilities for Lexmark’s IoT initiative. Dive into technical project work to help Predictive Service Management increase customer up time and reduce Lexmark costs. Leverage and create cutting-edge predictive models and supporting technologies to build better services and support for Lexmark print and IoT 
Rating: 3.9
Company Name: Lexmark International, Inc.
Locat

Size: 51 to 200 Employees
Founded: 1995
Type: Company - Private
Industry: Consulting
Sector: Business Services
Revenue: $5 to $10 million (USD)
//////////////////////////////////////////////
Progress: 80/100
3.2
Pro-Sphere Tek
Data Scientist
Washington, DC
$60K - $104K (Glassdoor est.)
Actively Hiring
4d
81
Job Title: Data Scientist
Salary Estimate: $76K - $127K (Glassdoor est.)
Job Description: Overview


ProSphere is seeking a Data Scientist to provide services in development of new and enhancement of existing capabilities for information technology systems and applications for the Department of Veterans Affairs. The Data Scientist will lead efforts to expand the data science capabilities of customer’s analytics platform. The individual in this position will have a solid background in production-level machine learning systems with broad exposure to a wide variety of algorithmic techniques. Experience and demonstrable proof of impactful work in areas such as natural language processi


Size: 10000+ Employees
Founded: 1946
Type: Company - Private
Industry: Investment Banking & Asset Management
Sector: Finance
Revenue: $10+ billion (USD)
//////////////////////////////////////////////
Progress: 88/100
4.2
Vizient, Inc.
Health Care Data Scientist
Chicago, IL
$94K - $155K (Glassdoor est.)
3d
89
Job Title: Health Care Data Scientist
Salary Estimate: $76K - $127K (Glassdoor est.)
Job Description: When you’re the best, we’re the best. We instill an environment where employees feel engaged, satisfied and able to contribute their unique skills and talents. We provide extensive opportunities for personal and professional development, building both employee competence and organizational capability to fuel exceptional performance now and in the future.
Summary:
In this role, you will conduct sophisticated data analysis to help address critical business and member questions, the outputs of which enable members to drive improvements in clinical, operational, and economic outcomes.


Size: 10000+ Employees
Founded: 1929
Type: Nonprofit Organization
Industry: Research & Development
Sector: Business Services
Revenue: $5 to $10 billion (USD)
//////////////////////////////////////////////
Progress: 96/100
3.5
Freedom Financial Network
Director of Data Analytics
Tempe, AZ
$46K - $94K (Glassdoor est.)
Actively Hiring
6d
97
Job Title: Director of Data Analytics
Salary Estimate: $61K - $108K (Glassdoor est.)
Job Description: WHO WE ARE:
Freedom Financial Network is a family of companies that takes a people-first approach to financial services, using technology to empower consumers to overcome debt and create a brighter financial future. The company was founded in 2002 by Brad Stroh and Andrew Housser on the belief that by staying committed to helping people, you can ensure better financial outcomes for both the customer and the business. This Heart + $ philosophy still guides the vision of our growing company, which has helped millions of people find solutions for their fi

In [14]:
# Display the scraped DataFrame as the cell output.
df

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Principal Data Scientist,$104K - $170K (Glassdoor est.),"Working out of our Chevy Chase, MD/Washington ...",3.5,GEICO,"Chevy Chase, MD",10000+ Employees,1936,Subsidiary or Business Segment,Insurance Carriers,Insurance,$10+ billion (USD)
1,Staff Data Scientist,$101K - $161K (Glassdoor est.),Alteryx is searching for a Staff Data Scientis...,3.6,"Alteryx, Inc.","Boston, MA",1001 to 5000 Employees,1997,Company - Public,Enterprise Software & Network Solutions,Information Technology,$100 to $500 million (USD)
2,R&D Scientist,$38K - $68K (Glassdoor est.),"LGC, Biosearch Technologies is seeking a detai...",3.5,LGC Limited,"Middleton, WI",1001 to 5000 Employees,1842,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable
3,Solid Form Scientist,$45K - $98K (Glassdoor est.),The ideal Solid Form Scientist candidate will ...,3.5,LGC Limited,"Petaluma, CA",1001 to 5000 Employees,1842,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable
4,"Scientist I, Analytical Development",-1,DESCRIPTION\nThe scientist I will be responsib...,-1,Matica Biotechnology,"College Station, TX",-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,Data Scientist III,$61K - $108K (Glassdoor est.),Battelle delivers when others can’t. We conduc...,3.7,Battelle,"Columbus, OH",10000+ Employees,1929,Nonprofit Organization,Research & Development,Business Services,$5 to $10 billion (USD)
96,Director of Data Analytics,$61K - $108K (Glassdoor est.),WHO WE ARE:\nFreedom Financial Network is a fa...,3.5,Freedom Financial Network,"Tempe, AZ",1001 to 5000 Employees,2002,Company - Private,Investment Banking & Asset Management,Finance,$500 million to $1 billion (USD)
97,Data Scientist,$61K - $108K (Glassdoor est.),Financial Data Scientist\nWe are looking for a...,-1,Prominent Global Solutions,"Fort Meade, MD",-1,-1,-1,-1,-1,-1
98,Data Scientist,$61K - $108K (Glassdoor est.),Business Group Highlights\n\n\nIntelligence\nT...,3.6,Perspecta,"McLean, VA",10000+ Employees,2018,Company - Public,Aerospace & Defense,Aerospace & Defense,Unknown / Non-Applicable


In [18]:
# Save the scraped postings to CSV; index=False leaves the DataFrame index out of the file.
df.to_csv("glassdoor_job_offers.csv", index=False)

In [19]:
# Read the saved CSV back into a new DataFrame.
data = pd.read_csv("glassdoor_job_offers.csv")

In [20]:
# Show the first rows of the reloaded data.
data.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Principal Data Scientist,$104K - $170K (Glassdoor est.),"Working out of our Chevy Chase, MD/Washington ...",3.5,GEICO,"Chevy Chase, MD",10000+ Employees,1936,Subsidiary or Business Segment,Insurance Carriers,Insurance,$10+ billion (USD)
1,Staff Data Scientist,$101K - $161K (Glassdoor est.),Alteryx is searching for a Staff Data Scientis...,3.6,"Alteryx, Inc.","Boston, MA",1001 to 5000 Employees,1997,Company - Public,Enterprise Software & Network Solutions,Information Technology,$100 to $500 million (USD)
2,R&D Scientist,$38K - $68K (Glassdoor est.),"LGC, Biosearch Technologies is seeking a detai...",3.5,LGC Limited,"Middleton, WI",1001 to 5000 Employees,1842,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable
3,Solid Form Scientist,$45K - $98K (Glassdoor est.),The ideal Solid Form Scientist candidate will ...,3.5,LGC Limited,"Petaluma, CA",1001 to 5000 Employees,1842,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable
4,"Scientist I, Analytical Development",-1,DESCRIPTION\nThe scientist I will be responsib...,-1.0,Matica Biotechnology,"College Station, TX",-1,-1,-1,-1,-1,-1
