### Data collecting: Web scraping using Selenium

In this part of the project, I scrape job data from Glassdoor.com.

I started from a Jupyter notebook on GitHub ('https://github.com/arapfaik/scraping-glassdoor-selenium/blob/master/glassdoor%20scraping.ipynb'), changed several parts of it, and updated all the XPaths, because the HTML of the Glassdoor web page has changed a lot since the original code was written.

**You can see at the end of this code what the desired dataframe will look like**

In [1]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import time
import pandas as pd

In [12]:
def _dismiss_signup_prompt(driver, xpath='//span[@class="SVGInline modal_closeIcon"]'):
    '''Click the X of the "Sign Up" modal if it is present; otherwise do nothing.'''
    try:
        driver.find_element(By.XPATH, xpath).click()
    except (NoSuchElementException, ElementClickInterceptedException):
        # No modal on screen (or something else is covering it): best effort only.
        pass


def _text_or_default(driver, xpath, default=-1):
    '''Return the text of the first element matching xpath, or default if none exists.'''
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


def _parse_founded_year(value, default=-1):
    '''Convert the scraped "Founded" value to int, falling back to default if it is not numeric.'''
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def get_jobs_dataframe(keyword, num_jobs, verbose, path, slp_time, max_retries=5):
    '''Scrape "Glassdoor" job listings for a keyword and return them as a dataframe.

    Parameters:
        keyword: search term typed into the Glassdoor job search.
        num_jobs: number of job postings to collect before stopping.
        verbose: if true, print every scraped field for debugging.
        path: path to the chromedriver executable.
        slp_time: seconds to sleep while waiting for a page to load.
        max_retries: how many times to retry reading the job details panel
            (5 seconds apart) before giving up on that posting.

    Returns:
        pandas.DataFrame with one row per job and the columns "Job Title",
        "Salary Estimate", "Job Description", "Rating", "Company Name",
        "Location", "Size", "Founded", "Type of ownership", "Industry",
        "Sector" and "Revenue". Any value that could not be found is -1.
    '''
    # Local imports: these names are not part of the file-level import block.
    from urllib.parse import quote_plus
    from selenium.common.exceptions import WebDriverException

    #Initializing the webdriver
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(executable_path=path, options=options)

    jobs = []
    j = 0
    try:
        driver.set_window_size(1000, 1000)
        # Encode the keyword so spaces, "&", "#", ... cannot break the query string.
        query = quote_plus(keyword)
        url = "https://www.glassdoor.com/Job/jobs.htm?context=Jobs&suggestCount=0&suggestChosen=false&clickSource=searchBox&typedKeyword=" + query + "&sc.keyword=" + query
        driver.get(url)

        time.sleep(slp_time)

        # Accept the cookie banner when it is shown.
        cookie_buttons = driver.find_elements(By.ID, 'onetrust-accept-btn-handler')
        if cookie_buttons:
            print("element is present")
            cookie_buttons[0].click()
        else:
            print("element is not present")

        # The implicit wait is a driver setting (not a sleep), so set it once.
        driver.implicitly_wait(5)  # seconds

        while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.

            #Wait until the webpage is loaded
            time.sleep(slp_time)

            #Test for the "Sign Up" prompt and get rid of it.
            _dismiss_signup_prompt(driver)

            #Going through each job in this page
            job_buttons = driver.find_elements(By.XPATH, '//li[@data-test="jobListing"]')

            print("looking for job list")

            for job_button in job_buttons:
                print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))

                if len(jobs) >= num_jobs:
                    break

                print(job_button.text)

                # Scroll the listing into view and click it to open its details panel.
                ActionChains(driver).move_to_element(job_button).click().perform()

                _dismiss_signup_prompt(driver)

                j += 1
                print(j)

                general_info_dic = {'Company name':-1, 'Location':-1, 'Job title':-1, 'Job description':-1, 'Salary estimate':-1, 'Rating':-1}

                # The details panel loads asynchronously: retry a bounded number of
                # times instead of looping forever when a selector never matches.
                for _ in range(max_retries):
                    try:
                        general_info_dic['Company name'] = driver.find_element(By.XPATH, '//div[@class="css-87uc0g e1tk4kwz1"]').text.split("\n")[0]
                        general_info_dic['Location'] = driver.find_element(By.XPATH, '//div[@class="css-56kyx5 e1tk4kwz5"]').text
                        general_info_dic['Job title'] = driver.find_element(By.XPATH, '//div[@class="css-1vg6q84 e1tk4kwz4"]').text
                        general_info_dic['Job description'] = driver.find_element(By.XPATH, '//div[@class="jobDescriptionContent desc"]').text
                        break
                    except WebDriverException:
                        time.sleep(5)
                else:
                    print("Could not read the job details after {} attempts; keeping -1 for the missing fields.".format(max_retries))

                # Salary and rating are optional on a posting; -1 marks "not found".
                general_info_dic['Salary estimate'] = _text_or_default(driver, '//span[@class="css-56kyx5 css-16kxj2j e1wijj242"]')
                general_info_dic['Rating'] = _text_or_default(driver, '//span[@class="css-1m5m32b e1tk4kwz2"]')

                #Printing for debugging
                if verbose:
                    print("Job Title: {}".format(general_info_dic['Job title']))
                    print("Salary Estimate: {}".format(general_info_dic['Salary estimate']))
                    # str() because the description is still the int -1 when it was not found.
                    print("Job Description: {}".format(str(general_info_dic['Job description'])[:600]))
                    print("Rating: {}".format(general_info_dic['Rating']))
                    print("Company Name: {}".format(general_info_dic['Company name']))
                    print("Location: {}".format(general_info_dic['Location']))

                company_overview_dic = {'Size':-1, 'Founded':-1, 'Type':-1, 'Industry':-1, 'Sector':-1, 'Revenue':-1}
                #Going to the Company tab
                try:
                    driver.find_element(By.XPATH, '//div[@data-item="tab" and @data-tab-type="overview"]').click()
                except NoSuchElementException:  #Rarely, some job postings do not have the "Company" tab.
                    print("some job postings do not have the Company tab")
                except ElementClickInterceptedException:
                    print("the Company tab could not be clicked; skipping the company overview")
                else:
                    provided_titles = driver.find_elements(By.XPATH, '//span[@class="css-1taruhi e1pvx6aw1"]')
                    provided_contents = driver.find_elements(By.XPATH, '//span[@class="css-i9gxme e1pvx6aw2"]')
                    # zip stops at the shorter list, so a title without content cannot raise IndexError.
                    for title_element, content_element in zip(provided_titles, provided_contents):
                        company_overview_dic[str(title_element.text)] = content_element.text

                Size = company_overview_dic['Size']
                Founded = _parse_founded_year(company_overview_dic['Founded'])
                Type = company_overview_dic['Type']
                Industry = company_overview_dic['Industry']
                Sector = company_overview_dic['Sector']
                Revenue = company_overview_dic['Revenue']

                if verbose:
                    print("Size: {}".format(Size))
                    print("Founded: {}".format(Founded))
                    print("Type: {}".format(Type))
                    print("Industry: {}".format(Industry))
                    print("Sector: {}".format(Sector))
                    print("Revenue: {}".format(Revenue))
                    print("//////////////////////////////////////////////")

                jobs.append({"Job Title" : general_info_dic['Job title'],
                "Salary Estimate" : general_info_dic['Salary estimate'],
                "Job Description" : general_info_dic['Job description'],
                "Rating" : general_info_dic['Rating'],
                "Company Name" : general_info_dic['Company name'],
                "Location" : general_info_dic['Location'],
                "Size" : Size,
                "Founded" : Founded,
                "Type of ownership" : Type,
                "Industry" : Industry,
                "Sector" : Sector,
                "Revenue" : Revenue})

                #Test for the "Sign Up" prompt and get rid of it.
                _dismiss_signup_prompt(driver)

            # Target reached: skip the extra page load and the misleading
            # "terminated early" message when there is no further page.
            if len(jobs) >= num_jobs:
                break

            #Clicking on the "next page" button
            try:
                driver.find_element(By.XPATH, '//li[@class="css-1yshuyv e1gri00l3"]').click()
            except NoSuchElementException:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break
            time.sleep(3)
            #Test for the "Sign Up" prompt and get rid of it.
            _dismiss_signup_prompt(driver)
    finally:
        # Always shut the browser down, even when scraping fails part-way.
        driver.quit()

    return pd.DataFrame(jobs)  #This line converts the list of job dictionaries into a pandas DataFrame.

In [None]:
# Raw string: the backslashes of the Windows path must not be read as escape
# sequences ("\P" and "\c" are invalid escapes and trigger a warning).
path = r"C:\Program Files (x86)\chromedriver"
# Scrape 100 "Data Scientist" postings, verbosely, sleeping 10 s per page load.
df = get_jobs_dataframe("Data Scientist", 100, True, path, 10)

In [14]:
# Display the scraped DataFrame as the cell output.
df

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Principal Data Scientist,$104K - $170K (Glassdoor est.),"Working out of our Chevy Chase, MD/Washington ...",3.5,GEICO,"Chevy Chase, MD",10000+ Employees,1936,Subsidiary or Business Segment,Insurance Carriers,Insurance,$10+ billion (USD)
1,Staff Data Scientist,$101K - $161K (Glassdoor est.),Alteryx is searching for a Staff Data Scientis...,3.6,"Alteryx, Inc.","Boston, MA",1001 to 5000 Employees,1997,Company - Public,Enterprise Software & Network Solutions,Information Technology,$100 to $500 million (USD)
2,R&D Scientist,$38K - $68K (Glassdoor est.),"LGC, Biosearch Technologies is seeking a detai...",3.5,LGC Limited,"Middleton, WI",1001 to 5000 Employees,1842,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable
3,Solid Form Scientist,$45K - $98K (Glassdoor est.),The ideal Solid Form Scientist candidate will ...,3.5,LGC Limited,"Petaluma, CA",1001 to 5000 Employees,1842,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable
4,"Scientist I, Analytical Development",-1,DESCRIPTION\nThe scientist I will be responsib...,-1,Matica Biotechnology,"College Station, TX",-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,Data Scientist III,$61K - $108K (Glassdoor est.),Battelle delivers when others can’t. We conduc...,3.7,Battelle,"Columbus, OH",10000+ Employees,1929,Nonprofit Organization,Research & Development,Business Services,$5 to $10 billion (USD)
96,Director of Data Analytics,$61K - $108K (Glassdoor est.),WHO WE ARE:\nFreedom Financial Network is a fa...,3.5,Freedom Financial Network,"Tempe, AZ",1001 to 5000 Employees,2002,Company - Private,Investment Banking & Asset Management,Finance,$500 million to $1 billion (USD)
97,Data Scientist,$61K - $108K (Glassdoor est.),Financial Data Scientist\nWe are looking for a...,-1,Prominent Global Solutions,"Fort Meade, MD",-1,-1,-1,-1,-1,-1
98,Data Scientist,$61K - $108K (Glassdoor est.),Business Group Highlights\n\n\nIntelligence\nT...,3.6,Perspecta,"McLean, VA",10000+ Employees,2018,Company - Public,Aerospace & Defense,Aerospace & Defense,Unknown / Non-Applicable


In [18]:
# Save the scraped jobs to CSV; index=False keeps the DataFrame index out of the file.
df.to_csv("glassdoor_job_offers.csv", index=False)

In [19]:
# Read the saved CSV back into a new DataFrame.
data = pd.read_csv("glassdoor_job_offers.csv")

In [20]:
# Show the first rows of the reloaded data.
data.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Principal Data Scientist,$104K - $170K (Glassdoor est.),"Working out of our Chevy Chase, MD/Washington ...",3.5,GEICO,"Chevy Chase, MD",10000+ Employees,1936,Subsidiary or Business Segment,Insurance Carriers,Insurance,$10+ billion (USD)
1,Staff Data Scientist,$101K - $161K (Glassdoor est.),Alteryx is searching for a Staff Data Scientis...,3.6,"Alteryx, Inc.","Boston, MA",1001 to 5000 Employees,1997,Company - Public,Enterprise Software & Network Solutions,Information Technology,$100 to $500 million (USD)
2,R&D Scientist,$38K - $68K (Glassdoor est.),"LGC, Biosearch Technologies is seeking a detai...",3.5,LGC Limited,"Middleton, WI",1001 to 5000 Employees,1842,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable
3,Solid Form Scientist,$45K - $98K (Glassdoor est.),The ideal Solid Form Scientist candidate will ...,3.5,LGC Limited,"Petaluma, CA",1001 to 5000 Employees,1842,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable
4,"Scientist I, Analytical Development",-1,DESCRIPTION\nThe scientist I will be responsib...,-1.0,Matica Biotechnology,"College Station, TX",-1,-1,-1,-1,-1,-1
