In [1]:
from splinter import Browser
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import numpy as np
import pymongo

In [2]:
# Open a PyMongo client against the MongoDB server at localhost:27017.
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(host=conn)

In [3]:
# Select the `jobs_db` database and its `jobs_glassdoor` collection.
db = client["jobs_db"]
jobs_glassdoor_mongo = db["jobs_glassdoor"]
# Drop the collection so documents from earlier runs are discarded.
jobs_glassdoor_mongo.drop()

In [4]:
# Start a Chrome session through splinter with a visible window
# (headless=False), driven by the `chromedriver.exe` executable.
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

In [6]:
# Empty placeholder; the scraping loop rebinds this name for every posting.
job_dict = dict()

In [5]:
# Scrape Glassdoor search results for every job category and store one MongoDB
# document per posting in `jobs_glassdoor_mongo`.
job_categories = ["Data Analyst", "Data Scientist"]
region = "United States"
posted_time = "7"  # sent as the `fromAge` query parameter (presumably days -- confirm)
for job_category in job_categories:

    # Search URL for this category; all other filters are left at the
    # values hard-coded in the query string below.
    glassdoor_url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=' + job_category + \
    "&locT=N&locId=1&locKeyword=" + region + "&jobType=all&fromAge=" + posted_time + \
    "&minSalary=0&includeNoSalaryJobs=true&radius=-1&cityId=-1&minRating=0.0&industryId=-1" +\
    "&companyId=-1&applicationType=0&employerSizes=0&remoteWorkType=0"

    browser.visit(glassdoor_url)
    job_counter = 1
    # Walk through at most 19 result pages per category.
    for x in range(1, 20):
        print("Page no.:", x, "--------")

        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        job_section = soup.find('ul', class_='jlGrid')
        # `find` returns None when the results list is absent from the page;
        # treat that as "no postings on this page" instead of crashing.
        if job_section is None:
            job_list = []
        else:
            job_list = job_section.find_all('li', class_='jl')

        for job in job_list:
            # Job Title
            job_title = job.find('div', class_='jobTitle').a.text
            # Company Name: the text before the en dash, with surrounding
            # whitespace removed.
            emp_loc = job.find('div', class_='empLoc')
            company_text = emp_loc.text.split("–")
            company_name = company_text[0].strip()
            # City and State
            location = emp_loc.span.text.split(", ")
            # Salary information - Min, Max and Avg (values as shown on the
            # page with the "$" prefix and "k" suffix removed).
            try:
                salary_text = job.find('span', class_="green").text.replace("(Glassdoor Est.)", "").replace("(Employer Est.)", "")
                sal_range = salary_text.strip().split("-")
                min_sal = int(sal_range[0].strip().lstrip("$").rstrip("k"))
                max_sal = int(sal_range[1].strip().lstrip("$").rstrip("k"))
                avg_sal = (max_sal+min_sal)/2
            except (AttributeError, ValueError, IndexError):
                # AttributeError: the posting has no salary element.
                # ValueError / IndexError: the salary text is not a
                # "$<int>k-$<int>k" range. Either way, store no salary.
                min_sal = None
                max_sal = None
                avg_sal = None

            print('job no.:', job_counter, '-------------')
            job_counter +=1

            # A fresh dict per posting: insert_one adds an `_id` key to the
            # document it is given.
            job_dict = {"Title": job_title,
                        "Company": company_name,
                        "City": location[0],
                        "State": location[-1],
                        "Designation": job_category,
                        "Min Salary": min_sal,
                        "Max Salary": max_sal,
                        "Avg Salary": avg_sal,
                        "Source": "Glassdoor"
                        }
            jobs_glassdoor_mongo.insert_one(job_dict)

        try:
            browser.click_link_by_partial_text('Next')
        except Exception:
            # If a pop-up window opens, close it.
            # NOTE(review): the "Next" click is not retried here, so the
            # following iteration may parse the same page again -- confirm
            # whether that is intended.
            browser.find_by_css('.xBtn').click()

        # Random 4-6 second pause between page loads.
        sleep(np.random.choice(range(4,7)))

    # Longer pause before moving on to the next job category.
    sleep(10)
    

In [8]:
# Close the browser session now that scraping is finished.
browser.quit()

In [9]:
# Read every document back from the `jobs_glassdoor` collection and print it.
jobs_glassdoor_listing = db["jobs_glassdoor"].find()
for listing in jobs_glassdoor_listing:
    print(listing)

{'_id': ObjectId('5c06046f0ea05e5c540c71cb'), 'Title': 'Data Analyst', 'Company': ' GameChanger ', 'City': 'United States', 'State': 'United States', 'Designation': 'Data Analyst', 'Salary': '', 'Source': 'Glassdoor'}
{'_id': ObjectId('5c06046f0ea05e5c540c71cc'), 'Title': 'Data Analyst', 'Company': ' T. Rowe Price ', 'City': 'Baltimore', 'State': 'MD', 'Designation': 'Data Analyst', 'Salary': ' $69k-$90k', 'Source': 'Glassdoor'}
{'_id': ObjectId('5c06046f0ea05e5c540c71cd'), 'Title': 'Data Analyst', 'Company': ' SMC 3 ', 'City': 'Peachtree City', 'State': 'GA', 'Designation': 'Data Analyst', 'Salary': ' $53k-$72k', 'Source': 'Glassdoor'}
{'_id': ObjectId('5c06046f0ea05e5c540c71ce'), 'Title': 'Data Analyst', 'Company': ' Trividia Health ', 'City': 'Fort Lauderdale', 'State': 'FL', 'Designation': 'Data Analyst', 'Salary': ' $57k-$78k', 'Source': 'Glassdoor'}
{'_id': ObjectId('5c06046f0ea05e5c540c71cf'), 'Title': 'Data Analyst', 'Company': ' Walmart eCommerce ', 'City': 'Sunnyvale', 'State