In [3]:
"""
Created on Tue Apr 28 11:35:04 2020
@author: chrislovejoy
"""

import urllib
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import glob
import os
import sys  
sys.path.insert(1, 'C:/Users/tom/projects/skill-skeleton/utils/')

import spacy_util

def find_jobs_from(website: str, job_title: str, location: str, desired_characs: list, filename: str="results", pages: int=1):    
    """
    This function extracts all the desired characteristics of all new job postings
    of the title and location specified and returns them in single file.
    The arguments it takes are:
        - Website: to specify which website to search (options: 'Indeed' or 'CWjobs')
        - Job_title
        - Location
        - Desired_characs: this is a list of the job characteristics of interest,
            from titles, companies, links and date_listed.
        - Filename: to specify the filename and format of the output.
            Default is .xls file called 'results.xls'
    """
       
    if website == 'Indeed':
        start=0
        while start < pages:
            job_soup = load_indeed_jobs_div(job_title, location, start)                                         
            jobs_list, num_listings = extract_job_information_indeed(job_title,job_soup, desired_characs)
                                          
            start += 1    
            
            save_jobs_to_csv(jobs_list, filename + '_' + str(start))    
            print('{} new job postings retrieved from {}. Stored in {}.'.format(num_listings, website, filename))
    
    #save_jobs_to_excel(jobs_list, filename)   
    

## ======================= GENERIC FUNCTIONS ======================= ##

def save_jobs_to_excel(jobs_list: list, filename: str):
    jobs = pd.DataFrame(jobs_list)
    jobs.to_excel(filename+ ".xlsx",index=True, index_label="Id")


def save_jobs_to_csv(jobs_list: list, filename: str):
    jobs = pd.DataFrame(jobs_list)
    jobs.to_csv(filename + ".csv",index=True, index_label="Id")

## ================== FUNCTIONS FOR INDEED.COM =================== ##


def load_indeed_job_descriptions_from_list(jobs_list: list):
    job_descs = []
    for i in range(len(jobs_list)):
        job_descs.append(load_indeed_job_description(jobs_list[i]))
    return job_descs


def load_indeed_job_description(job: str):
    driver = webdriver.Chrome("chromedriver.exe")
    driver.get(job)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    job_desc = soup.find('div', class_='jobsearch-jobDescriptionText')
    return job_desc.text.strip()


def load_indeed_jobs_div(job_title: str, location: str, start: int=0) -> BeautifulSoup:
    if start == 0:
        getVars = {'q' : job_title, 'rbl' : location, 'fromage' : 'last', 'sort' : 'date'}
    else:
        param_start = (start+1)*10
        
        getVars = {'q' : job_title, 'rbl' : location, 'fromage' : 'last', 'sort' : 'date', 'start' : param_start}
    
    url = ('https://www.indeed.com/jobs?' + urllib.parse.urlencode(getVars))    
    print(url)
    driver = webdriver.Chrome()
    driver.get(url)    
    soup = BeautifulSoup(driver.page_source, "html.parser")    
    job_soup = soup.find(id="mosaic-jobResults")
    driver.close()
    
    return job_soup


def extract_job_information_indeed(job_title: str, job_soup: BeautifulSoup, desired_characs: list) -> tuple[dict,int]:
    job_elems = job_soup.find_all('div', class_='cardOutline') 
         
    cols = []
    extracted_info = []
    
    cols.append('profile')
    extracted_info.append(job_title)   
    
    if 'titles' in desired_characs:
        titles = []
        cols.append('titles')
        for job_elem in job_elems:
            titles.append(extract_job_title_indeed(job_elem))
        extracted_info.append(titles)                    
    
    if 'companies' in desired_characs:
        companies = []
        cols.append('companies')
        for job_elem in job_elems:
            companies.append(extract_company_indeed(job_elem))
        extracted_info.append(companies)
    
    if 'links' in desired_characs:
        links = []
        data = []
        cols.append('links')        
        cols.append('data')
        driver2 = webdriver.Chrome()
        
        for job_elem in job_elems:
            temp_link = extract_link_indeed(job_elem)            
            links.append(temp_link)          
                        
            try:
                driver2.get(temp_link)                   
                soup = BeautifulSoup(driver2.page_source, "html.parser")
                job_desc = soup.find('div', class_='jobsearch-JobComponent-description')         
                data.append(job_desc.text.strip().replace('|', '-').replace("&nbsp;"," ").replace("\n"," "))
            except:
                data.append('None')
        
        driver2.close()            
        extracted_info.append(links)
        extracted_info.append(data)
    
    if 'date_listed' in desired_characs:
        dates = []
        cols.append('date_listed')
        for job_elem in job_elems:
            dates.append(extract_date_indeed(job_elem))
        extracted_info.append(dates)    
    
    jobs_list = {}
    
    for j in range(len(cols)):
        jobs_list[cols[j]] = extracted_info[j]
    
    num_listings = len(extracted_info[0]) 
    
    
    return jobs_list, num_listings


def extract_job_title_indeed(job_elem) -> str:
    title_elem = job_elem.find('h2', class_='jobTitle')
    title = title_elem.text.strip()
    return title


def extract_company_indeed(job_elem) -> str:
    company_elem = job_elem.find('div', class_='company_location')
    company = company_elem.text.strip()
    return company


def extract_link_indeed(job_elem) -> str:
    link = job_elem.find('a')['href']
    link = 'http://www.indeed.com' + link
    return link


def extract_date_indeed(job_elem) -> str:
    date_elem = job_elem.find('span', class_='date')
    date = "NA"
    
    if date_elem is not None:
        date = date_elem.text.strip()
        
    return date

def delete_csv_files():
    files = glob.glob('*.csv')
    for f in files:
        os.remove(f)

In [4]:
desired_characs = ['titles', 'companies', 'links', 'date_listed']
pages = 7
output_file = 'indeed-data.csv'

#Clear folder of csv files
delete_csv_files()


#['Data Analyst','Data Engineer','Data Scientist','Business Analyst','Software Engineer','Machine Learning Engineer','Cloud Engineer']


#Data Scientist
#find_jobs_from('Indeed', 'Data Analyst', '', desired_characs, filename="data_analyst_jobs", pages=pages)
#Data Engineer
#find_jobs_from('Indeed', 'Data Engineer', '', desired_characs, filename="data_engineer_jobs", pages=pages)
#Data Scientist
find_jobs_from('Indeed', 'Data Scientist', '', desired_characs, filename="data_scientist_jobs", pages=pages)

#Out of scope - We need more data for less profiles
#Machine Learning Engineer
#find_jobs_from('Indeed', 'Machine Learning Engineer', '', desired_characs, filename="machine_learning_engineer_jobs", pages=pages)
#Cloud Engineer
#find_jobs_from('Indeed', 'Cloud Engineer', '', desired_characs, filename="cloud_engineer_jobs", pages=pages)
#Business Analyst
#find_jobs_from('Indeed', 'Business Analyst', '', desired_characs, filename="business_analyst_jobs", pages=pages)
#Software Engineer
#find_jobs_from('Indeed', 'Software Engineer', '', desired_characs, filename="software_engineer_jobs", pages=pages)


csv_files = glob.glob('*.{}'.format('csv'))

print(csv_files)  
     
df = pd.concat([pd.read_csv(file) for file in csv_files ], ignore_index=True)

df['data'] = df['data'].apply(spacy_util.remove_punctuation)  

# Specify the columns to consider when looking for duplicates
columns = ['titles', 'companies']

# drop duplicates
df = df.drop_duplicates(subset=columns)
 
df.to_csv(output_file,sep='|',index=False, encoding='utf-8')

#NER --- df['data']

https://www.indeed.com/jobs?q=Data+Scientist&rbl=&fromage=last&sort=date
14 new job postings retrieved from Indeed. Stored in data_scientist_jobs.
https://www.indeed.com/jobs?q=Data+Scientist&rbl=&fromage=last&sort=date&start=20
14 new job postings retrieved from Indeed. Stored in data_scientist_jobs.
https://www.indeed.com/jobs?q=Data+Scientist&rbl=&fromage=last&sort=date&start=30
14 new job postings retrieved from Indeed. Stored in data_scientist_jobs.
https://www.indeed.com/jobs?q=Data+Scientist&rbl=&fromage=last&sort=date&start=40
14 new job postings retrieved from Indeed. Stored in data_scientist_jobs.
https://www.indeed.com/jobs?q=Data+Scientist&rbl=&fromage=last&sort=date&start=50
14 new job postings retrieved from Indeed. Stored in data_scientist_jobs.
https://www.indeed.com/jobs?q=Data+Scientist&rbl=&fromage=last&sort=date&start=60
14 new job postings retrieved from Indeed. Stored in data_scientist_jobs.
https://www.indeed.com/jobs?q=Data+Scientist&rbl=&fromage=last&sort=date&