## Jobs scraping

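This notebook scrapes job offers from elempleo.com: it pages through the offer listing with Selenium, parses each page with BeautifulSoup, and collects the results in a pandas DataFrame.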

In [2]:
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
import wordcloud
from bs4 import BeautifulSoup
from selenium import webdriver

# Url of the main webpage
url = "http://www.elempleo.com/co/ofertas-empleo/"

# Number of pages to be scraped
nopag = 4

# Absolute XPath of the pagination "next" link (tied to the current page layout)
next_xpath = "/html/body/div[8]/div[4]/div[1]/div[4]/div/nav/ul/li[8]/a"

# Initialize browser
browser = webdriver.Chrome()

# Get html code clicking the next button to navigate the table
pages = []
try:
    browser.get(url)
    for page_no in range(nopag):

        # Save current page content
        html = browser.page_source
        pages.append(BeautifulSoup(html, 'html.parser'))

        # The last requested page is already stored: do not load one more
        if page_no == nopag - 1:
            break

        # Find and click "next" button. find_element(by, value) is used because
        # the find_element_by_* helpers are gone in recent Selenium 4 releases.
        button = browser.find_element("xpath", next_xpath)
        button.click()

        # Wait for content to load
        time.sleep(10)
finally:
    # Always shut down the browser and its driver process, even if scraping fails
    browser.quit()

In [3]:
# Locate the results container on each scraped page
tables = [page.select(".container .row .result-list")[0] for page in pages]

# Flatten the job cards of every results container into a single list
jobs = [card for table in tables for card in table.select(".result-item")]

In [4]:
def san(s):
    """Return a cleaned copy of every string in ``s``.

    Line breaks, tabs and carriage returns are deleted (not replaced by a
    space), then leading and trailing spaces are trimmed. Inner spaces are
    left untouched.

    Parameters
    ----------
    s : iterable of str
        Raw texts to clean.

    Returns
    -------
    list of str
        Cleaned texts, in the same order as the input.
    """
    # Applied in this order: control characters first, then edge spaces
    patterns = ("\\n|\\t|\\r", "^ +", " +$")

    cleaned = []
    for text in s:
        for pattern in patterns:
            text = re.sub(pattern, "", text)
        cleaned.append(text)
    return cleaned

def _field_texts(selector):
    """Sanitized text of the first element matching ``selector`` in each job card."""
    return san([card.select(selector)[0].get_text() for card in jobs])


titles = _field_texts(".text-ellipsis")

salaries = _field_texts(".info-salary")
cities = _field_texts(".info-city")
companies = _field_texts(".info-company-name")

# Publication dates come prefixed with "Publicado "; keep only the date itself
dates = [re.sub("^Publicado ","",text) for text in _field_texts(".info-publish-date")]

# Href of the first anchor found inside a div of each job card
links = [card.select("div a")[0]['href'] for card in jobs]

In [5]:
def getProfession(url, prof_link, timeout=30):
    """Return the professions listed on a job offer page.

    Parameters
    ----------
    url : str
        Base address of the site, used to resolve relative offer links.
    prof_link : str
        Link to the offer page, either absolute or relative to ``url``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30). Without a timeout a stalled connection blocks forever.

    Returns
    -------
    str
        Text of every ``li`` element matched by
        ``div #ProfessionLightBox #messageResponse li``, one per line.
        Empty string when nothing matches.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on network failure or timeout.
    """
    # urljoin leaves absolute links untouched and resolves relative ones
    # against the base, so no substring check or manual concatenation is needed
    response = requests.get(urljoin(url, prof_link), timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    items = soup.select("div #ProfessionLightBox #messageResponse li")
    return "\n".join(item.get_text() for item in items)
    
# Site root that the offer links are resolved against
url = "http://www.elempleo.com"

# Visit every offer page and collect its list of professions
profs = []
for link in links:
    profs.append(getProfession(url, link))

In [6]:
# One column per scraped field; the dict order is the column order of the table
job_columns = {
    "date": dates,
    "firm": companies,
    "city": cities,
    "title": titles,
    "salary": salaries,
    "profession": profs,
    "link": links,
}

jobs_tab = pd.DataFrame(job_columns)
jobs_tab

Unnamed: 0,date,firm,city,title,salary,profession,link
0,18 Jul 2018,Ingenian Software,Bogotá y ...,(007sa) experto hl7,Salario confidencial,Ingeniería de sistemas Computación,/co/ofertas-trabajo/sa-experto-hl/1883482018
1,18 Jul 2018,Empresa confidencial,Bogotá y ...,Jefe de logística y bodega,"$3 a $3,5 millones",Ingeniería de alimentos\nIngeniería industrial...,/co/ofertas-trabajo/jefe-de-logistica-y-bodega...
2,18 Jul 2018,Ingenian Software,Bogotá y ...,(008ca) capacitadores para módulos,Salario confidencial,Ingeniería de sistemas Computación,/co/ofertas-trabajo/ca-capacitadores-para-modu...
3,18 Jul 2018,Empresa confidencial,Bogotá y ...,Inspector de obra,Salario confidencial,Construcción,/co/ofertas-trabajo/inspector-de-obra/1883492488
4,18 Jul 2018,Empresa confidencial,Bogotá y ...,Asistente bilingüe (francés),"$2,5 a $3 millones",Secretariado\nAdministración de empresas\nOtra,/co/ofertas-trabajo/asistente-bilingue-frances...
5,18 Jul 2018,Ingenian Software,Bogotá y ...,(009se) lider desarrollador .net,Salario confidencial,Ingeniería de sistemas Computación,/co/ofertas-trabajo/se-lider-desarrollador-net...
6,18 Jul 2018,Empresa confidencial,Ibagué,Asesor comercial bancario jornada adicional ib...,Menos de $1 millón,Administración de empresas\nAdministración de ...,/co/ofertas-trabajo/asesor-comercial-bancario-...
7,18 Jul 2018,Empresa confidencial,Bogotá y ...,Revenue mgmt asst analyst,"$2 a $2,5 millones",Ingeniería industrial\nAdministración de empre...,/co/ofertas-trabajo/revenue-mgmt-asst-analyst/...
8,18 Jul 2018,Ingenian Software,Bogotá y ...,(010se) ingeniero desarrollador semi senior,Salario confidencial,Ingeniería de sistemas Computación,/co/ofertas-trabajo/se-ingeniero-desarrollador...
9,18 Jul 2018,Empresa confidencial,Bogotá,Consultor junior corporativo regional bogotá/c...,"$1,5 a $2 millones",Contaduría\nIngeniería industrial\nAdministrac...,/co/ofertas-trabajo/consultor-junior-corporati...
