## 010 Retrieving Skill Sets in Job Postings
This notebook shows how we gathered a dataset of job postings. 

In [27]:
import requests
from bs4 import BeautifulSoup
import nltk
import matplotlib.pyplot as plt
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
from datetime import datetime
import time
import random
from langdetect import detect
import os
import pickle

In [2]:
# Browser-like request header sent with every HTTP call below;
# it makes web scraping work easier.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}

## Functions

This function builds the list of Indeed search-result page URLs for a given search term and country. It does not make any requests itself.

In [3]:
def get_pages(search_term, page_count, country):
    """Return the Indeed search-result URLs for a search term.

    Parameters
    ----------
    search_term : str
        Query inserted verbatim after ``q=`` in the URL.
    page_count : int
        Number of result pages to generate URLs for.
    country : str
        Indeed subdomain (e.g. ``'nl'``) placed in front of ``.indeed.com``.

    Returns
    -------
    list of str
        One URL per page; the ``start`` parameter advances in steps of 10.
    """
    return [
        'https://'+country+'.indeed.com/jobs?q='+search_term+'&start='+str(start)
        for start in range(0, page_count*10, 10)
    ]

This function returns the links to all job postings found on a single Indeed search page. It is used together with get_pages(), which supplies the search-page URLs.

In [5]:
def get_links(search_link, country): #Takes in one search page and returns all job links on the page
    """Collect the job-posting links found on one Indeed search page.

    Parameters
    ----------
    search_link : str
        URL of a search-result page (as produced by ``get_pages``).
    country : str
        Indeed subdomain used to build the ``viewjob`` links.

    Returns
    -------
    list of str
        Unique ``https://<country>.indeed.com/viewjob?jk=<key>`` links, in
        the order they first appear on the page.
    """
    links = []
    page = requests.get(search_link, headers = headers)
    search_soup = BeautifulSoup(page.text)
    job_card_divs = search_soup.find_all("div", {"class": "mosaic mosaic-provider-jobcards"})
    for div in job_card_divs:
        for anchor in div.find_all("a"):
            job_key = anchor.get('data-jk')
            # Anchors without a data-jk attribute are not job postings. Skip
            # them entirely so that no link is built from (or left over
            # from) a previous anchor.
            if not isinstance(job_key, str):
                continue
            link = 'https://'+country+'.indeed.com/viewjob?jk='+job_key
            if link not in links:
                links.append(link)
    return links

This function takes a list of URLs created by get_links(). It visits each link and saves the response text to a file named after the job key.

In [14]:
def get_requests(urls, captcha_wait=1200, delay=10): #Takes in job links and saves each request.text to file
    """Download every job posting in ``urls`` and save the raw response text.

    Each response is written to ``../datasets/requests/<jk>``, where ``<jk>``
    is the part of the URL after ``jk=``. When a captcha page is served, the
    function sleeps and retries the same URL until a regular page comes back.

    Parameters
    ----------
    urls : iterable of str
        Job links of the form ``https://<country>.indeed.com/viewjob?jk=<jk>``.
    captcha_wait : float, optional
        Seconds to sleep before retrying after a captcha page.
    delay : float, optional
        Base number of seconds to sleep between postings; a random offset in
        ``[-1, 1]`` is added.
    """
    # Build the target directory with os.path so the path is valid on every
    # platform; a hand-written backslash path would turn "\r" into a
    # carriage return.
    out_dir = os.path.join("..", "datasets", "requests")
    os.makedirs(out_dir, exist_ok=True)

    for url in urls:
        # The job key is whatever follows "jk=", independent of how long the
        # country subdomain is.
        jk = url.rsplit("jk=", 1)[-1]
        path = os.path.join(out_dir, jk)

        while True:
            response = requests.get(url, headers = headers)
            title = BeautifulSoup(response.text).find("title")
            # Compare against the title's text; a page without a <title>
            # is treated as a regular (non-captcha) page.
            if title is None or "hCaptcha solve page" not in title.get_text():
                break
            time.sleep(captcha_wait)

        with open(path, "w") as file:
            file.write(response.text)
        # Sleep after the file is closed; behave a bit like a human.
        time.sleep(max(0, delay+random.uniform(-1,1)))
     

## Scraping 
In this section a set of job postings is scraped from the web and stored as a set of response text files.

In [10]:
# Indeed country subdomains to search.
countries = ['se','no','nl','de','be','es','it','ch','fr','uk','pt','fi','lu','at']

# One inner list of job links per search page visited.
link_list = []
for country_code in countries:
    for search_page in get_pages("Data Scientist", 10, country_code):
        link_list.append(get_links(search_page, country_code))
        time.sleep(10+random.uniform(-1,1)) #behave a bit like human

# Persist the collected links (pickled, despite the .txt extension).
with open("link_list.txt", "wb") as fp:
    pickle.dump(link_list, fp)

In [21]:
# Show the collected job links (one inner list per search page visited).
print(link_list)

[['https://nl.indeed.com/viewjob?jk=16f151177a6de429', 'https://nl.indeed.com/viewjob?jk=64bc5ad1e3b61059', 'https://nl.indeed.com/viewjob?jk=a5f719a1728e7074', 'https://nl.indeed.com/viewjob?jk=a7712338f646bd55', 'https://nl.indeed.com/viewjob?jk=984b6301bd1ffb16', 'https://nl.indeed.com/viewjob?jk=23a7e7cfb60d054e', 'https://nl.indeed.com/viewjob?jk=6b5f267033d3b2c3', 'https://nl.indeed.com/viewjob?jk=33213b318cb79506', 'https://nl.indeed.com/viewjob?jk=33ca3ecf5e219745', 'https://nl.indeed.com/viewjob?jk=9060ff5fee52893c', 'https://nl.indeed.com/viewjob?jk=851446284e9a6f45', 'https://nl.indeed.com/viewjob?jk=066501a6c6c3589f', 'https://nl.indeed.com/viewjob?jk=faf312d38dd2cc41', 'https://nl.indeed.com/viewjob?jk=a7dab3a06bddd0aa', 'https://nl.indeed.com/viewjob?jk=78935aa3094b1501']]


In [24]:
# Download the postings of every search page. The loop variable must not be
# called `list`: that would shadow the builtin for the rest of the session.
for page_links in link_list:
    get_requests(page_links)

## Retrieving a dataset from the acquired http responses

In [25]:
def request_to_row(request):
    """Parse one saved job-posting response into a dataset row.

    Parameters
    ----------
    request : str
        File name inside ``../datasets/requests/``.

    Returns
    -------
    list or None
        ``[timestamp, url, title, location, country, full_text, list_elements]``
        for an English posting. ``None`` when the file has no (non-empty)
        ``jobDescriptionText`` element or the description is not detected
        as English.
    """
    with open("../datasets/requests/"+ request) as file:
        soup = BeautifulSoup(file.read())

    description = soup.find("div",{"id": "jobDescriptionText"})
    if description is None:
        # Not a job-posting page (or an unexpected layout): nothing to parse.
        return None
    full_text = description.text
    # Skip empty descriptions before language detection; only parse
    # English job postings.
    if not full_text.strip() or detect(full_text) != 'en':
        return None

    timestamp = None  # no timestamp is extracted from the saved page

    # The page title is split on "-": first part is used as the job title,
    # second part as the location.
    title_tag = soup.find("title")
    title_parts = title_tag.text.split("-") if title_tag is not None else []
    title = title_parts[0] if title_parts else None
    location = title_parts[1] if len(title_parts) > 1 else None

    # Read the share URL from the meta tag's content attribute instead of
    # slicing the tag's string representation at fixed offsets.
    share_meta = soup.find("meta", {"id": "indeed-share-url"})
    url = share_meta.get("content") if share_meta is not None else None

    # Characters 8-9 follow "https://", i.e. the start of the subdomain.
    country = url[8:10] if url else None

    # Text of every <li> inside a <ul> of the job description.
    list_elements = [
        li.text
        for div in soup.find_all("div",{"id": "jobDescriptionText"})
        for ul in div.find_all("ul")
        for li in ul.find_all('li')
    ]
    return [timestamp,url,title,location,country,full_text,list_elements]

In [40]:
# Build the European dataset from the saved responses.
requests_dir = '../datasets/requests'
output_name = "df_europe"  # CSV written into the same directory as the responses

rows = []
for filename in os.listdir(requests_dir):
    # The output CSV lives next to the saved responses; do not try to parse
    # it as a job posting when this cell is re-run.
    if filename == output_name:
        continue
    row_list = request_to_row(filename)
    # request_to_row returns None for postings that are skipped.
    if row_list is not None:
        rows.append(row_list)

# Create the frame once instead of growing it row by row.
df = pd.DataFrame(rows, columns = ["dt","url","title","location","country","full_text","list_elements"])
df.to_csv(requests_dir + "/" + output_name)

## Combining with dataset downloaded from Kaggle

In [42]:
# Load the downloaded US job postings and bring them into the same column
# layout as the scraped European dataset.
df_dl = pd.read_csv('../datasets/data_scientist_united_states_job_postings_jobspikr.csv')
df_dl = df_dl.dropna(subset=['html_job_description']) # Without the full html the job postings are not useful to us

# Text of every <li> inside a <ul> of the job description, one list per posting.
list_of_list_elements = []
for html in df_dl["html_job_description"]:
    soup = BeautifulSoup(html)
    list_of_list_elements.append([
        li.text
        for div in soup.find_all("div",{"id": "jobDescriptionText"})
        for ul in div.find_all("ul")
        for li in ul.find_all('li')
    ])

# Select, rename by name (not by position) and extend in a single chain, so
# no column is ever assigned onto a slice of another frame.
df_dl = (
    df_dl[["crawl_timestamp","url","job_title","inferred_city","job_description"]]
    .rename(columns={
        "crawl_timestamp": "dt",
        "job_title": "title",
        "inferred_city": "location",
        "job_description": "full_text",
    })
    # A scalar is broadcast to every row, so no per-row list of "us" is needed.
    .assign(country="us", list_elements=list_of_list_elements)
    .reindex(columns=["dt","url","title","location","country","full_text","list_elements"])
)

In [45]:
#Appending the european dataset with the USA downloaded dataset
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the rows of `df` first, followed by those of `df_dl`.
df_full = pd.concat([df, df_dl])
# Postings with the same location and description text count as duplicates.
df_full = df_full.drop_duplicates(subset=['location','full_text'])
df_full.to_csv('../datasets/df_full.csv')
len(df_full)

2633