# IndeedScrap doc
These functions perform web scraping on the Indeed platform in order to collect job details.
____
Requirements:

In [25]:
from urllib.request import urlopen, HTTPError
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import re

The *scrapPage()* function downloads the HTML document at a given URL and parses it.  
It exists to avoid repeating this code in the main function.

In [2]:
def scrapPage(url):
    """Download the document at *url* and return it parsed by BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` tree built with the ``'html.parser'`` backend.
    """
    # The body is read in full inside the ``with`` block, so the connection
    # is already closed by the time the markup is parsed.
    with urlopen(url) as response:
        html = response.read()
    return BeautifulSoup(html, 'html.parser')

The *scrapID()* function collects the IDs of the job ads published on the active page.  
Each ID is the value of the *'data-jk'* attribute of a `div` element with the *'jobsearch-SerpJobCard'* class.

In [3]:
def scrapID(page):
    """Collect the job-ad IDs listed on a parsed search-results page.

    Args:
        page: Parsed results page; it must expose ``find(id=...)``.

    Returns:
        set: The ``data-jk`` values of the ``jobsearch-SerpJobCard`` divisions
        found inside the ``resultsCol`` element. The set is empty when the
        page has no ``resultsCol`` element or no job card.
    """
    resultCol = page.find(id="resultsCol")
    if resultCol is None:
        # A page without a results column has no ads to report; return an
        # empty set instead of failing on ``None``.
        return set()
    jobcards = resultCol.find_all("div", {"class": "jobsearch-SerpJobCard"})
    # Cards lacking a (non-empty) ``data-jk`` attribute are skipped rather
    # than aborting the whole page with a KeyError.
    return {
        jobcard["data-jk"] for jobcard in jobcards if jobcard.get("data-jk")
    }

The *stripmatch()* function reads the results counter of the current search and returns the page number being displayed together with the total number of matches.  
Since the thousands separator differs from one country to another, a regular expression is used to normalize the result: when more than two numbers are found, the total exceeds one thousand and its groups of digits are joined back together.

In [57]:
def stripmatch(page):
    """Read the current page number and total match count from a results page.

    The counter text is taken from the element whose id is
    ``searchCountPages``. The first run of digits is the page number; all
    remaining runs are joined to form the total, so that totals written with
    any thousands separator ("1,234", "1.234", "1 234") are read as 1234.

    Args:
        page: Parsed results page; it must expose ``find(id=...)``.

    Returns:
        tuple: ``(repage, match)`` as integers, or ``(None, None)`` when the
        counter element is missing or does not contain both numbers.
    """
    try:
        text = page.find(id="searchCountPages").text
    except AttributeError:
        # No counter element on the page.
        return None, None

    # Match digit runs only: a pattern that also accepts "." would capture
    # "1.234" as a single token, which int() cannot convert.
    numlist = re.findall(r'\d+', text)
    if len(numlist) < 2:
        # Page number and/or total missing: report the counter as unreadable.
        return None, None

    repage = int(numlist[0])
    match = int(''.join(numlist[1:]))
    return repage, match

The **`scrapIndeedID()`** function extracts the job IDs for every search term in every requested country.  
The site is divided into independent, country-specific subdomains (the site for one country does not have access to the data of another), so scraping is performed separately on each subdomain.  
The number of results per page is arbitrarily set to 50.  
After page 101 of results, Indeed considers the ads to be irrelevant. These will not be kept.

In [58]:
def scrapIndeedID(searchList, countryList, limit=50, maxPage=101):
    """Collect the job-ad IDs for every search term on every country subdomain.

    For each (search, country) pair the result pages are requested one after
    the other until the site stops serving new pages, an HTTP error occurs,
    or ``maxPage`` pages have been requested.

    Args:
        searchList: Iterable of search strings (e.g. ``"Data analyst rennes"``).
        countryList: Iterable of country codes used as subdomain
            (case-insensitive, e.g. ``"FR"``).
        limit: Number of results requested per page.
        maxPage: Maximum number of result pages requested per search and
            country; later pages are not kept.

    Returns:
        set: Union of all the IDs collected.
    """
    from urllib.parse import quote_plus

    setID = set()
    for search in searchList:
        # Spaces become "+" as before; every other reserved or non-ASCII
        # character is percent-encoded so the URL stays valid. "+" is kept
        # safe so that already "+"-joined queries behave as they used to.
        query = quote_plus(search, safe="+")
        for country in countryList:
            subdomain = country.lower()
            # ``count`` is the number of the page being requested.
            for count in range(1, maxPage + 1):
                url = "https://{}.indeed.com/jobs?q={}&limit={}&start={}".format(
                    subdomain, query, limit, (count - 1) * limit)
                try:
                    page = scrapPage(url)
                except HTTPError:
                    break
                repage, match = stripmatch(page)
                # Stop when the counter is missing, or when the page number
                # reported by the site is lower than the one requested.
                if match is None or repage < count:
                    break
                setID |= scrapID(page)
    return setID

In [59]:
# Example run: collect the IDs for one query on the French subdomain.
# NOTE: this performs live HTTP requests against fr.indeed.com.
ids = scrapIndeedID(["Data analyst rennes"],["FR"])
ids

{'13c0fcd2f51314b1',
 '26dd13f600815706',
 '2c33db85b0d10abd',
 '336e0f519279fdc3',
 '504c7b1339061c76',
 '5b1fe02f0c07a395',
 '5ec8f851ed1d63ca',
 '75cac45a456c7cf1',
 '84ab24c33e15277b',
 '89072c947f0c794a',
 '8ae9822078c37016',
 '8b354cc1c1134dec',
 '8ca3444ba516af37',
 '9366877d7515d09e',
 'aa572127075b638a',
 'abaf4ab6fa0b7578',
 'b0b2a0eb1416388c',
 'bf852ecbb8cedb46',
 'dec932fde796bb3b',
 'df902a5c51056179',
 'e6ed6645740f4c5d',
 'e84c3a7fa2589b2a'}