In [106]:
# Importing Libraries
import re

import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize

In [104]:
def get_soup_object(url, timeout=30):
    """
    Function to download a web page and return its soup object
    Input: URL (string)
           timeout (number, optional) - seconds requests waits for the server
           before giving up; defaults to 30
    output: BeautifulSoup object built from the response body with the "lxml" parser

    The HTTP status code is not checked, so an error page is parsed like any
    other response. Exceptions raised by requests.get (connection failure,
    timeout) propagate to the caller.
    """
    # Fetch the `url` argument, not the notebook-level `URL` global: the function
    # must work for any page and on a fresh kernel where `URL` is not defined yet.
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "lxml")
    return soup


def get_title(soup):
    """
    Function to look up the job title in a parsed job-posting page
    Input: BeautifulSoup object
    output: Job Title, i.e. the text of the first <h1> element (string)
    """
    heading = soup.find('h1')
    return heading.text

def get_job_desc(soup):
    """
    Function to return the job description for a given soup object,
    split into sentences
    Input: BeautifulSoup object
    output: Job description sentences, each stripped of surrounding
            whitespace (list of strings); empty list when the page has no
            'show-more-less-html__markup' div
    """
    div = soup.find('div', {'class': 'show-more-less-html__markup'})
    # find() returns None when the element is absent; return an empty
    # description instead of failing with AttributeError on `None.text`.
    if div is None:
        return []
    return [sentence.strip() for sentence in sent_tokenize(text=div.text)]

def get_job_features(soup):
    """
    Function to get some features related to the job
    Input: BeautifulSoup object
    output: features about job (dictionary mapping each criterion label to its
            value, taken pairwise in page order); empty dictionary when the
            page has no 'description__job-criteria-list' element
    """
    feat = soup.find('ul', {'class': "description__job-criteria-list"})
    # find() returns None when the list is absent; report "no features"
    # instead of failing with AttributeError on `None.text`.
    if feat is None:
        return {}
    # Split on blank lines. No capture group in the pattern: re.split would
    # otherwise insert the captured '\n' separators into the result.
    chunks = [chunk.strip() for chunk in re.split(r'\n{2}', feat.text)]
    # Drop only the empty chunks. A length threshold would also discard real
    # short values (e.g. "Sales", "Other") and shift every later label/value pair.
    chunks = [chunk for chunk in chunks if chunk]
    # Chunks alternate label, value, label, value, ...; zip ignores a trailing
    # label that has no value.
    return dict(zip(chunks[0::2], chunks[1::2]))

In [107]:
# Example job posting; the `soup` parsed from it is reused by the cells below.
# NOTE(review): the refId/trackingId/trk query parameters look like tracking data -
# the canonical link in the output below omits them; confirm they can be dropped.
URL = "https://ca.linkedin.com/jobs/view/data-consultant-at-lancesoft-inc-3464918474?refId=RyUuys20i1QivQXdjkYKeg%3D%3D&trackingId=J9MA0ZsIJZs2QVY7FA4AKw%3D%3D&trk=public_jobs_topcard-title"
# Download the page and parse it
soup = get_soup_object(URL)
# Show only the first 1000 characters of the parsed document to keep the output short
str(soup)[:1000]

'<!DOCTYPE html>\n<html lang="en">\n<head>\n<meta content="d_jobs_guest_details" name="pageKey"/>\n<!-- --> <meta content="en_US" name="locale"/>\n<meta data-app-version="2.0.1256" data-browser-id="a3969546-942c-4ea8-85ef-8f3363ae0911" data-call-tree-id="AAX07iodg5biJ3gDHJeOrA==" data-disable-jsbeacon-pagekey-suffix="false" data-enable-page-view-heartbeat-tracking="" data-member-id="0" data-multiproduct-name="jobs-guest-frontend" data-page-instance="urn:li:page:d_jobs_guest_details;xCGLgpAlSQegEDrA1VzPQw==" data-service-name="jobs-guest-frontend" id="config"/>\n<link href="https://ca.linkedin.com/jobs/view/data-consultant-at-lancesoft-inc-3464918474" rel="canonical"/>\n<!-- --><!-- -->\n<!-- -->\n<!-- -->\n<meta content="https://ca.linkedin.com/jobs/view/data-consultant-at-lancesoft-inc-3464918474" property="al:android:url"/>\n<meta content="com.linkedin.android" property="al:android:package"/>\n<meta content="LinkedIn" property="al:android:app_name"/>\n<meta content="https://ca.linked

### Getting Title of Job Posting

In [108]:
# Title is read from the page's <h1> element
get_title(soup)

'Data Consultant'

### Getting raw text from job description
##### This is the raw text that we will annotate to extract the skills mentioned in the job description.

In [112]:
# Description text split into sentences with nltk's sent_tokenize
get_job_desc(soup)

['Must be located in British Columbia:Specific Qualifications or Experience Required:Technical Expertise: Hands-on extensive experience designing and implementing Purview Hands-on experience with Data Governance processes and procedures Hands-on experience building PoCs to develop understanding of data governance tooling Overall capability: Strong analytical, problem solving and system analysis skills including the ability to comprehend complex large scale environment operational issues, preferably in health care Excellent interpersonal, oral and written communication skills with the ability to communicate complex ideas in simple terminology Excellent time management skills with the ability to organize and prioritize work Persistence and a positive attitude are essential Ability to work in a dynamic and agile environment with changing requirements and priorities Ability to take direction to define and implement solutions as requirements are being identified and finalized Keeping the bi

### Getting raw features related to Job description
##### These are just some extra features of the posting (not to be confused with the annotations: the skills required for the job are annotated manually).

In [110]:
# Job criteria list returned as a {label: value} dictionary
get_job_features(soup)

{'Seniority level': 'Mid-Senior level',
 'Employment type': 'Contract',
 'Job function': 'Other and Health Care Provider',
 'Industries': 'IT Services and IT Consulting and Hospitals and Health Care'}