In [None]:
import pandas as pd
from lxml import html
import requests
# pd.set_option('display.max_rows', 20)  # Uncomment to cap displayed rows; change 20 to your desired number

In [None]:
# Load the job-post table from CSV. Later cells read its 'position', 'company',
# 'location', 'job_url' and 'firm_url' columns.
df = pd.read_csv('../data/jobposts/20250623-vancouver.csv')

In [None]:
# count: (rows, columns) of the table, then the number of distinct job URLs.
# The two row counts differ when the same URL occurs on more than one row.
print(df.shape)
print(df['job_url'].unique().shape)

In [None]:
# Shortest token (in characters) that is kept as a keyword.
MIN_CHARS = 2

# clean keywords
# Missing titles/companies become empty strings so that one NaN cannot abort
# the whole cell when the text is iterated character by character below.
position_text = df['position'].fillna('').astype(str).str.lower()
company_text = df['company'].fillna('').astype(str).str.lower()
tmp = position_text + ' ' + company_text
# Every non-letter becomes a space, so split() yields letter-only tokens.
tmp = tmp.apply(lambda text: ''.join(c if c.isalpha() else ' ' for c in text))
tmp = tmp.str.split().apply(set)
df['keywords'] = tmp.apply(lambda kws: {w for w in kws if len(w) >= MIN_CHARS})

# get word frequencies
# Keywords are sets per posting, so each count is the number of postings that
# contain the keyword. value_counts() tallies in one pass instead of calling
# list.count() once per distinct word.
keywords = [kw for kws in df['keywords'] for kw in kws]
keyword_counts = pd.Series(keywords, dtype='object').value_counts()
wf_data = [{'keyword': kw, 'count': int(n)} for kw, n in keyword_counts.items()]

# rank keywords
# Rank 0 is the most frequent keyword. Ties are broken alphabetically so the
# ranking is identical from run to run.
wf = pd.DataFrame(wf_data, columns=['keyword', 'count'])
wf = wf.sort_values(['count', 'keyword'], ascending=[False, True])
wf['rank'] = range(len(wf))
wf = wf.set_index('keyword')

rank_dict = wf['rank'].to_dict()
df['word_ranks'] = df['keywords'].apply(lambda words: [rank_dict[word] for word in words if word in rank_dict])
# NaN (rather than None) keeps these columns numeric even when no posting has a keyword.
df['min_rank'] = df['word_ranks'].apply(lambda x: min(x) if x else float('nan'))

# characteristic rank of job in Vancouver
df['mean_rank'] = df['word_ranks'].apply(lambda x: sum(x) / len(x) if x else float('nan')).round(2)
df = df.sort_values('mean_rank')

# Human-readable keyword list, most frequent keyword first: "kw (rank), kw (rank)".
df['kws_rank'] = df['keywords'].apply(
    lambda kws: ', '.join(
        f'{kw} ({rank_dict[kw]:,})' for kw in sorted(kws, key=rank_dict.__getitem__)
    )
)

# One row per distinct posting with the number of source rows it covers.
# NOTE(review): rows with a missing value in any index column (e.g. location)
# appear to be left out by pandas' default NaN handling when grouping; confirm
# that this is intended.
df['job_count'] = 1
pvt = (
    df.pivot_table(
        index=['position', 'company', 'location', 'min_rank', 'mean_rank', 'kws_rank'],
        values='job_count',
        aggfunc='count',
    )
    .reset_index()
    .sort_values('mean_rank')
    .reset_index(drop=True)
)

In [None]:
# Substring filters applied to the ranked-keyword text of each posting.
includes = ['policy', 'research', 'data'] # or
excludes = ['assistant', 'university', 'senior', 'doctoral', 'intern', 'phd'] # and

kws_text = pvt['kws_rank']

# A posting is wanted when at least one include term occurs in its keywords.
wanted = kws_text.isna()
for term in includes:
    wanted = wanted | kws_text.str.contains(term)

# A posting is unwanted when at least one exclude term occurs in its keywords.
unwanted = pd.Series(False, index=pvt.index)
for term in excludes:
    unwanted = unwanted | kws_text.str.contains(term)

cond = wanted & ~unwanted

# candidate positions, with the URLs of every matching source row attached
res = pvt.loc[cond].reset_index(drop=True)

on = ['position', 'company', 'location']
url_columns = df[on + ['job_url', 'firm_url']]
res = res.merge(url_columns, on=on, how='inner')
res

In [None]:
# Print the posting URL of every candidate row, one per line.
# Iterating the column directly avoids looping over the `.iloc` indexer, which
# only works through Python's __getitem__/IndexError fallback and builds a
# full row Series for every URL.
for job_url in res['job_url']:
    print(job_url)

In [None]:
import requests
from lxml import html
import time
import json
import re
from datetime import datetime, timezone

def extract_job_criteria(tree):
    """Collect the job-criteria list of a posting page into a dict.

    Each ``<li>`` whose class contains ``description__job-criteria-item``
    contributes one entry. The key is the first subheader text, stripped,
    lower-cased and with spaces turned into underscores. The value is the
    first criteria text, stripped. Items lacking either part are skipped.
    """
    header_query = './/h3[contains(@class, "description__job-criteria-subheader")]/text()'
    value_query = './/span[contains(@class, "description__job-criteria-text")]/text()'

    criteria = {}
    for node in tree.xpath('//li[contains(@class, "description__job-criteria-item")]'):
        headers = node.xpath(header_query)
        values = node.xpath(value_query)
        if not (headers and values):
            continue
        label = headers[0].strip().lower().replace(' ', '_')
        criteria[label] = values[0].strip()

    return criteria

def extract_salary(tree, description):
    """Extract salary information from the dedicated element or the description.

    Args:
        tree: Parsed page exposing ``xpath()``. The text of any ``<div>`` whose
            class contains ``salary`` is preferred.
        description: Job description text, or ``None``/empty when unavailable.

    Returns:
        The first non-blank salary element text (stripped) if there is one.
        Otherwise the first match in ``description`` of, in priority order:
        a dollar range, a single dollar amount, ``<amount> USD|CAD`` or
        ``USD|CAD <amount>``. ``None`` when nothing matches.
    """
    # Try dedicated salary element; whitespace-only text nodes are skipped.
    for text in tree.xpath('//div[contains(@class, "salary")]/text()'):
        cleaned = text.strip()
        if cleaned:
            return cleaned

    if not description:
        return None

    # Either comma-grouped digits (50,000) or an unbroken run (50000), with
    # optional cents. Accepting the unbroken form keeps "$65000" from being
    # cut to "$650".
    amount = r'(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{1,2})?'

    # Most specific first: a range starts with a single amount, so the range
    # pattern has to be tried before the single-amount pattern to ever match.
    patterns = [
        rf'\${amount}\s*(?:[-–—]|to)\s*\${amount}',  # $XX - $YY
        rf'\${amount}',  # $XX,XXX.XX
        rf'\b{amount}\s*(?:USD|CAD)\b',  # XX,XXX CAD
        # Known currency codes only: with IGNORECASE a generic three-letter
        # class would also match ordinary words followed by a number.
        rf'\b(?:USD|CAD)\s?{amount}',  # CAD XX,XXX
    ]

    for pattern in patterns:
        match = re.search(pattern, description, re.IGNORECASE)
        if match:
            return match.group(0)

    return None

def _fetch_with_retries(url, headers, max_retries, base_delay):
    """GET ``url`` and return the first response with a non-error status.

    A failed attempt (HTTP error status, including 429, or a transport error)
    is followed by a pause of ``attempt + base_delay`` seconds, except after
    the final attempt.

    Raises:
        ValueError: if ``max_retries`` is less than 1.
        requests.exceptions.RequestException: the last failure once every
            attempt has been used.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    last_error = None
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=15)
            # Raises for 4xx/5xx, so a persistent 429 ends in an error instead
            # of the rate-limit page being parsed as if it were a job posting.
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as exc:
            last_error = exc

        if attempt < max_retries - 1:
            time.sleep(attempt + base_delay)  # Linear backoff

    raise last_error


def scrape_job(url, max_retries=3, base_delay=1):
    """Scrape one job posting page into a flat dict.

    Args:
        url: Address of the posting page.
        max_retries: Maximum number of HTTP attempts.
        base_delay: Seconds added to the attempt index for the pause between
            attempts.

    Returns:
        A dict that always has the keys ``url``, ``timestamp`` (UTC, ISO
        format), ``job_title``, ``company``, ``location``, ``posted_time``,
        ``job_description``, ``applicant_count``, ``employer_profile``,
        ``apply_link``, ``salary``, ``seniority_level``, ``employment_type``,
        ``job_function`` and ``industries``. Fields that could not be
        extracted stay ``None``.

    This is best-effort: any exception while fetching or parsing is printed
    and the fields gathered so far are returned rather than raised.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }

    job_data = {
        "url": url,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "job_title": None,
        "company": None,
        "location": None,
        "posted_time": None,
        "job_description": None,
        "applicant_count": None,
        "employer_profile": None,
        "apply_link": None,
        "salary": None,
        "seniority_level": None,
        "employment_type": None,
        "job_function": None,
        "industries": None
    }

    try:
        response = _fetch_with_retries(url, headers, max_retries, base_delay)
        tree = html.fromstring(response.content)

        # Extract core job information
        job_data["job_title"] = safe_extract(tree, '//h1[contains(@class, "top-card-layout__title")]/text()')
        job_data["company"] = safe_extract(tree, '//a[contains(@class, "topcard__org-name-link")]/text()')
        job_data["location"] = safe_extract(tree, '//span[contains(@class, "topcard__flavor--bullet")]/text()')
        job_data["posted_time"] = safe_extract(tree, '//span[contains(@class, "posted-time-ago__text")]/text()')

        # Extract job description
        desc_nodes = tree.xpath('//div[contains(@class, "show-more-less-html__markup")]')
        job_data["job_description"] = " ".join(desc_nodes[0].xpath(".//text()")).strip() if desc_nodes else None

        # Extract salary information. The dedicated salary element is checked
        # even when there is no description; an empty string stands in for it
        # so the regex fallback simply finds nothing.
        job_data["salary"] = extract_salary(tree, job_data["job_description"] or "")

        # Extract additional metadata
        job_data["applicant_count"] = safe_extract(tree, '//span[contains(@class, "num-applicants__caption")]/text()')
        job_data["employer_profile"] = safe_extract(tree, '//a[contains(@class, "topcard__org-name-link")]/@href')
        job_data["apply_link"] = safe_extract(tree, '//a[contains(@class, "apply-button")]/@href')

        # Extract job criteria
        criteria = extract_job_criteria(tree)
        job_data.update({
            "seniority_level": criteria.get("seniority_level"),
            "employment_type": criteria.get("employment_type"),
            "job_function": criteria.get("job_function"),
            "industries": criteria.get("industries")
        })

    except Exception as e:
        # Deliberate catch-all: one bad page must not stop a batch run.
        print(f"Error processing {url}: {str(e)}")

    return job_data

def safe_extract(tree, xpath):
    """Return the first XPath match with surrounding whitespace removed.

    Gives ``None`` when the query matches nothing.
    """
    matches = tree.xpath(xpath)
    if not matches:
        return None
    return matches[0].strip()


In [None]:
# Settings for the scraping run
JOB_URLS = res['job_url'].tolist()
MAX_RETRIES = 10
BASE_DELAY = 1  # seconds


# Fetch every candidate posting in order, echoing each record as it arrives.
results = []
for i, url in enumerate(JOB_URLS):

    print(f"Processing {i+1}/{len(JOB_URLS)}: {url}")
    job_data = scrape_job(url, max_retries=MAX_RETRIES, base_delay=BASE_DELAY)
    results.append(job_data)

    print(json.dumps(job_data, indent=2))

# Write all records to a single JSON file, keeping non-ASCII characters readable.
serialized = json.dumps(results, indent=2, ensure_ascii=False)
with open("linkedin_jobs.json", "w", encoding="utf-8") as f:
    f.write(serialized)

print(f"Successfully scraped {len(results)} jobs")
