In [1]:
import pandas as pd #basic library always need
import requests # web scraper - to grab data from the job postings
from bs4 import BeautifulSoup # parse html -most websites
import re # for pattern matching like finding salary ranges
import time # to not overwhem websites

In [2]:
# Define a function that extracts the fields we want (work type, job type, salary) from the text of a job description.
def extract_job_info(description):
    """Classify a job description and extract any advertised salary.

    Parameters
    ----------
    description : str
        Free text of the job posting. Any non-string value (``None``,
        a pandas ``NaN`` from an empty cell, ...) is treated as an empty
        description instead of raising.

    Returns
    -------
    tuple of str
        ``(work_type, job_type, salary_range, min_salary, max_salary)``

        * ``work_type``    -- 'Remote', 'Hybrid', 'Onsite' or 'Unknown'
        * ``job_type``     -- 'Full-time', 'Part-time' or 'Unknown'
        * ``salary_range`` -- '<min> - <max>' when a range is found, the single
          amount when only one is found, otherwise 'Not specified'
        * ``min_salary`` / ``max_salary`` -- the two ends of the range; both
          equal the single amount (or 'Not specified') when there is no range
    """
    # Missing descriptions (failed scrape, empty CSV cell) must not crash the run.
    if not isinstance(description, str):
        description = ''

    # Normalize to lowercase so every keyword check below is case-insensitive.
    text = description.lower()

    # Work type
    # Note: the initial dataset has a field by this name, but it is all N/A and
    # has no data definition, so it is repurposed here.
    # The first matching category wins (remote > hybrid > onsite).
    if 'remote' in text or 'work from anywhere' in text:
        work_type = 'Remote'
    elif 'hybrid' in text or 'split time' in text:
        work_type = 'Hybrid'
    elif 'onsite' in text or 'on-site' in text or 'in office' in text:
        work_type = 'Onsite'
    else:
        work_type = 'Unknown'

    # Job type
    # Also a pre-existing field that was neither defined nor populated, so it
    # is repurposed as well.
    if 'full-time' in text or 'full time' in text:
        job_type = 'Full-time'
    elif 'part-time' in text or 'part time' in text:
        job_type = 'Part-time'
    else:
        job_type = 'Unknown'

    # Salary extraction (the first truly new field).
    # A dollar amount: "$" + digits, optional ",ddd" groups, optional cents.
    # `\d+` (not `\d{1,3}`) so an un-delimited "$50000" is not cut to "$500".
    amount = r'\$\d+(?:,\d{3})*(?:\.\d{2})?'
    # Optional pay-period suffix after the first amount, e.g. "/yr", "per hour".
    unit = r'(?:\s*(?:/|per)\s*(?:year|yr|hour|hr|month|mo|week|wk))?'
    # The separator is a real alternation: a run of hyphens/dashes OR the word
    # "to". A character class like [-to] would also accept "ot", "t", "o-" ...
    range_pattern = rf'({amount}){unit}\s*(?:[-–—]+|to)\s*({amount})'

    salary_match = re.search(range_pattern, text)
    if salary_match:
        min_salary = salary_match.group(1)
        max_salary = salary_match.group(2)
        salary_range = f"{min_salary} - {max_salary}"
    else:
        # No range: fall back to the first lone dollar amount, if any.
        single_salary = re.search(amount, text)
        salary_range = single_salary.group(0) if single_salary else 'Not specified'
        min_salary = max_salary = salary_range

    return work_type, job_type, salary_range, min_salary, max_salary

In [3]:
# PART 2: SCRAPER FUNCTION TO GET TEXT FROM URL
def fetch_description(url, timeout=10):
    """Download a job posting and return its visible text as one string.

    Parameters
    ----------
    url : str
        Address of the job posting page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    str
        Text of the page's <p>/<div>/<span> content joined with single
        spaces, or "" if the page could not be fetched or parsed (the error
        is printed, never raised, so one dead link cannot stop a long run).
    """
    text_tags = ['p', 'div', 'span']
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        res = requests.get(url, headers=headers, timeout=timeout)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        # Keep only the outermost matching elements. get_text() already
        # includes the text of everything nested inside an element, so also
        # collecting the nested <p>/<div>/<span> tags would repeat the same
        # text once per nesting level.
        outermost = [
            element for element in soup.find_all(text_tags)
            if element.find_parent(text_tags) is None
        ]
        return ' '.join(
            element.get_text(separator=' ', strip=True) for element in outermost
        )
    except Exception as e:
        # Deliberately broad: postings are often taken down, and any failure
        # here should be reported and skipped rather than abort the whole run.
        print(f"Error fetching description for {url}: {e}")
        return ""

In [4]:
# PART 3: MAIN PROCESS
# Heads-up: there are 618 URLs and each one is followed by a pause, so this
# cell takes a while -- start it and come back later.
# Job postings are often closed and removed after a set time frame, so expect
# some links to return nothing (those rows end up with 'Unknown' values).
# Load URLs from CSV (column named 'link')
df = pd.read_csv('C:/Users/holle/OneDrive/Documents/Data Sets (Public)/LinkedIn Pull of Analytics Jobs.csv')  # update to your file path if needed

# Names of the extracted fields, in the order extract_job_info returns them.
field_names = ('work_type', 'job_type', 'salary_range', 'min_salary', 'max_salary')

results = []

for _, posting in df.iterrows():
    url = posting['link']  # the CSV column is called 'link', not 'url'
    print(f"Processing: {url}")
    jd_text = fetch_description(url)

    # An empty description means the page was missing or unreadable; the row
    # is still recorded so the output lines up with the input list.
    if jd_text.strip() == "":
        print(f"Warning: No content found for {url}")

    extracted = extract_job_info(jd_text)

    entry = {'link': url}
    entry.update(zip(field_names, extracted))
    results.append(entry)

    time.sleep(1)  # pause between requests: be nice to the servers and to this machine


Processing: https://www.linkedin.com/jobs/view/data-analyst-at-meta-4186238974
Processing: https://www.linkedin.com/jobs/view/data-analyst-at-meta-4186241553
Processing: https://www.linkedin.com/jobs/view/data-analyst-at-meta-4186236994
Processing: https://www.linkedin.com/jobs/view/data-analyst-at-meta-4186237989
Processing: https://www.linkedin.com/jobs/view/data-analyst-ii-at-pinterest-4193349988
Processing: https://www.linkedin.com/jobs/view/data-analyst-at-fanduel-4206047782
Processing: https://www.linkedin.com/jobs/view/data-analyst-production-finance-operations-innovation-at-netflix-4205626465
Processing: https://www.linkedin.com/jobs/view/data-analyst-marketing-at-fanduel-4138322262
Processing: https://www.linkedin.com/jobs/view/data-analyst-at-sbh-fashion-4168179268
Processing: https://www.linkedin.com/jobs/view/data-analyst-ii-at-pinterest-4193356222
Processing: https://www.linkedin.com/jobs/view/data-analyst-ii-at-pinterest-4193351793
Processing: https://www.linkedin.com/job

In [9]:
# PART 4: OUTPUT TO FILE
# Turn the collected per-link records into a table and save it next to the
# source data set (update the path below to match your machine).
output_path = r'C:\Users\holle\OneDrive\Documents\Data Sets (Public)\LinkedIn Pull of Analytics Jobs+added_logic+fields.csv'
output_df = pd.DataFrame(results)
output_df.to_csv(output_path, index=False)  # index=False: do not write the row numbers
print("Done! Results saved to specified file path.")

Done! Results saved to specified file path.
