# LinkedIn Job Scraping

### Version 1.0
#### Created by Long

### Overview
This notebook is designed to scrape tech-related job postings from LinkedIn. The goal is to extract key information such as job titles, company names, locations, job links, posting dates, and Easy Apply availability. The data is then saved into a CSV file for further analysis.

### How it Works
- **Data Source**: The HTML content of LinkedIn job search result pages is fetched using the `requests` library.
- **Web Scraping**: `BeautifulSoup` is used to parse the HTML and extract job details like titles, companies, and locations.
- **Data Output**: The extracted data is stored in a CSV file, making it easy to analyze or manipulate for further use.
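- **Customization**: The search URL is generated from the keywords, location, distance, and posting period entered at the prompts. Alternatively, filters can be applied manually on LinkedIn and the resulting URL pasted into the notebook before fetching the data, to focus on specific job criteria like location, job type, or posting date.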

### Future Enhancements
In future iterations, this notebook can be expanded to:
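- Extend the automatic generation of the LinkedIn job search URL (currently driven by user keyword, location, distance, and posting-period input) to cover more locations and filters.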
- Extract more detailed data from the job postings, such as experience level, required skills, and qualifications.



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Take User input and generate the LinkedIn URL

def generate_linkedin_url(keywords=None, location=None, distance=None, time_choice=None):
    """Build a LinkedIn job-search URL from the given search criteria.

    Any argument left as ``None`` is requested interactively with ``input()``,
    so calling the function with no arguments prompts for every value.

    Args:
        keywords: Free-text job keywords, e.g. ``"Cyber Security"``.
        location: ``"Calgary"`` or ``"Edmonton"`` (matched case-insensitively).
        distance: Search radius; passed through as the ``distance`` parameter.
        time_choice: ``"1"`` (last 24 hours), ``"2"`` (last week) or
            ``"3"`` (last month).

    Returns:
        The search URL as a string, or ``None`` (after printing a message)
        when the location or the time period is not one of the known options.
    """
    from urllib.parse import quote, urlencode

    # Map time choice to f_TPR values
    time_mapping = {
        '1': 'r86400',   # Last 24 hours
        '2': 'r604800',  # Last week
        '3': 'r2592000',  # Last month
    }

    # GeoId based on location
    location_mapping = {
        'Calgary': '102199904',
        'Edmonton': '106535873',
    }

    # Prompt only for the values the caller did not supply
    if keywords is None:
        keywords = input("Enter job keywords (e.g., 'Cyber Security'): ")
    if location is None:
        location = input("Enter location (Calgary or Edmonton): ")
    if distance is None:
        distance = input("Enter distance in miles (e.g., 25): ")
    if time_choice is None:
        print("Select time period for job posting:")
        print("1. Last 24 hours")
        print("2. Last week")
        print("3. Last month")
        time_choice = input("Enter 1, 2, or 3: ")

    # Normalise the same way for prompted and supplied values
    keywords = str(keywords).strip()
    location = str(location).strip().capitalize()
    distance = str(distance).strip()
    time_choice = str(time_choice).strip()

    # Validate user input
    if location not in location_mapping:
        print(f"Invalid location: {location}")
        return None

    if time_choice not in time_mapping:
        print(f"Invalid time period: {time_choice}")
        return None

    # Percent-encode every value so characters such as '&', '+' or '#' in the
    # keywords cannot break the query string; quote() keeps spaces as %20.
    query = urlencode(
        {
            'keywords': keywords,
            'location': location,
            'geoId': location_mapping[location],
            'distance': distance,
            'f_TPR': time_mapping[time_choice],
            'position': 1,
            'pageNum': 0,
        },
        quote_via=quote,
    )

    return f"https://www.linkedin.com/jobs/search?{query}"


# Call the function
PAGE_URL = generate_linkedin_url()

# Report the result; the function returns None when the input was rejected,
# so avoid announcing a "generated" URL in that case.
if PAGE_URL is None:
    print("No LinkedIn URL was generated. Re-run this cell with a valid location and time period.")
else:
    print(f"LinkedIn URL generated and saved: {PAGE_URL}")

Enter job keywords (e.g., 'Cyber Security'): Data Analyst
Enter location (Calgary or Edmonton): Edmonton
Enter distance in miles (e.g., 25): 25
Select time period for job posting:
1. Last 24 hours
2. Last week
3. Last month
Enter 1, 2, or 3: 2
LinkedIn URL generated and saved: https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=Edmonton&geoId=106535873&distance=25&f_TPR=r604800&position=1&pageNum=0


In [None]:
# Optional: to override the generated URL, apply filters manually on LinkedIn and paste the resulting link below
# PAGE_URL = 'https://www.linkedin.com/jobs/search/?currentJobId=4030602244&f_PP=102199904%2C106535873&f_TPR=r2592000&geoId=103564821&keywords=cybersecurity&origin=JOB_SEARCH_PAGE_JOB_FILTER&refresh=true&sortBy=R'

In [None]:
# Extract the html of the page
def get_html_of(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the call indefinitely.

    Returns:
        The response body decoded with the default codec of ``bytes.decode``
        (UTF-8).

    Raises:
        ValueError: If *url* is empty or ``None``.
        SystemExit: With code 1 when the server answers with a status other
            than 200.
    """
    if not url:
        raise ValueError("No URL to fetch; generate or set PAGE_URL first.")

    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        print(f'HTTP status code of {resp.status_code} returned, but 200 was expected.')
        print("Please delete the 'linkedin_search_results.html' file and try again after waiting for some time.")
        print('Exiting...')
        # Raise directly instead of calling the interactive `exit` helper,
        # which is meant for the interpreter shell rather than for programs.
        raise SystemExit(1)

    return resp.content.decode()

# Download the search-results page for the URL chosen above
html_content = get_html_of(PAGE_URL)

# Keep a copy of the raw page on disk so it can be inspected later
with open("linkedin_search_results.html", mode="w", encoding="utf-8") as html_file:
    html_file.write(html_content)

print("HTML content saved to 'linkedin_search_results.html'")

HTML content saved to 'linkedin_search_results.html'


In [None]:
# Functions to extract job details
def _text_or_default(parent, tag, css_class, default="Not listed"):
    """Return the stripped text of the first matching child of *parent*, or *default* if there is none."""
    node = parent.find(tag, class_=css_class)
    return node.text.strip() if node is not None else default


def extract_jobs(html):
    """Parse a job-search results page into a list of job records.

    Args:
        html: HTML source of the results page.

    Returns:
        A list with one dict per ``div.base-search-card__info`` element, with
        the keys ``'Job Title'``, ``'Company'``, ``'Location'``,
        ``'Job Link'``, ``'Date Posted'`` and ``'Easy Apply'``. A field whose
        element is missing from a card is reported as ``"Not listed"``
        (``"No"`` for Easy Apply) instead of aborting the whole extraction.
    """
    soup = BeautifulSoup(html, 'html.parser')
    jobs = []

    for job_card in soup.find_all('div', class_='base-search-card__info'):
        # Extract job link.
        # NOTE(review): this takes the first <a> inside the info block;
        # confirm it is the posting link and not some other nested link.
        link_tag = job_card.find('a')
        job_link = link_tag.get('href', "Not listed") if link_tag is not None else "Not listed"

        # Add extracted data to list
        jobs.append({
            'Job Title': _text_or_default(job_card, 'h3', 'base-search-card__title'),
            'Company': _text_or_default(job_card, 'h4', 'base-search-card__subtitle'),
            'Location': _text_or_default(job_card, 'span', 'job-search-card__location'),
            'Job Link': job_link,
            'Date Posted': _text_or_default(job_card, 'time', 'job-search-card__listdate'),
            # Easy Apply cards carry a dedicated span; its text is kept as the value
            'Easy Apply': _text_or_default(job_card, 'span', 'apply-button--easy', default="No"),
        })

    return jobs

# Get HTML content.
# The page is normally already in `html_content` from the cell that saved
# 'linkedin_search_results.html'; request it again only when that variable is
# missing, so the same search is not sent to LinkedIn twice and the parsed
# page matches the saved copy.
if "html_content" not in globals():
    html_content = get_html_of(PAGE_URL)

In [None]:
# Parse the downloaded page into one record per job card
job_listings = extract_jobs(html_content)

# Tabulate the records, one row per job
df = pd.DataFrame(data=job_listings)

# Write the table to disk without the index column
df.to_csv(path_or_buf='linkedin_job_listings.csv', index=False)

print("Job listings have been saved to 'linkedin_job_listings.csv'")

Job listings have been saved to 'linkedin_job_listings.csv'
