Web Scraping for Job Listings
Objective:
In this exercise, you will use Web Scraping techniques to collect data from a job listing website, such
as Wuzzuf (or any similar job portal), and extract the following information:
1. Job Title
2. Company Name
3. Location
4. Job Type (Full-Time or Remote)
5. Required Experience
6. Job Description
7. Requirements

Goal:
Write a Python script that scrapes this data and stores it in a CSV or Excel file, where each record
contains this information for each job listing you extract.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
!pip install selenium
!apt-get update
!apt-get install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Chrome launch flags, applied in this order:
#   --headless               run without a visible browser window
#   --no-sandbox             turn off Chrome's sandbox
#   --disable-dev-shm-usage  presumably needed on hosts with a small /dev/shm
#                            (e.g. containers) -- TODO confirm for this runtime
chrome_options = Options()
for _chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(_chrome_flag)

Collecting selenium
  Downloading selenium-4.27.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.27.0-py3-none-any.whl (9.7 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.7/9.7 MB 51.8 MB/s eta 0:00:00
Downloading trio-0.27.0-py3-none-any.whl (481 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 481.7/481.7 kB 27.1 MB/s

In [6]:
import csv
from urllib.parse import urljoin

# Scrape the Wuzzuf search results for "data scientist", visit every job page
# found there, and export one row per job to detailed_jobs_data.csv.

# Header row of the exported CSV; the order must match the rows appended to
# detailed_jobs_data below.
CSV_COLUMNS = [
    "job_title",
    "company_name",
    "location",
    "work_time",
    "work_type",
    "how_long_ago",
    "description",
    "requirements",
    "experience_needed",
    "career_level",
    "education_level",
    "salary",
]


def _text_or_na(node):
    """Return the stripped text of a BeautifulSoup node, or "N/A" if it is None."""
    return node.text.strip() if node is not None else "N/A"


url = "https://wuzzuf.net/search/jobs/?q=data%20scientist%20&a=hpb"
jobs_data = []
# Initialised up front so the export below still works when no job card matched.
detailed_jobs_data = []

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)
    time.sleep(3)  # fixed pause after loading; presumably lets the page finish rendering
    soup = BeautifulSoup(driver.page_source, "html.parser")
    # NOTE(review): the css-* class names look auto-generated by the site and
    # may change between deployments -- verify them if nothing is found.
    job_elements = soup.find_all("div", class_="css-1gatmva")

    # Pass 1: collect the summary fields shown on each search-result card.
    for job_element in job_elements:
        # An empty title is reported as "N/A", same as a missing one.
        job_title = _text_or_na(job_element.find("h2")) or "N/A"
        company_name = _text_or_na(job_element.find("a", class_="css-17s97q8"))
        location = _text_or_na(job_element.find("span", class_="css-5wys0k"))

        link_node = job_element.find("a", class_="css-o171kl")
        href = link_node.get("href") if link_node is not None else None
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the search URL.
        link = urljoin(url, href) if href else "N/A"

        work_time = _text_or_na(job_element.find("span", class_="css-1ve4b75"))
        # online, on site or hybrid
        work_type = _text_or_na(job_element.find("span", class_="css-o1vzmt"))
        how_long_ago = _text_or_na(job_element.find("div", class_="css-do6t5g"))

        jobs_data.append([job_title, company_name, location, link, work_time, work_type, how_long_ago])

    # Pass 2: open every job page and add the detail fields.
    for job in jobs_data:
        job_title, company_name, location, job_link, work_time, work_type, how_long_ago = job
        if job_link == "N/A":
            print("failure to open link: ", "no link found for", job_title)
            continue
        try:
            driver.get(job_link)
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            description = _text_or_na(soup.find("div", class_="css-1uobp1k"))
            requirements = _text_or_na(soup.find("div", class_="css-1t5f0fr"))

            # The first four matching spans are read positionally as
            # experience, career level, education level and salary; missing
            # trailing entries become "N/A".
            details = [el.text.strip() for el in soup.find_all("span", class_="css-4xky9y")]
            details += ["N/A"] * (4 - len(details))
            experience_needed, career_level, education_level, salary = details[:4]

            detailed_jobs_data.append([job_title, company_name, location, work_time, work_type, how_long_ago, description, requirements, experience_needed, career_level, education_level, salary])
        except Exception as e:
            # Best effort: one broken job page must not abort the whole run.
            print("failure to open link: ", e)
            continue
finally:
    # Always shut the browser down so re-running the cell does not leak
    # Chrome processes.
    driver.quit()

df = pd.DataFrame(detailed_jobs_data, columns=CSV_COLUMNS)
df.to_csv('detailed_jobs_data.csv', index=False)
