In [23]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time


In [24]:
# Build the Chrome options: no visible window, container-friendly flags and a
# desktop Chrome user-agent string.
options = Options()
for chrome_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

# Single shared browser session used by every scraping helper below.
driver = webdriver.Chrome(options=options)

# Script registered for every new document: makes `navigator.webdriver` read as
# undefined instead of the value set by the automation driver.
hide_webdriver_js = "Object.defineProperty(navigator, 'webdriver', { get: () => undefined })"
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": hide_webdriver_js})


{'identifier': '2'}

In [25]:
# Site root; every category path below is appended to it.
base_url = "https://himalayas.app"

# Listing pages to scrape, one per technology category.
paths = [
    f"/jobs/countries/oman/{category_slug}"
    for category_slug in ("aws", "azure", "google-cloud", "cloud-devops")
]


In [None]:
def get_jobs_from_category(category_path):
    """Collect the job cards of one category, following pagination links.

    Starting at ``base_url + category_path``, each listing page is loaded with
    the module-level ``driver``. Every job card found on the page is parsed,
    then the "next" link of the pagination nav is followed until the link is
    missing, has no ``href``, or points at a page that was already visited.

    Args:
        category_path: Path appended to ``base_url`` to form the first URL.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``date_posted``,
        ``company`` and ``location`` (``location`` is always ``"Remote"``).
        Cards that fail to parse are reported on stdout and skipped.
    """
    card_selector = "article.flex.flex-shrink-0.cursor-pointer"
    title_selector = "a.text-xl.font-medium.text-gray-900"
    date_selector = "time.text-gray-600"
    company_selector = "a.inline-flex.items-center.font-medium.text-gray-900"
    next_selector = "nav[aria-label='pagination'] a.flex.flex-row-reverse"

    current_url = base_url + category_path
    all_job_data = []
    visited_pages = set()

    while True:
        # Guard against pagination loops.
        if current_url in visited_pages:
            print(f"Déjà visité : {current_url}")
            break
        visited_pages.add(current_url)

        driver.get(current_url)
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector))
            )
        except Exception:
            # `Exception` rather than a bare `except:` so that KeyboardInterrupt
            # and SystemExit still propagate and the scrape can be interrupted.
            print("Timeout, aucune offre trouvée.")
            break

        jobs_on_page = driver.find_elements(By.CSS_SELECTOR, card_selector)
        print(f"{len(jobs_on_page)} offres trouvées sur la page")

        for job_block in jobs_on_page:
            try:
                title_elem = job_block.find_element(By.CSS_SELECTOR, title_selector)
                title = title_elem.text.strip()
                url = title_elem.get_attribute("href")

                # Read `textContent` instead of `.text` for the date element.
                date_elem = job_block.find_element(By.CSS_SELECTOR, date_selector)
                date_posted = date_elem.get_attribute("textContent").strip()

                company_elem = job_block.find_element(By.CSS_SELECTOR, company_selector)
                company = company_elem.text.strip()

                all_job_data.append({
                    "title": title,
                    "url": url,
                    "date_posted": date_posted,
                    "company": company,
                    "location": "Remote",
                })
            except Exception as e:
                # Best effort: one malformed card must not abort the page.
                print(f"Erreur lors de l'extraction d'une offre : {e}")
                continue

        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, next_selector)
            next_href = next_btn.get_attribute("href")
            if not next_href or next_href in visited_pages:
                print("Fin de pagination")
                break
            current_url = next_href
            time.sleep(1)  # short pause between listing pages
        except Exception:
            # Same reasoning as above: do not swallow kernel interrupts.
            print("Pas de bouton 'Next'")
            break

    return all_job_data


In [27]:
def get_job_description(url):
    """Return the stripped text of the description article found at ``url``.

    The page is loaded with the module-level ``driver`` and the description
    article is awaited for up to 10 seconds. Any failure is reported on stdout
    and an empty string is returned instead.
    """
    description_selector = "article.mb-8.md\\:mb-12.md\\:text-lg"
    description_text = ""
    try:
        driver.get(url)
        locator = (By.CSS_SELECTOR, description_selector)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))
        description_node = driver.find_element(By.CSS_SELECTOR, description_selector)
        description_text = description_node.text.strip()
    except Exception as err:
        print(f"Erreur de description ({url}) : {err}")
    return description_text


In [28]:
# One row per scraped job, in the column order used by the CSV export:
# title, company, date posted, location, url, full description.
all_results = []
row_fields = ("title", "company", "date_posted", "location", "url")

for path in paths:
    print(f"\n Scraping catégorie : {path}")
    jobs = get_jobs_from_category(path)
    print(f" {len(jobs)} offres extraites")

    for job in jobs:
        # Each description needs its own page load, so this is the slow part.
        description = get_job_description(job["url"])
        all_results.append([*(job[field] for field in row_fields), description])
        print(f" {job['title']} - {job['company']}")



 Scraping catégorie : /jobs/countries/oman/aws
20 offres trouvées sur la page
20 offres trouvées sur la page
20 offres trouvées sur la page
20 offres trouvées sur la page
20 offres trouvées sur la page
Fin de pagination
 100 offres extraites
 Data Architect/Sr Data Architect - Innovative Solutions
 Senior Data Engineer - vidIQ
 Principal, Cloud Operations infosec engineer - Syniti
 Senior DevOps Engineer - S-PRO
 Senior DevOps Engineer - Moovx
 AWS Engineer with Fortinet - teamative
 Senior Software Engineer, Backend Systems & Infrastructure - Upstart 13
 Senior Cloud Architect, ML/AI (Ukraine) - DoiT International
 Senior DevOps Engineer - CME
 MuleSoft Developer - IO Connect Services
 Senior Backend Engineer - Node.js/Typescript (m/f/x) - DemoUp Cliplister
 Senior Cloud Architect, ML/AI (Poland) - DoiT International
 DevOps Engineer (AWS, Terraform, CI/CD) - iLogos Game Studios
 Senior Cloud Architect, ML/AI (Romania) - DoiT International
 Lead Cloud DevOps Engineer - Halcyon
 Senio

In [30]:
# Scraping is finished: close the browser session before exporting.
driver.quit()

csv_header = ["Title", "Company", "Date Posted", "Location", "Link", "Full Description"]

# newline="" lets the csv module control line endings itself.
with open("jobs_himalayas.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(csv_header)
    writer.writerows(all_results)

print(f"\nExtraction terminée : {len(all_results)} offres sauvegardées dans jobs_himalayas.csv")



Extraction terminée : 358 offres sauvegardées dans jobs_himalayas.csv
