In [58]:
import json
import time
from pathlib import Path
from random import randint

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

## Configuración

In [59]:
# Build the Chrome options used by the Selenium session
chrome_options = Options()

# Arguments are applied in order:
#   1. a desktop User-Agent so the site sees what looks like a regular browser
#   2. sandbox disabled
#   3. /dev/shm usage disabled
# "--headless" is intentionally not in the list, so the browser window stays visible;
# add it here to run without a graphical interface.
for argument in (
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    chrome_options.add_argument(argument)

## Inicialización

In [60]:
# Start Chrome through a driver binary resolved by webdriver_manager
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# Open the first page of the job listing
START_URL = "https://cl.computrabajo.com/trabajo-de-desarrollador"
driver.get(START_URL)

# Fixed pause that gives the page time to finish loading
time.sleep(5)

## Extracción de enlaces

In [61]:
def extract_links():
    """Collect the job-offer URLs listed in the current page's JSON-LD metadata.

    Reads the first ``<script type="application/ld+json">`` element of the page
    currently loaded in the module-level ``driver``, parses its content as JSON,
    and gathers the ``url`` of every entry of each ``itemListElement`` array found
    under the top-level ``@graph`` key.

    Returns:
        list: The URLs in the order they appear in the JSON; an empty list when
        the parsed JSON has no ``@graph`` key.
    """
    raw_json = driver.find_element(
        By.CSS_SELECTOR, "script[type='application/ld+json']"
    ).get_attribute("innerHTML")
    payload = json.loads(raw_json)

    # Without an '@graph' section there is nothing to walk through
    if '@graph' not in payload:
        return []

    # Keep only graph nodes that carry an item list, and only entries with a URL
    return [
        entry['url']
        for node in payload['@graph']
        if 'itemListElement' in node
        for entry in node['itemListElement']
        if 'url' in entry
    ]

In [62]:
# Function to navigate to the next page
def go_to_next_page():
    """Load the next results page in the module-level ``driver``.

    The target URL is read from the ``data-path`` attribute of the
    ``<span title="Siguiente">`` pagination element.

    Returns:
        bool: True when the next page was requested; False when the element has
        no ``data-path`` value or when any exception is raised along the way
        (the exception is printed, not propagated).
    """
    try:
        target_url = driver.find_element(
            By.XPATH, '//span[@title="Siguiente"]'
        ).get_attribute('data-path')

        # An empty/missing data-path means there is nowhere left to go
        if not target_url:
            print("No more pages.")
            return False

        driver.get(target_url)
        time.sleep(3)  # Fixed pause so the new page can load
    except Exception as e:
        print(f"Error while navigating to the next page: {e}")
        return False
    return True

In [63]:
# Maximum number of result pages to walk through
MAX_PAGES = 5

all_links = []

for _ in range(MAX_PAGES):
    all_links.extend(extract_links())
    # go_to_next_page() returns False when it could not move forward. Stopping
    # here avoids scraping the same page again and duplicating its links.
    if not go_to_next_page():
        break

In [64]:
# Print the complete list of collected offer URLs for a quick visual check
print(all_links)

['https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-desarrollador-fullstack-controller-java-springboot-plsql-oracle-aws-kubernetes-en-santiago-providencia-983B6DAEB2C2649361373E686DCF3405', 'https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-desarrollador-web-en-santiago-centro-089EAD6B492D1F4D61373E686DCF3405', 'https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-especialista-desarrollador-proyecto-en-santiago-centro-A7B6C4043CD48E2861373E686DCF3405', 'https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-programador-mobile-androidjava-en-santiago-centro-8FD3EA2B233A51C561373E686DCF3405', 'https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-desarrollador-net-senior-en-santiago-centro-0E25ECDFE5C311D361373E686DCF3405', 'https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-desarrollador-frontend-react-native-home-office-en-santiago-lo-barnechea-CF609DE8D789E17961373E686DCF3405', 'https://cl.c

## Extracción de datos

In [65]:
def _meta_content(container_xpath, meta_xpath=None, timeout=10, default="N/A"):
    """Return the ``content`` attribute of a microdata element, or ``default``.

    Waits up to ``timeout`` seconds for the element matching ``container_xpath``
    to be present in the page loaded in the module-level ``driver``. When
    ``meta_xpath`` is given, it is resolved relative to that element; otherwise
    the element itself is read. A missing element yields ``default`` instead of
    raising.
    """
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, container_xpath))
        )
        if meta_xpath is not None:
            element = element.find_element(By.XPATH, meta_xpath)
        return element.get_attribute("content")
    except (TimeoutException, NoSuchElementException):
        # Only "element not there" is treated as missing data; anything else
        # (including a keyboard interrupt) propagates to the caller.
        return default


def extract_data(link):
    """Scrape the microdata fields of a single job-offer page.

    Loads ``link`` in the module-level ``driver`` and reads the ``content``
    attribute of the ``<meta>`` tags found under the ``baseSalary``,
    ``hiringOrganization`` and ``jobLocation`` microdata containers, plus the
    title ``<meta>`` located by absolute path.

    Args:
        link: URL of the job offer to load.

    Returns:
        dict: Keys ``"Salario"``, ``"Empresa"``, ``"Comuna"``, ``"Región"`` and
        ``"Cargo"``. A field whose element is not found within the wait timeout
        is set to ``"N/A"``.
    """
    driver.get(link)

    # Salary
    salary_value = _meta_content(
        '//span[@itemprop="baseSalary"]', './/meta[@itemprop="value"]'
    )

    # Company
    company_name = _meta_content(
        '//span[@itemprop="hiringOrganization"]', './/meta[@itemprop="name"]'
    )

    # Commune and region: in schema.org terms addressLocality is the locality
    # and addressRegion is the region. The original assignment had the two
    # swapped.
    address_xpath = './/span[@itemprop="address"]'
    commune = _meta_content(
        '//span[@itemprop="jobLocation"]',
        f'{address_xpath}//meta[@itemprop="addressLocality"]',
    )
    region = _meta_content(
        '//span[@itemprop="jobLocation"]',
        f'{address_xpath}//meta[@itemprop="addressRegion"]',
    )

    # Job title (the element itself carries the content attribute)
    job_title = _meta_content('/html/body/main/span/meta[1]')

    return {
        "Salario": salary_value,
        "Empresa": company_name,
        "Comuna": commune,
        "Región": region,
        "Cargo": job_title,
    }


In [66]:
# Create the DataFrame
# Scrape every collected offer and build the DataFrame
data = []
failed_links = []  # offers that could not be scraped, kept for a later retry

for link in all_links:
    try:
        data.append(extract_data(link))
        print(link)
    except Exception as e:
        # A single broken page must not abort the whole run and lose the rows
        # gathered so far; report it and move on.
        failed_links.append(link)
        print(f"Failed to extract {link}: {e}")
    # Random pause of roughly 1.03 to 2.03 seconds between requests
    time.sleep(randint(1027852,2027852)/1000000)

df = pd.DataFrame(data)

https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-desarrollador-fullstack-controller-java-springboot-plsql-oracle-aws-kubernetes-en-santiago-providencia-983B6DAEB2C2649361373E686DCF3405
https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-desarrollador-web-en-santiago-centro-089EAD6B492D1F4D61373E686DCF3405
https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-especialista-desarrollador-proyecto-en-santiago-centro-A7B6C4043CD48E2861373E686DCF3405
https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-programador-mobile-androidjava-en-santiago-centro-8FD3EA2B233A51C561373E686DCF3405
https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-desarrollador-net-senior-en-santiago-centro-0E25ECDFE5C311D361373E686DCF3405
https://cl.computrabajo.com/ofertas-de-trabajo/oferta-de-trabajo-de-desarrollador-frontend-react-native-home-office-en-santiago-lo-barnechea-CF609DE8D789E17961373E686DCF3405
https://cl.computrabajo.com/ofer

## Resultados

In [None]:
# Print the DataFrame built from the scraped offers (plain-text rendering)
print(df)

In [67]:
# Save the results to a CSV file
output_path = Path("../data/computrabajo/computrabajo.csv")

try:
    # to_csv does not create missing directories, so make sure the target exists
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
finally:
    # Close the browser even if writing the file fails, so no Chrome process
    # is left running.
    driver.quit()