In [1]:
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Scrape the EU Fleet Register search results ("EU" countries, "All Vessels")
# page by page and collect every table row into the DataFrame `df`.
#
# Names left in the notebook namespace: `driver`, `wait`, `column_names`,
# `data` (list of row lists) and `df`.

PAGE_TIMEOUT_S = 10  # seconds to wait for each element / page change
ROWS_XPATH = "//table/tbody/tr"


def _first_row_text(drv):
    """Return the text of the first result row, or None while it is absent or being re-rendered."""
    try:
        return drv.find_element(By.XPATH, ROWS_XPATH).text
    except (NoSuchElementException, StaleElementReferenceException):
        return None


driver = webdriver.Chrome()
try:
    driver.get("https://webgate.ec.europa.eu/fleet-europa/search_en")

    wait = WebDriverWait(driver, PAGE_TIMEOUT_S)

    # Select the "EU" option
    eu_option = wait.until(EC.element_to_be_clickable((By.XPATH, "//label[@for='countryType1']")))
    eu_option.click()

    # Select "All Vessels"
    all_vessels_option = wait.until(EC.element_to_be_clickable((By.XPATH, "//label[@for='period1']")))
    all_vessels_option.click()

    # Click the "Search" button
    btn_search = wait.until(EC.element_to_be_clickable((By.XPATH, "//button/span[text()='Search']")))
    btn_search.click()

    # Open the page-size dropdown as soon as it can be clicked
    page_size_selector = wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, "select2-selection--single"))
    )
    page_size_selector.click()

    # Wait for the option list, then pick the entry whose text contains "100"
    wait.until(EC.presence_of_element_located((By.XPATH, "//ul[@class='select2-results__options']")))
    page_size_100 = wait.until(EC.element_to_be_clickable((By.XPATH, "//li[contains(text(),'100')]")))
    page_size_100.click()

    # Wait for the results table header to be present
    # NOTE(review): nothing here waits for the table to re-render with the new
    # page size - confirm the first page is really read with 100 rows.
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "table-header-container")))

    # Column names come from the header spans
    column_headers = driver.find_elements(By.XPATH, "//div[@class='table-header-container']/span")
    column_names = [header.text.strip() for header in column_headers]

    data = []

    while True:
        # Collect the rows of the page currently shown
        rows = driver.find_elements(By.XPATH, ROWS_XPATH)
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            data.append([cell.text.strip() for cell in cells])

        # Locate the "Next" link through its aria-label
        try:
            next_button = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='Go to next page']"))
            )
        except TimeoutException:
            break  # no usable "Next" control: treat as the last page

        # Bring the link into view before interacting with it
        driver.execute_script("arguments[0].scrollIntoView(true);", next_button)

        # get_attribute returns None when the element has no class attribute
        if "disabled" in (next_button.get_attribute("class") or ""):
            break  # "Next" is disabled: last page reached

        # Remember what the current page looks like so the page change can be
        # detected; the old <table> stays in the DOM, so its mere presence
        # says nothing about whether the new rows have arrived.
        previous_first_row = _first_row_text(driver)
        next_button.click()
        try:
            wait.until(lambda drv: _first_row_text(drv) not in (None, previous_first_row))
        except TimeoutException:
            # Keep what was collected, but make the truncation visible.
            print(
                f"Stopped after {len(data)} rows: the next page did not load "
                f"within {PAGE_TIMEOUT_S} s."
            )
            break
finally:
    # Always release the browser, even when the scrape fails half-way.
    driver.quit()

# Build the DataFrame from every row collected
df = pd.DataFrame(data, columns=column_names)

In [4]:
# Summarise the scraped frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185354 entries, 0 to 185353
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Flag                 185354 non-null  object
 1   Vessel Type          185354 non-null  object
 2   CFR                  185354 non-null  object
 3   Event Code           185354 non-null  object
 4   Event Date           185354 non-null  object
 5   External Marking     185354 non-null  object
 6   Vessel Name          185354 non-null  object
 7   Ref. Tonnage         185354 non-null  object
 8   Ref. Length          185354 non-null  object
 9   Main Power           185354 non-null  object
 10  IRCS                 185354 non-null  object
 11  UVI                  185354 non-null  object
 12  Highest Error Level  185354 non-null  object
 13  Reception Timestamp  185354 non-null  object
dtypes: object(14)
memory usage: 19.8+ MB


In [5]:
# Preview the first rows of the scraped frame.
df.head()

Unnamed: 0,Flag,Vessel Type,CFR,Event Code,Event Date,External Marking,Vessel Name,Ref. Tonnage,Ref. Length,Main Power,IRCS,UVI,Highest Error Level,Reception Timestamp
0,BEL,FX,BEL000021964,DES,27/02/2023,O.2,MIKE MICHEL JR,53.0,21.39,213.0,OPAB,8523448.0,ERR,01/03/2023 14:47:06
1,BEL,TU,BEL000041982,RET,15/11/1996,BOU 4,ASTRID,15.67,13.85,79.0,OPAD,,,09/09/2019 12:50:19
2,NLD,TU,BEL000041982,MOD,07/06/2024,BR-9,BOURIC,14.0,14.36,83.0,PA2442,,,07/06/2024 23:18:25
3,BEL,FX,BEL000061930,DES,30/08/1996,BOU 6,ANJA,29.79,16.62,103.0,OPAF,,,09/09/2019 12:50:19
4,BEL,TU,BEL000071985,EXP,24/04/2018,BOU.7,DE ENIGE ZOON,57.0,19.1,219.0,OPAG,,MAJ,09/09/2019 12:50:19


In [6]:
# Persist the scraped table as UTF-8 CSV, without the index column.
output_path = Path("./data/resultados_fleet.csv")
# Create the target folder first: to_csv fails if the directory is missing.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False, encoding='utf-8')

In [7]:
# NOTE(review): leftover inspection cell. The scraping cell calls driver.quit(),
# so this only displays the repr of the WebDriver object; consider deleting it.
driver

<selenium.webdriver.chrome.webdriver.WebDriver (session="2069b2060e15c48ebcb41db9af2d08d8")>