# Web scraper for the Registro Público de Concesiones

Traditional web scraping with Beautiful Soup is of limited use on ASPX websites that expect user input. This web scraper is built with Selenium instead, so that it can submit the search form before scraping each results page.

The following links are helpful: <br>
Data Source: https://avancedigital.mineco.gob.es/espectro/Paginas/registro-publico-concesiones.aspx <br>
Selenium: https://selenium-python.readthedocs.io/locating-elements.html <br>
Chromedriver: https://chromedriver.chromium.org/home (download necessary)<br>

In [None]:
#pip install requests (for making HTTP requests)
#pip install URLLib3 (URL handling)
#pip install bs4 (in case Selenium couldn’t handle everything)
#pip install selenium (for browser-based navigation)

## Loading libraries and preparing dataframe

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
import math

In [None]:
# Column labels for the rows that the scraping cells collect later on.
_COLUMNS = [
    "Referencia",
    "Titular",
    "NIF/CIF",
    "Domicilio social",
    "Localidad",
    "Provincia",
    "C. Postal",
    "F. Concesion",
    "F. Caducidad",
    "Susceptible cesion",
    "Susceptible mutualizacion",
    "Obtenido por transferencia",
]

# Start with an empty table; scraped rows are added to it afterwards.
df = pd.DataFrame(columns=_COLUMNS)

## Driving the crawler through the website

In [None]:
# Start Chrome through a local chromedriver binary and open the search form.
# Imported here so that this cell runs on its own.
from selenium.webdriver.chrome.service import Service

url = "https://sedeaplicaciones.minetur.gob.es/RPC_Consulta/FrmConsulta.aspx"

# Location of the downloaded chromedriver executable; change it to wherever
# chromedriver is saved on this machine.
chromedriver_path = 'C:/Users/jthoma/Downloads/chromedriver.exe'

# Selenium 4 expects the driver location wrapped in a Service object. Passing
# the path as the first positional argument is no longer accepted by current
# Selenium 4 releases.
driver = webdriver.Chrome(service=Service(chromedriver_path))
driver.get(url)

# Select "Servicio Fijo" on the Registro Público search form.
serviciofijo_select = driver.find_element(By.XPATH, '//*[@id="MainContent_rblTipoServicio_1"]')
serviciofijo_select.click()

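## Scraping the tables

The loop below runs one search per comunidad autónoma, selecting each one by its position (2 to 20) in the form's *Comunidad* drop-down: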

- 02 ANDALUCÍA
- 03 ARAGÓN
- 04 CANARIAS
- 05 CANTABRIA
- 06 CASTILLA Y LEÓN
- 07 CASTILLA-LA MANCHA
- 08 CATALUÑA
- 09 CIUDAD AUTÓNOMA DE CEUTA
- 10 CIUDAD AUTÓNOMA DE MELILLA
- 11 COMUNIDAD DE MADRID
- 12 COMUNIDAD FORAL DE NAVARRA
- 13 COMUNITAT VALENCIANA
- 14 EXTREMADURA
- 15 GALICIA
- 16 ILLES BALEARS
- 17 LA RIOJA
- 18 PAÍS VASCO
- 19 PRINCIPADO DE ASTURIAS
- 20 REGIÓN DE MURCIA

In [None]:
# Scrape the concesiones grid, first once per comunidad and then once more for
# a search submitted without choosing a comunidad.
#
# Differences from the earlier version of this cell:
#   * every results page is read, including the last one (the old loop read
#     the page on display and then clicked forward, so it stopped right after
#     clicking onto the final page and never read a one-page result at all);
#   * pauses really pause: driver.implicitly_wait() only configures the
#     element-lookup timeout, it does not sleep;
#   * rows are collected in plain lists and turned into a DataFrame in one go,
#     because DataFrame.append no longer exists in pandas 2;
#   * a page is committed only after it has been read completely, so a retry
#     can no longer duplicate rows.
import math
import time

import pandas as pd
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)

ROWS_PER_PAGE = 10        # the page count is derived as ceil(total / 10)
LOAD_TIMEOUT = 10         # seconds an explicit wait may block for an element
LOAD_PAUSE = 5            # seconds to let a new search / form page load
GENERAL_LOAD_PAUSE = 10   # seconds to let the final, comunidad-less search load
PAGE_PAUSE = 3            # seconds between page clicks, to go easy on the site
RETRY_PAUSE = 5           # seconds before re-reading a page that failed
READ_ATTEMPTS = 3         # how often one page is read before giving up

GRID_XPATH = '//*[@id="MainContent_gridConcesiones"]'
TOTAL_XPATH = '//*[@id="MainContent_lblTotal"]'
BUSCAR_XPATH = '//*[@id="MainContent_btnBuscar"]'


def _wait_for(driver, xpath, timeout=LOAD_TIMEOUT):
    """Return the first element matching *xpath* once it is present.

    Raises TimeoutException when nothing matches within *timeout* seconds.
    """
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, xpath))
    )


def _cell_value(cell):
    """Return a cell's text, or its checkbox state when the cell has no text.

    The checkbox state is whatever get_attribute('checked') reports. A cell
    with neither text nor a checkbox yields the empty string.
    """
    text = cell.text
    if text != "":
        return text
    try:
        checkbox = cell.find_element(By.CLASS_NAME, 'aspNetDisabled').find_element(By.CSS_SELECTOR, 'input')
    except NoSuchElementException:
        # Genuinely empty cell: keep it empty instead of failing the page.
        return text
    return checkbox.get_attribute('checked')


def _read_grid_rows(driver, width, attempts=READ_ATTEMPTS):
    """Return the rows of the results page on display as lists of cell values.

    Only rows with exactly *width* cells are returned; the others are
    reported with "Could not append". The page is re-read up to *attempts*
    times if the grid is missing or goes stale while it is being read.
    """
    for _ in range(attempts):
        accepted = []
        rejected = []
        try:
            grid = _wait_for(driver, GRID_XPATH)
            for row in grid.find_elements(By.CSS_SELECTOR, 'tr'):
                values = [_cell_value(cell) for cell in row.find_elements(By.CSS_SELECTOR, 'td')]
                # NOTE(review): this selects every <tr> under the grid, so a
                # non-data row that happens to have `width` cells would be
                # accepted too - confirm against the live page.
                if len(values) == width:
                    accepted.append(values)
                else:
                    rejected.append(values)
        except (StaleElementReferenceException, NoSuchElementException, TimeoutException):
            # Discard the partial read and try again after a pause.
            time.sleep(RETRY_PAUSE)
            continue
        for values in rejected:
            print("Could not append: " + str(values))
        return accepted
    print("Could not read results page after " + str(attempts) + " attempts")
    return []


def _open_page(driver, n):
    """Click the pager control that leads to results page *n* (n >= 2)."""
    if n % 10 == 1:
        # Pages 11, 21, ... are opened through a "..." link in the pager row.
        # Per the original note, beyond page 10 it is the second "..." link,
        # hence cell 12 instead of cell 11.
        pager_cell = 11 if n < 20 else 12
        xpath = ('//*[@id="MainContent_gridConcesiones"]/tbody/tr[12]/td/table/tbody/tr/td['
                 + str(pager_cell) + ']/a')
    else:
        xpath = "//td/a[text()='" + str(n) + "']"
    _wait_for(driver, xpath).click()


def _scrape_current_search(driver, width):
    """Return the rows of every results page of the search on display."""
    total_label = _wait_for(driver, TOTAL_XPATH)
    # The first whitespace-separated token of the label is the result count.
    pages = math.ceil(int(total_label.text.split()[0]) / ROWS_PER_PAGE)

    rows = []
    for page in range(1, pages + 1):
        rows.extend(_read_grid_rows(driver, width))
        if page < pages:
            _open_page(driver, page + 1)
            # Pause so the site is not hit with back-to-back requests.
            time.sleep(PAGE_PAUSE)
    return rows


def _append_rows(frame, rows):
    """Return *frame* with *rows* added at the end, keeping its columns."""
    if not rows:
        return frame
    fresh = pd.DataFrame(rows, columns=frame.columns)
    if frame.empty:
        return fresh
    return pd.concat([frame, fresh], ignore_index=True)


# Explicit waits are used throughout, so switch the implicit wait off; the
# Selenium documentation advises against mixing the two.
driver.implicitly_wait(0)

for comunidad in range(2, 21):

    # Select Servicio Fijo on Registro Público
    _wait_for(driver, '//*[@id="MainContent_rblTipoServicio_1"]').click()

    # Select the comunidad by its position in the drop-down
    _wait_for(driver, '//*[@id="MainContent_cmbComunidad"]/option[' + str(comunidad) + ']').click()

    # Click on Buscar and give the results time to load
    _wait_for(driver, BUSCAR_XPATH).click()
    time.sleep(LOAD_PAUSE)

    # Commit after every comunidad so an interruption keeps finished ones
    df = _append_rows(df, _scrape_current_search(driver, len(df.columns)))

    # Go back to the search form for the next comunidad
    _wait_for(driver, '//*[@id="MainContent_btnNuevaBusqueda"]').click()
    time.sleep(LOAD_PAUSE)

# The table is re-created below, which would throw the per-comunidad rows
# away; keep them reachable under a separate name.
# NOTE(review): confirm whether the exported CSV should hold these rows rather
# than the rows of the final search.
df_comunidades = df

# Click on Buscar once more, this time without choosing a comunidad
_wait_for(driver, BUSCAR_XPATH).click()
time.sleep(GENERAL_LOAD_PAUSE)

## Scraping the tables

# Create a dataframe to store all of the scraped table data
df = pd.DataFrame(columns = ["Referencia","Titular","NIF/CIF","Domicilio social","Localidad","Provincia","C. Postal",
                             "F. Concesion","F. Caducidad","Susceptible cesion","Susceptible mutualizacion",
                             "Obtenido por transferencia"])

# Flip through all of the records and save them
df = _append_rows(df, _scrape_current_search(driver, len(df.columns)))

In [None]:
# Notebook cell output: display the scraped DataFrame.
df

In [None]:
# Save the scraped table as a CSV file, leaving out the DataFrame's row index.
output_csv = "RegistroPublicoConcesiones_General.csv"
df.to_csv(output_csv, index=False)