In [8]:
!pip install pandas pyarrow selenium

Defaulting to user installation because normal site-packages is not writeable


In [9]:
import json
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [10]:
# CSV of surnames to search for; read with pandas and expected to have a "surname" column.
file_path = './final_data/surnames/surname.csv'

# Search form of the register that is scraped.
HOMEPAGE = "https://portale.fnomceo.it/cerca-prof/index.php"

# Accumulates one dict per scraped person across the whole session.
data = []

# Position (row offset into the surname CSV) at which scraping resumes.
index_pointer = 0

try:
    with open('last_index.txt', 'r') as file:
        index_pointer = int(file.read())
except FileNotFoundError:
    print("File 'last_index.txt' not found. Using default index_pointer value.")
except ValueError:
    # An empty or corrupt checkpoint file must not abort the cell; start from 0 instead.
    print("File 'last_index.txt' does not contain a valid integer. Using default index_pointer value.")

# Base name (without extension) of the CSV/JSON export for this session.
name_of_file = f"fnomceo_data_{index_pointer}"
    



In [11]:
def _build_driver():
    """Create a Chrome driver with the "enable-automation" switch excluded,
    the automation extension turned off and the AutomationControlled blink
    feature disabled."""
    browser_options = ChromeOptions()
    browser_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    browser_options.add_experimental_option('useAutomationExtension', False)
    browser_options.add_argument('--disable-blink-features=AutomationControlled')
    return Chrome(options=browser_options)


def _split_entry(value):
    """Remove single and double quotes from ``value`` and split it on " - "."""
    return value.replace("\"", "").replace("'", "").split(" - ")


def _name_and_university(text):
    """Return ``(text before the first "(", content of the first "(...)")``.

    The second item is ``None`` when ``text`` has no parenthesised group.
    """
    match = re.search(r'\((.*?)\)', text)
    return text.split("(")[0], (match.group(1) if match else None)


def _parse_titled_entry(value, name_key):
    """Parse a "<...> <year> - <name> (<university>)" entry into a dict.

    ``name_key`` is the key under which the name is stored. The
    "university_name" key is only present when a "(...)" group exists.
    """
    parts = _split_entry(value)
    year = parts[0].split(" ")[-1]
    name, university = _name_and_university(parts[1])
    record = {}
    if university is not None:
        record["university_name"] = university
    record[name_key] = name
    record["year"] = year
    return record


def _parse_iscrizione(value):
    """Parse an "iscrizioni" entry into province / year / number."""
    year = _split_entry(value)[0].split(" ")[-1]
    match = re.search(r'Ordine della Provincia di (\S+)', value)
    # Default to None so a non-matching entry never reuses a previous
    # entry's province (and never hits an unbound local).
    province = match.group(1).replace(")", "") if match else None
    number = ''.join(re.findall(r'\d', value.split("(")[1]))
    return {"province": province, "year": year, "number": number}


def _parse_laurea(value):
    """Parse a "lauree" entry (name stored under "name")."""
    return _parse_titled_entry(value, "name")


def _parse_abilitazione(value):
    """Parse an "abilitazioni" entry whose first part is "<year>/<round>"."""
    parts = _split_entry(value)
    year_and_round = parts[0].split("/")
    qualification_year = year_and_round[0]
    qualification_round = year_and_round[1]
    name, university = _name_and_university(parts[1])
    record = {}
    if university is not None:
        record["university_name"] = university
    record["name"] = name
    record["year"] = qualification_year
    record["round"] = qualification_round
    return record


def _parse_specializzazione(value):
    """Parse a "specializzazioni" entry (name stored under "specializzazione_name")."""
    return _parse_titled_entry(value, "specializzazione_name")


def _parse_elenco_speciale(value):
    """Parse an "elenchi speciali" entry: the text after "TITOLO FORMAZIONE "."""
    name = value.split("TITOLO FORMAZIONE ")[-1]
    # NOTE(review): the title is used as both key and value, as in the
    # original code; a fixed key such as "name" may have been intended.
    return {name: name}


# Badge label shown on the detail page -> (key in the user dict, parser).
_DETAIL_PARSERS = {
    "iscrizioni": ("iscrizioni", _parse_iscrizione),
    "lauree": ("lauree", _parse_laurea),
    "abilitazioni": ("abilitazioni", _parse_abilitazione),
    "specializzazioni": ("specializzazioni", _parse_specializzazione),
    "elenchi speciali": ("elenchi_speciali", _parse_elenco_speciale),
}


def _parse_detail_sections(soup):
    """Collect the list-group entries of a detail page, grouped by section.

    Returns a dict mapping the user-dict key to the list of parsed records.
    Keys are inserted in order of first appearance on the page and only
    sections with at least one entry are present.
    """
    sections = {}
    ul_element = soup.find('ul', class_="list-group")
    if not ul_element:
        return sections
    for li_element in ul_element.find_all('li'):
        badge = li_element.find('span', class_='badge')
        if badge is None:
            # An item without a badge cannot be classified; skip it
            # instead of failing on ``None.get_text``.
            continue
        li_name = badge.get_text(strip=True)
        if li_name not in _DETAIL_PARSERS:
            continue
        li_text_value = li_element.get_text(strip=True).replace(li_name, "")
        formatted_value = re.sub(' +', ' ', li_text_value)
        user_key, parser = _DETAIL_PARSERS[li_name]
        sections.setdefault(user_key, []).append(parser(formatted_value))
    return sections


def _base_user(selected_row):
    """Build the user dict from one row of the results table."""
    user = {}
    user["person_id"] = selected_row[0]
    user["surname"] = selected_row[1]
    user["first_name"] = selected_row[2]

    # Fourth column: first whitespace-separated token is stored as the
    # date of birth, the remainder as the birth place.
    parts = selected_row[3].split()
    user["date_of_birth"] = parts[0]
    user['birth_place'] = ' '.join(parts[1:])

    user["province"] = selected_row[4].split("Ordine della Provincia di")[1]
    return user


def _add_detail_fields(driver, user):
    """Fetch the detail page for ``user["person_id"]`` and merge its fields into ``user``."""
    script = "return await $.post('https://portale.fnomceo.it/cerca-prof/dettaglio.php', {{id: {}}})".format(
        user["person_id"])
    data_internal_html = driver.execute_script(script)
    soup = BeautifulSoup(data_internal_html, 'html.parser')

    soup_full_name = soup.find('h4', class_='modal-title text-uppercase d-print-block')
    if soup_full_name:
        extracted_full_name = soup_full_name.getText(strip=True)
        user["full_name"] = extracted_full_name
        user["prefix"] = extracted_full_name.split(" ")[0]

    user.update(_parse_detail_sections(soup))

    last_updated = soup.find('p', class_="small text-muted")
    if last_updated:
        user['last_update_date'] = last_updated.text.split("Data aggiornamento: ")[1]


def get_data(url, df):
    """Search the register for every surname in ``df`` and collect the results.

    Args:
        url: Address of the search form to open.
        df: DataFrame with a "surname" column. Rows are processed starting
            at position ``index_pointer``.

    Side effects:
        Appends one dict per person found to the module-level ``data`` list
        and increments the module-level ``index_pointer`` once per surname.
        The increment happens before the surname is scraped, so a surname
        that raises is not retried on the next call.

    The browser is always closed on exit, including when an exception
    (or KeyboardInterrupt) propagates to the caller.
    """
    global index_pointer

    driver = _build_driver()
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        for outer_index, row in df.iloc[index_pointer:].iterrows():
            sure_name_input = row["surname"]

            # Throttle: 1 s before every search, 3 s more on every 5th position.
            time.sleep(1)
            if index_pointer % 5 == 0:
                time.sleep(3)
            index_pointer += 1
            print({"name":sure_name_input, "pointer":index_pointer, "index":outer_index})

            sure_name = wait.until(EC.presence_of_element_located((By.ID, "cognomeID")))
            search = driver.find_element(By.ID, "submitButtonID")
            sure_name.send_keys(sure_name_input)
            search.click()

            wait.until(
                lambda driver: driver.execute_script('return document.readyState') == 'complete')

            # Read the rows straight from the page's DataTable instance.
            table_data = driver.execute_script("""
            var table = $('#dataTableID').DataTable();
            var data = table.rows().data();
            return  JSON.stringify(data);
            """)
            parsed_data = json.loads(table_data)

            # Drop the non-row entries of the serialized object; ``pop`` with
            # a default tolerates any of them being absent.
            for meta_key in ("context", "selector", "length", "ajax"):
                parsed_data.pop(meta_key, None)

            for internal_data_index, row_key in enumerate(list(parsed_data.keys())):
                user = _base_user(parsed_data[row_key])

                # Extra 1 s pause before every 5th detail request.
                if internal_data_index % 5 == 0:
                    time.sleep(1)

                _add_detail_fields(driver, user)
                data.append(user)

            print(f"Index: {outer_index} Name:{sure_name_input} Total_Data: {len(data)}")

            # Best effort: go back to the search form for the next surname.
            try:
                back_link = wait.until(
                    EC.presence_of_element_located((By.XPATH, "//a[@class='nav-link' and text()='Nuova ricerca']")))
                driver.execute_script("arguments[0].click();", back_link)
            except Exception as e:
                print(f"Error: {e}")
    finally:
        # Without this every call leaves a Chrome instance running.
        driver.quit()


In [12]:
def export_data(data, name):
    """Write ``data`` to ``<name>.csv`` and ``<name>.json`` and print the table.

    Args:
        data: Records to export; passed to ``pd.DataFrame`` for the CSV and
            serialized as-is for the JSON file.
        name: Output path without extension.
    """
    frame = pd.DataFrame(data)
    frame.to_csv(f"{name}.csv", index=False)

    # Non-ASCII characters are kept verbatim; the file is UTF-8 encoded.
    with open(f"{name}.json", 'w', encoding='utf-8') as handle:
        handle.write(json.dumps(data, ensure_ascii=False, indent=4))

    print(frame)  # DEBUG

In [13]:
def main():
    """Run one scraping pass and always persist progress.

    Reads the surname CSV at ``file_path`` and passes it to ``get_data``.
    Any ``Exception`` raised is printed rather than propagated. Whatever
    happens, the collected ``data`` is exported under ``name_of_file``,
    the current ``index_pointer`` is written to ``last_index.txt``, and
    the elapsed time and last index are printed.
    """
    start_time = time.time()
    try:
        df = pd.read_csv(file_path)
        get_data(url=HOMEPAGE, df=df)
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Export first: the checkpoint only advances on disk once the
        # data gathered so far has been written.
        export_data(data,name_of_file)
        with open('last_index.txt', 'w') as file:
            file.write(str(index_pointer))
        # Measured here so the value exists on both success and failure.
        total_time = time.time() - start_time
        print(f"DONE Total time taken: {total_time} seconds")
        print("The last index was",index_pointer)


In [14]:
# Re-run main() after every failure, but stop once every surname has been
# processed; an unconditional loop would keep launching a browser and
# re-exporting forever after the last row.
total_surnames = len(pd.read_csv(file_path))
while index_pointer < total_surnames:
    main()

{'name': 'Sermarini ', 'pointer': 36281, 'index': 36280}
Index: 36280 Name:Sermarini  Total_Data: 3
{'name': 'Sermattei ', 'pointer': 36282, 'index': 36281}
Index: 36281 Name:Sermattei  Total_Data: 3
{'name': 'Sermenghi ', 'pointer': 36283, 'index': 36282}
Index: 36282 Name:Sermenghi  Total_Data: 3
{'name': 'Sermi ', 'pointer': 36284, 'index': 36283}
Index: 36283 Name:Sermi  Total_Data: 4
{'name': 'Sermini ', 'pointer': 36285, 'index': 36284}
Index: 36284 Name:Sermini  Total_Data: 4
{'name': 'Sermon ', 'pointer': 36286, 'index': 36285}
Index: 36285 Name:Sermon  Total_Data: 4
{'name': 'Sernese ', 'pointer': 36287, 'index': 36286}
Index: 36286 Name:Sernese  Total_Data: 4
{'name': 'Sernesi ', 'pointer': 36288, 'index': 36287}
Index: 36287 Name:Sernesi  Total_Data: 6
{'name': 'Sernicola ', 'pointer': 36289, 'index': 36288}
Index: 36288 Name:Sernicola  Total_Data: 11
{'name': 'Seroni ', 'pointer': 36290, 'index': 36289}
Index: 36289 Name:Seroni  Total_Data: 12
{'name': 'Serotti ', 'pointer'

KeyboardInterrupt: 