In [1]:
import re
import os
import time
import json
import PyPDF2
import pandas as pd
from tqdm import tqdm
import lxml.etree as ET

from langchain.text_splitter import RecursiveCharacterTextSplitter

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options



## Utility functions

In [3]:
# --- Platform and path configuration ----------------------------------------
# Set to False when running on Windows: it selects the path separator style
# and the chromedriver location used below.
isLinux = True

# Folder holding the downloaded source documents. When the working directory
# contains a "Data" component it is swapped for "Documents/Downloaded";
# otherwise "Documents/Downloaded" is appended to the working directory.
default_linux_path = os.getcwd().replace("/Data", "/Documents/Downloaded") if "/Data" in os.getcwd() else os.getcwd() + "/Documents/Downloaded"
default_windows_path = os.getcwd().replace("\\Data", "\\Documents\\Downloaded") if "\\Data" in os.getcwd() else os.getcwd() + "\\Documents\\Downloaded"
default_path = default_linux_path if isLinux else default_windows_path

# Folder where every generated CSV/JSON file is written.
DEFAULT_SAVE_DIR = default_path.replace("/Downloaded", "/Generated") if isLinux else default_path.replace("\\Downloaded", "\\Generated")
CHROME_DRIVERS_PATH = "/home/antonelli2/chromedriver-linux64/chromedriver" if isLinux else "C:\\Users\\giaco\\Downloads\\chromedriver-win64\\chromedriver.exe"
# Unibo: /chromedriver-linux64/chromedriver or /home/antonelli2/chromedriver-linux64/chromedriver
# WSL: /home/giacomo/chromedriver-linux64/chromedriver

# Single separator for the selected platform, so each file name is written
# once and the Linux and Windows variants cannot drift apart (the Windows
# CPA_CSV name previously differed from the Linux one).
PATH_SEP = "/" if isLinux else "\\"

COSTITUZIONE_CSV = DEFAULT_SAVE_DIR + PATH_SEP + "Costituzione.csv"

CODICE_PENALE_PDF = default_path + PATH_SEP + "Codice penale well formatted edited.pdf"
CODICE_PENALE_CSV = DEFAULT_SAVE_DIR + PATH_SEP + "Codice penale.csv"

CPP_CSV = DEFAULT_SAVE_DIR + PATH_SEP + "Codice procedura penale.csv"

CPA_CSV = DEFAULT_SAVE_DIR + PATH_SEP + "Codice processo amministrativo.csv"

REF_MERG = DEFAULT_SAVE_DIR + PATH_SEP + "references_merged.csv"
ALREADY_SCRAPED_DLGS_JSON = DEFAULT_SAVE_DIR + PATH_SEP + "scraped_dlgs.json"
DLGS_CSV = DEFAULT_SAVE_DIR + PATH_SEP + "dlgs.csv"

ALL_ITALIAN_LAWS_CSV = DEFAULT_SAVE_DIR + PATH_SEP + "All Italian laws.csv"
ALL_ITALIAN_LAWS_SCRAPED_JSON = DEFAULT_SAVE_DIR + PATH_SEP + "All laws.json"

LAWS_CSV = DEFAULT_SAVE_DIR + PATH_SEP + "laws.csv"


# Utility functions and constants
def write_to_file(filename, content):
    """Write the string `content` to `filename`, replacing any existing contents."""
    with open(filename, 'w+') as out_file:
        out_file.write(content)

def read_from_file(filename):
    """Return the whole text content of `filename` as a single string."""
    with open(filename, 'r') as in_file:
        contents = in_file.read()
    return contents

# JSON stuff
def write_list_to_json(lst, filename):
    """Serialize `lst` as JSON into `filename`, overwriting the file."""
    serialized = json.dumps(lst)
    with open(filename, 'w') as json_file:
        json_file.write(serialized)

def read_list_from_json(filename):
    """Parse the JSON document stored in `filename` and return the resulting object."""
    with open(filename, 'r') as json_file:
        return json.load(json_file)

# CSV stuff
def save_df_to_csv(df, filename):
    """Write the DataFrame `df` to `filename` as CSV, without the index column."""
    df.to_csv(path_or_buf=filename, index=False)

def read_df_from_csv(filename):
    """Load the CSV file `filename` into a pandas DataFrame and return it."""
    frame = pd.read_csv(filename)
    return frame

# Text-extraction helpers, one for each kind of input
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at `pdf_path`, concatenated in page order."""
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Pages are joined with no separator between them.
        page_texts = [pdf_reader.pages[idx].extract_text() for idx in range(len(pdf_reader.pages))]
    return "".join(page_texts)

def clearLawContent(content):
    """Strip the HTML markup from an article fragment and return its plain text.

    Processing steps, in order:
      1. If the fragment contains `<span class="art_text_in_comma">` elements,
         keep only their inner texts, joined by a single space.
      2. Drop the opening `<div class="ins-akn" eid="ins_N">((` markers,
         newlines and `<br>` tags.
      3. Replace `<a>`, `<span>` and `<div class="pointedList-rest-akn">`
         elements with their inner text.
      4. Remove every remaining tag and the `((` / `))` markers.

    Args:
        content: HTML (or already plain) text of an article.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    def unwrap(pattern, text):
        # A callable replacement inserts the captured text verbatim. Passing
        # the captured text (or re.escape() of it) as a replacement *template*
        # makes `re.sub` interpret backslashes in the law text: it either
        # raises ("bad escape") or leaves stray backslashes in the output.
        # Doing all substitutions in one pass also keeps every match paired
        # with its own inner text, which a findall + count=1 loop does not
        # guarantee once a replacement itself contains a tag.
        return re.sub(pattern, lambda found: found.group(1), text, flags=re.DOTALL)

    # Prefer the per-comma spans when present
    comma_texts = re.findall(r'<span class="art_text_in_comma">(.*?)</span>', content, re.DOTALL)
    if comma_texts:
        content = ' '.join(comma_texts)
    content = re.sub(r"<div class=\"ins-akn\" eid=\"ins_\d+\">\(\(", "", content)
    content = content.replace("\n", "").replace("<br>", "")

    # Unwrap link, span and pointed-list elements
    content = unwrap(r'<a.*?>(.*?)</a>', content)
    content = unwrap(r'<span.*?>(.*?)</span>', content)
    content = unwrap(r'<div class="pointedList-rest-akn">(.*?)</div>', content)

    # Delete remaining tags and the (( )) markers
    content = re.sub(r'<.+?>', "", content)
    content = content.replace("((", "").replace("))", "")

    return content.strip()

def extractCommaNumber(articleElement):
    """Return the second space-separated token of a label containing "art.".

    Labels that do not contain "art." are returned unchanged apart from the
    removal of surrounding whitespace.
    """
    label = articleElement.strip()
    if "art." not in label:
        return label
    tokens = label.split(" ")
    return tokens[1]

def extractArticleNumber(articleElement):
    """Return, as an int, the first run of digits that follows "n." in the text.

    Raises:
        Exception: if the text contains no "n." followed by digits.
    """
    found = re.search(r'n\..*?(\d+)', articleElement)
    if found is None:
        raise Exception("Article number not found")
    return int(found.group(1))

## Extraction of Constitution

In [None]:
class NormattivaCosScraper:
    """Selenium-based scraper that collects article texts from the page tree
    (`albero`) of the currently loaded Normattiva page; used here for the
    Constitution ("cos")."""

    def __init__(self, driver_path, headless=True):
        """Start a Chrome session.

        Args:
            driver_path: Path of the chromedriver executable.
            headless: When True, Chrome is started with `--headless`,
                `--disable-dev-shm-usage` and `--no-sandbox`.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--no-sandbox")
        self.service = Service(driver_path)
        self.driver = webdriver.Chrome(service=self.service, options=chrome_options)

    def fill_field(self, field_id, value):
        """Clear the input element whose DOM id is `field_id` and type `value` into it."""
        input_field = self.driver.find_element(By.ID, field_id)
        input_field.clear()
        input_field.send_keys(value)

    def get_cos_articles(self):
        """Click through the article entries of the loaded page and collect their text.

        Entries whose label contains any alphabetic character are skipped.

        Returns:
            A list of dicts with keys `law_source` ("cos"), `year` (None),
            `law_number` (the entry label) and `law_text` (cleaned text).
            If the text element of an article cannot be read, an error is
            printed and the articles collected so far are returned.
        """
        articles_list = []

        # Fixed pause before querying the page (presumably to let it finish loading)
        time.sleep(1)

        # Article entries listed in the navigation tree
        albero = self.driver.find_element(By.ID, "albero")
        articles = albero.find_elements(By.CLASS_NAME, "numero_articolo")

        for article in articles:
            # Skip any entry whose label contains a letter
            if any(char.isalpha() for char in article.text):
                continue

            time.sleep(1)

            # Somehow some crash, just add them by hand
            article.click()
            time.sleep(1)
            try:
                text = self.driver.find_element(By.CLASS_NAME, "art-just-text-akn")
                law_content = text.get_attribute('outerHTML')
                law_content = law_content.replace("\n", "")
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C still stops the run
                print("Error in article: ", article.text)
                return articles_list

            law_number = article.text

            # Clear the output (once is enough: the result is already plain text)
            law_content = clearLawContent(law_content)
            articles_list.append({ "law_source": "cos",
                                    "year": None,
                                    "law_number": law_number,
                                    "law_text": law_content })
            print(articles_list[-1])
        return articles_list

    def navigate_to_page(self, url):
        """Load `url` in the browser."""
        self.driver.get(url)

    def close(self):
        """Quit the browser session."""
        self.driver.quit()

scraper = NormattivaCosScraper(CHROME_DRIVERS_PATH, headless=False)
try:
    scraper.navigate_to_page("https://www.normattiva.it/uri-res/N2Ls?urn:nir:stato:costituzione")
    articles = scraper.get_cos_articles()
finally:
    # Always quit the browser so no Chrome/chromedriver process is left behind,
    # even when scraping raises.
    scraper.close()

# Persist the scraped articles
df_cos = pd.DataFrame(articles)
df_cos.to_csv(COSTITUZIONE_CSV, index=False)

## Extraction of Codice Penale from Normattiva

In [13]:
class NormattivaCpScraper:
    """Selenium-based scraper that collects article texts from the page tree
    (`albero`) of the currently loaded Normattiva page; used here for the
    Codice Penale ("c.p.")."""

    def __init__(self, driver_path, headless=True):
        """Start a Chrome session.

        Args:
            driver_path: Path of the chromedriver executable.
            headless: When True, Chrome is started with `--headless`,
                `--disable-dev-shm-usage` and `--no-sandbox`.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--no-sandbox")
        self.service = Service(driver_path)
        self.driver = webdriver.Chrome(service=self.service, options=chrome_options)

    def fill_field(self, field_id, value):
        """Clear the input element whose DOM id is `field_id` and type `value` into it."""
        input_field = self.driver.find_element(By.ID, field_id)
        input_field.clear()
        input_field.send_keys(value)

    def get_cp_articles(self):
        """Click through the article entries of the loaded page and collect their text.

        The first tree entry is skipped, as is every entry whose label
        contains neither "art" nor "Art". The article number is taken from the
        "Art. ..." title found inside the article HTML; articles with no such
        title are skipped.

        Returns:
            A list of dicts with keys `law_source` ("c.p."), `law_text`,
            `law_number` and `year` (None). If the text element of an article
            cannot be read, an error is printed and the articles collected so
            far are returned.
        """
        articles_list = []

        # Fixed pause before querying the page (presumably to let it finish loading)
        time.sleep(1)

        # Article entries listed in the navigation tree
        albero = self.driver.find_element(By.ID, "albero")
        articles = albero.find_elements(By.CLASS_NAME, "numero_articolo")

        for article in articles[1:]:
            time.sleep(1)
            if "art" not in article.text and "Art" not in article.text:
                continue

            article.click()
            time.sleep(1)
            try:
                text = self.driver.find_elements(By.CLASS_NAME, "attachment-just-text")[0]
                law_content = text.get_attribute('outerHTML')
                law_content = law_content.replace("\n", "")
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C still stops the run
                print("Error in article: ", article.text)
                return articles_list

            # Extract the article number from the title inside the HTML
            title_pattern = r'>.+(Art\..+)<'
            match = re.search(title_pattern, law_content, re.DOTALL)

            if match:
                law_number = match.group(1).split("<")[0]
                # Keep only what follows the title
                law_content = law_content.split(law_number)[1]
            else:
                print(text)
                continue

            # Clear the output
            law_content = clearLawContent(law_content)
            articles_list.append({ "law_source": "c.p.",
                                   "law_text": law_content,
                                   "law_number": law_number,
                                   "year": None })
            print(articles_list[-1])

        return articles_list

    def navigate_to_page(self, url):
        """Load `url` in the browser."""
        self.driver.get(url)

    def close(self):
        """Quit the browser session."""
        self.driver.quit()

scraper = NormattivaCpScraper(CHROME_DRIVERS_PATH, headless=False)
try:
    scraper.navigate_to_page("https://www.normattiva.it/uri-res/N2Ls?urn:nir:stato:regio.decreto:1930-10-19;1398")
    articles = scraper.get_cp_articles()
finally:
    # Always quit the browser so no Chrome/chromedriver process is left behind,
    # even when scraping raises.
    scraper.close()

# Persist the scraped articles
df_cp = pd.DataFrame(articles)
df_cp.to_csv(CODICE_PENALE_CSV, index=False)

NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


## Extraction of C.P.P. from Normattiva

In [None]:
class NormattivaCppScraper:
    """Selenium-based scraper that collects article texts from the page tree
    (`albero`) of the currently loaded Normattiva page; used here for the
    Codice di Procedura Penale ("c.p.p.")."""

    def __init__(self, driver_path, headless=True):
        """Start a Chrome session.

        Args:
            driver_path: Path of the chromedriver executable.
            headless: When True, Chrome is started with `--headless`.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
        self.service = Service(driver_path)
        self.driver = webdriver.Chrome(service=self.service, options=chrome_options)

    def fill_field(self, field_id, value):
        """Clear the input element whose DOM id is `field_id` and type `value` into it."""
        input_field = self.driver.find_element(By.ID, field_id)
        input_field.clear()
        input_field.send_keys(value)

    def get_cpp_articles(self):
        """Click through the article entries of the loaded page and collect their text.

        The first tree entry is skipped; entries that cannot be clicked are
        skipped too. The article number is read from the `article-num-akn`
        element and the text from `art-commi-div-akn`, falling back to
        `art-just-text-akn`.

        Returns:
            A list of dicts with keys `law_source` ("c.p.p."), `law_text`,
            `law_number` and `year` (None). If number or text cannot be read,
            an error is printed and the articles collected so far are returned.
        """
        articles_list = []

        # Fixed pause before querying the page (presumably to let it finish loading)
        time.sleep(1)

        # Article entries listed in the navigation tree
        albero = self.driver.find_element(By.ID, "albero")
        articles = albero.find_elements(By.CLASS_NAME, "numero_articolo")

        for article in articles[1:]:
            time.sleep(1)
            try:
                article.click()
            except Exception:
                # Entry not clickable: skip it (Ctrl-C is no longer swallowed)
                continue
            time.sleep(1)
            try:
                law_number = self.driver.find_elements(By.CLASS_NAME, "article-num-akn")[0]
                law_number = law_number.get_attribute('outerHTML')
                law_number = law_number.replace("\n", "")
                law_number = clearLawContent(law_number)
                try:
                    law_content = self.driver.find_elements(By.CLASS_NAME, "art-commi-div-akn")[0]
                except Exception:
                    # No comma container on this page: use the plain-text container
                    law_content = self.driver.find_elements(By.CLASS_NAME, "art-just-text-akn")[0]
                law_content = law_content.get_attribute('outerHTML')
                law_content = law_content.replace("\n", "")
                law_content = clearLawContent(law_content)
            except Exception:
                print("Error in article: ", article.text)
                return articles_list

            articles_list.append({ "law_source": "c.p.p.",
                                    "law_text": law_content,
                                    "law_number": law_number,
                                    "year": None })
            print(articles_list[-1])

        return articles_list

    def navigate_to_page(self, url):
        """Load `url` in the browser."""
        self.driver.get(url)

    def close(self):
        """Quit the browser session."""
        self.driver.quit()

scraper = NormattivaCppScraper(CHROME_DRIVERS_PATH, headless=False)
try:
    scraper.navigate_to_page("https://www.normattiva.it/uri-res/N2Ls?urn:nir:stato:decreto.del.presidente.della.repubblica:1988-09-22;447")
    articles = scraper.get_cpp_articles()
finally:
    # Always quit the browser so no Chrome/chromedriver process is left behind,
    # even when scraping raises.
    scraper.close()

# Persist the scraped articles
df_cpp = pd.DataFrame(articles)
df_cpp.to_csv(CPP_CSV, index=False)

## Extraction of C.P.A. from Normattiva

In [None]:
class NormattivaCpaScraper:
    """Selenium-based scraper that collects article texts from the page tree
    (`albero`) of the currently loaded Normattiva page; used here for the
    Codice del Processo Amministrativo ("c.p.a.")."""

    def __init__(self, driver_path, headless=True):
        """Start a Chrome session.

        Args:
            driver_path: Path of the chromedriver executable.
            headless: When True, Chrome is started with `--headless`.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
        self.service = Service(driver_path)
        self.driver = webdriver.Chrome(service=self.service, options=chrome_options)

    def fill_field(self, field_id, value):
        """Clear the input element whose DOM id is `field_id` and type `value` into it."""
        input_field = self.driver.find_element(By.ID, field_id)
        input_field.clear()
        input_field.send_keys(value)

    def get_cpa_articles(self):
        """Click through the article entries of the loaded page and collect their text.

        The first tree entry is skipped, as are entries whose label does not
        contain "art" and entries that cannot be clicked. The article number
        is the entry label without its first five characters; the text comes
        from `art-commi-div-akn`, falling back to `attachment-just-text`.

        Returns:
            A list of dicts with keys `law_source` ("c.p.a."), `year` (None),
            `law_number` and `law_text`. If the text cannot be read, an error
            is printed and the articles collected so far are returned.
        """
        articles_list = []

        # Fixed pause before querying the page (presumably to let it finish loading)
        time.sleep(1)

        # Article entries listed in the navigation tree
        albero = self.driver.find_element(By.ID, "albero")
        articles = albero.find_elements(By.CLASS_NAME, "numero_articolo")

        for article in articles[1:]:
            if "art" not in article.text:
                continue

            print("Article: ", article.text)
            time.sleep(1)
            try:
                article.click()
            except Exception:
                # Entry not clickable: skip it (Ctrl-C is no longer swallowed)
                continue
            time.sleep(1)
            try:
                # Drop the first five characters of the label (presumably the "art. " prefix)
                law_number = article.text[5:]

                try:
                    law_content = self.driver.find_element(By.CLASS_NAME, "art-commi-div-akn")
                except Exception:
                    # No comma container on this page: use the attachment container
                    law_content = self.driver.find_element(By.CLASS_NAME, "attachment-just-text")
                law_content = law_content.get_attribute('outerHTML')
                law_content = law_content.replace("\n", "")
                law_content = clearLawContent(law_content)
            except Exception:
                print("Error in article: ", article.text)
                return articles_list

            articles_list.append({ "law_source": "c.p.a.",
                                    "year": None,
                                    "law_number": law_number,
                                    "law_text": law_content})
            print(articles_list[-1])

        return articles_list

    def navigate_to_page(self, url):
        """Load `url` in the browser."""
        self.driver.get(url)

    def close(self):
        """Quit the browser session."""
        self.driver.quit()

scraper = NormattivaCpaScraper(CHROME_DRIVERS_PATH, headless=False)
try:
    scraper.navigate_to_page("https://www.normattiva.it/uri-res/N2Ls?urn:nir:stato:decreto.legislativo:2010-07-02;104")
    articles = scraper.get_cpa_articles()
finally:
    # Always quit the browser so no Chrome/chromedriver process is left behind,
    # even when scraping raises.
    scraper.close()

# Persist the scraped articles
df_cpa = pd.DataFrame(articles)
df_cpa.to_csv(CPA_CSV, index=False)

## Extraction of Dlgs from Normattiva

In [None]:
class NormattivaDlgsScraper:
    """Selenium-based scraper that looks up a single act through the search
    form of the currently loaded Normattiva page and collects its articles."""

    def __init__(self, driver_path, headless=True):
        """Start a Chrome session.

        Args:
            driver_path: Path of the chromedriver executable.
            headless: When True, Chrome is started with `--headless`.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
        self.service = Service(driver_path)
        self.driver = webdriver.Chrome(service=self.service, options=chrome_options)

    def fill_field(self, field_id, value):
        """Clear the input element whose DOM id is `field_id` and type `value` into it."""
        input_field = self.driver.find_element(By.ID, field_id)
        input_field.clear()
        input_field.send_keys(value)

    def get_laws(self, numeroProvvedimento, anno):
        """Search an act by number and year and collect the text of its articles.

        The search form of the current page is filled and submitted, the first
        "Dettaglio atto" result is opened and the "multivigenza" link is
        followed. Only tree entries whose label starts with a digit and
        contains none of "orig", "Allegato", "agg" are visited.

        Args:
            numeroProvvedimento: Value typed into the `numeroProvvedimento` field.
            anno: Value typed into the `annoProvvedimento` field.

        Returns:
            A list of dicts with keys `law_source` ("Dlgs <number>/<year>"),
            `year` (None), `law_number` (the entry label) and `law_text`.
            Articles whose cleaned text is empty are omitted.

        Note:
            Lookups of the search results, of the multivigenza link and of the
            fallback text container are not guarded, so their Selenium
            exceptions propagate to the caller.
        """
        articles_list = []

        self.fill_field("numeroProvvedimento", numeroProvvedimento)
        self.fill_field("annoProvvedimento", anno)

        self.driver.find_element(By.CSS_SELECTOR, "[type*='submit']").click()
        self.driver.find_elements(By.CSS_SELECTOR, "[title*='Dettaglio atto']")[0].click()

        time.sleep(2)
        # Switch to the multivigente version
        multivigente_button = self.driver.find_element(By.XPATH, '//a[contains(@href, "multivigenza")]')
        multivigente_button.click()

        # Article entries listed in the navigation tree
        albero = self.driver.find_element(By.ID, "albero")
        articles = albero.find_elements(By.CLASS_NAME, "numero_articolo")
        print("Article len: ", len(articles))

        for article in articles:
            if article.text.strip() != "" and article.text[0].isdigit() and "orig" not in article.text and "Allegato" not in article.text and "agg" not in article.text:
                print("In ", article.text)
                try:
                    article.click()
                except Exception:
                    # Entry not clickable: skip it (Ctrl-C is no longer swallowed)
                    continue

                time.sleep(1)

                #article_title = self.driver.find_element(By.CLASS_NAME, "article-num-akn").text.strip()
                #commas = self.driver.find_elements(By.CLASS_NAME, "art-comma-div-akn")
                #page_title =  scraper.driver.title
                #law_number = extractArticleNumber(page_title)
                law_number = article.text

                try:
                    law_text = self.driver.find_element(By.CLASS_NAME, "art-commi-div-akn").get_attribute('outerHTML')
                except Exception:
                    # A repealed article is rendered with a different page layout
                    law_text = self.driver.find_element(By.CLASS_NAME, "art-just-text-akn").get_attribute('outerHTML')
                law_text = clearLawContent(law_text)

                if law_text == "":
                    continue

                articles_list.append({ "law_source": f"Dlgs {numeroProvvedimento}/{anno}".strip(),
                                       "year": None,
                                       "law_number": law_number,
                                       "law_text": law_text})

            else:
                print("Out ", article.text)

        return articles_list

    def navigate_to_page(self, url):
        """Load `url` in the browser."""
        self.driver.get(url)

    def close(self):
        """Quit the browser session."""
        self.driver.quit()

scraper = NormattivaDlgsScraper(CHROME_DRIVERS_PATH, headless=False)

# Articles scraped by previous runs (empty on the first run)
df_dlgs = pd.DataFrame()
if os.path.exists(DLGS_CSV):
    df_dlgs = pd.read_csv(DLGS_CSV)

# Sources ("number/year") already scraped by previous runs
scraped_dlgs = []
if os.path.exists(ALREADY_SCRAPED_DLGS_JSON):
    scraped_dlgs = read_list_from_json(ALREADY_SCRAPED_DLGS_JSON)

df_ref = pd.read_csv(REF_MERG)

try:
    for index, row in tqdm(df_ref.iterrows(), total=df_ref.shape[0]): # 445
        # Normalize once: the same stripped value is used for the lookup and
        # for the checkpoint list, so a source with stray whitespace is
        # recognised as already scraped on later iterations and runs.
        law_source = row['law_source'].strip()

        # Only "number/year" references are looked up
        if '/' not in law_source or law_source in scraped_dlgs:
            continue
        scraped_dlgs.append(law_source)

        print(f"Scraping |{row['law_source']}|")
        num, year = law_source.split("/")
        num = num.strip()
        # Strip before measuring, otherwise "2002 " would be expanded to "2002002"
        year = year.strip()

        # some year references count "2" or "3" like "2002" or "2003"
        year = year if len(year) == 4 else f"200{year}"

        scraper.navigate_to_page("https://www.normattiva.it/ricerca/avanzata")

        articles = scraper.get_laws(num, year)

        if not articles:
            continue

        df_dlgs = pd.concat([df_dlgs, pd.DataFrame(articles)], ignore_index=True)

        # Checkpoint after every act so an interrupted run can resume
        save_df_to_csv(df_dlgs, DLGS_CSV)
        write_list_to_json(scraped_dlgs, ALREADY_SCRAPED_DLGS_JSON)
finally:
    # Always quit the browser so no Chrome/chromedriver process is left behind,
    # even when scraping raises.
    scraper.close()


## Extraction of all Italian laws from Normattiva

In [5]:
class NormattivaAllLawsScraper:
    """Selenium-based scraper that collects the articles of whichever law
    detail page is currently loaded from Normattiva."""

    # Suffixes that mark an entry as an article even without a number or "art"
    MULTIPLICATIVES = ("bis", "ter", "quater", "quinquies", "sexies")

    def __init__(self, driver_path, headless=True):
        """Start a Chrome session.

        Args:
            driver_path: Path of the chromedriver executable.
            headless: When True, Chrome is started with `--headless`,
                `--disable-dev-shm-usage` and `--no-sandbox`.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--no-sandbox")
        self.service = Service(driver_path)
        self.driver = webdriver.Chrome(service=self.service, options=chrome_options)

    def fill_field(self, field_id, value):
        """Clear the input element whose DOM id is `field_id` and type `value` into it."""
        input_field = self.driver.find_element(By.ID, field_id)
        input_field.clear()
        input_field.send_keys(value)

    def get_years(self):
        """Return the elements with class `btn-secondary` on the current page
        (presumably the year buttons), then pause for one second."""
        years = self.driver.find_elements(By.CLASS_NAME, "btn-secondary")
        time.sleep(1)
        return years

    def get_articles(self, year, law_number):
        """Click through the article entries of the loaded law and collect their text.

        An entry is skipped when its label is empty, mentions "allegato"
        (case-insensitive), or is neither numeric, nor contains "art"/"Art",
        nor contains one of `MULTIPLICATIVES`. Entries that cannot be clicked
        are skipped as well.

        Args:
            year: Stored as-is in the `year` field of every returned record.
            law_number: Used to build `law_source` ("Legge <law_number>").

        Returns:
            A list of dicts with keys `law_source`, `year`, `law_number` (the
            entry label) and `law_text`. If no text container is found for an
            article, an error is printed and the articles collected so far
            are returned.
        """
        print(f"{year} - {law_number}")
        articles_list = []

        # Fixed pause before querying the page (presumably to let it finish loading)
        time.sleep(1)

        # Article entries listed in the navigation tree
        albero = self.driver.find_element(By.ID, "albero")
        articles = albero.find_elements(By.CLASS_NAME, "numero_articolo")

        for article in articles:
            article_number = article.text.strip()

            if (article_number == "") or ("allegato" in article_number.lower()) or ((not article_number.isdigit()) and ("art" not in article_number) and ("Art" not in article_number) and (not any(multiplicative in article_number for multiplicative in self.MULTIPLICATIVES))):
                print("Out ", article.text)
                continue

            try:
                time.sleep(1)
                article.click()
                time.sleep(1)
            except Exception:
                # Entry not clickable: skip it (Ctrl-C is no longer swallowed)
                continue

            # Prefer the comma container, fall back to the plain-text container
            text = self.driver.find_elements(By.CLASS_NAME, "art-commi-div-akn")
            if not text:
                text = self.driver.find_elements(By.CLASS_NAME, "art-just-text-akn")
            if not text:
                print("Error in article: ", article.text)
                return articles_list

            text = text[0]

            law_content = text.get_attribute('outerHTML')
            law_content = law_content.replace("\n", "")
            law_content = clearLawContent(law_content)

            articles_list.append({
                "law_source": f"Legge {law_number}",
                "year": year,
                "law_number": article_number,
                "law_text": law_content
            })

            print(articles_list[-1])

        return articles_list

    def navigate_to_page(self, url):
        """Load `url` in the browser."""
        self.driver.get(url)

    def close(self):
        """Quit the browser session."""
        self.driver.quit()
 
# Check if the laws have already been scraped
# Resume support: keys of the form "<year>/<index>" for law pages already scraped
scraped_pages_set = set()
df_all_laws = pd.DataFrame()
if os.path.exists(ALL_ITALIAN_LAWS_SCRAPED_JSON):
    # The checkpoint holds a JSON list and must be parsed: building the set
    # from the raw file text yields single characters, so no key ever matches
    # and every law is scraped again.
    scraped_pages_set = set(read_list_from_json(ALL_ITALIAN_LAWS_SCRAPED_JSON))

    # Guard the read so a missing CSV does not abort the run
    if os.path.exists(ALL_ITALIAN_LAWS_CSV):
        df_all_laws = read_df_from_csv(ALL_ITALIAN_LAWS_CSV)

scraper = NormattivaAllLawsScraper(CHROME_DRIVERS_PATH, headless=True)
data = []

try:
    scraper.navigate_to_page("https://www.normattiva.it/ricerca/elencoPerData")
    years = scraper.get_years()

    # The stop value is exclusive, so 1860 is needed to include 1861
    for year in range(2024, 1860, -1): # from 2024 down to 1861
        scraper.driver.get("https://www.normattiva.it/ricerca/avanzata")
        scraper.fill_field("annoProvvedimento", year)
        scraper.driver.find_element(By.CSS_SELECTOR, "[type*='submit']").click()
        time.sleep(0.5)

        validPage = True
        curr_page = 1
        law_urls = []

        # Walk the paginated result list and collect the URL of every "LEGGE" entry
        while validPage:
            laws = scraper.driver.find_elements(By.CSS_SELECTOR, "[title^='Dettaglio atto']")

            for law in laws:
                law_url = law.get_attribute('href')
                if law_url and "LEGGE" in law.text:
                    law_urls.append(law_url)

            # Try a new page of laws
            curr_page += 1
            pages_link = scraper.driver.find_elements(By.CLASS_NAME, "page-link")
            validPage = False
            for page in pages_link:
                if page.text == str(curr_page):
                    validPage = True
                    page.click()
                    time.sleep(0.5)
                    break

        # Visit each law's detail page and scrape the articles
        for i, url in enumerate(law_urls):
            if f"{year}/{i}" in scraped_pages_set:
                continue
            scraper.driver.get(url)

            page_title = scraper.driver.title
            law_number = extractArticleNumber(page_title)

            articles = scraper.get_articles(year, law_number)

            # Append individual articles to data, not the entire list
            data.extend(articles)

            # Checkpoint after every law so an interrupted run can resume
            scraped_pages_set.add(f"{year}/{i}")
            write_list_to_json(list(scraped_pages_set), ALL_ITALIAN_LAWS_SCRAPED_JSON)

            # Create a temporary DataFrame for the new articles and append to the main DataFrame
            df_tmp = pd.DataFrame(articles)  # Not `data` but `articles`
            df_all_laws = pd.concat([df_all_laws, df_tmp], ignore_index=True)

            save_df_to_csv(df_all_laws, ALL_ITALIAN_LAWS_CSV)
finally:
    # Always quit the browser so no Chrome/chromedriver process is left behind,
    # even when scraping raises.
    scraper.close()

# df_all_laws now contains all the data
df_all_laws.head()


SessionNotCreatedException: Message: session not created: Chrome failed to start: exited normally.
  (session not created: DevToolsActivePort file doesn't exist)
  (The process started from chrome location /opt/google/chrome/chrome is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
Stacktrace:
#0 0x5638ee3aaf33 <unknown>
#1 0x5638ee0a2ce6 <unknown>
#2 0x5638ee0d76fa <unknown>
#3 0x5638ee0d36ed <unknown>
#4 0x5638ee11c71c <unknown>
#5 0x5638ee10fc53 <unknown>
#6 0x5638ee0e0db3 <unknown>
#7 0x5638ee0e177e <unknown>
#8 0x5638ee37086b <unknown>
#9 0x5638ee374885 <unknown>
#10 0x5638ee35e181 <unknown>
#11 0x5638ee375412 <unknown>
#12 0x5638ee34225f <unknown>
#13 0x5638ee399528 <unknown>
#14 0x5638ee399723 <unknown>
#15 0x5638ee3aa0e4 <unknown>
#16 0x7fd67f375609 start_thread


## After all D.Lgs. have been extracted: merge the previous DataFrames into the laws.csv file

In [3]:

df_cos = pd.read_csv(COSTITUZIONE_CSV)
#df_cp = pd.read_csv(CODICE_PENALE_CSV)
#df_cpp = pd.read_csv(CPP_CSV)
df_cpa = pd.read_csv(CPA_CSV)
df_dlgs = pd.read_csv(DLGS_CSV)

df_laws = pd.read_csv(LAWS_CSV)

final_df = pd.concat([df_cos, df_cpa, df_laws, df_dlgs], ignore_index=True)

# LAWS_CSV is both an input and the output of this cell, so running it again
# would append the same articles once more. Drop exact repeats; rows are
# compared as strings because the same value may be parsed as a number in one
# source file and as text in another.
final_df = final_df.loc[~final_df.astype(str).duplicated()].reset_index(drop=True)

final_df.to_csv(LAWS_CSV, index=False)

In [None]:
import json
# Parse the invalid-laws JSON file into `data`
with open('/home/antonelli2/Thesis/Documents/Generated/invalid_laws.json', 'r') as file:
    raw_json = file.read()
data = json.loads(raw_json)