# Extraction of Laws References from the quiz questions

In [40]:
import pandas as pd
import json
import os
import re

isLinux = False

default_linux_path = os.getcwd() + "/work/"
default_windows_path = os.getcwd().replace("\\Data", "\\Documents\\Downloaded\\")
default_path = default_linux_path if isLinux else default_windows_path
DEFAULT_SAVE_DIR = default_path.replace("\\Downloaded", "\\Generated")

df = pd.read_csv(DEFAULT_SAVE_DIR + 'quiz_merged.csv')

df.head()


def extract_data(df, regex_patterns, source):
    extracted_data = []
    no_match_data = []

    for i, row in df.iterrows():
        question = row['Question']
        match = None

        for pattern in regex_patterns:
            match = re.findall(pattern, question)
            if match:
                break

        if match:
            # Search for any comma reference
            comma = re.findall(r'comma ([^ ^\.^,]+)', question)
            if comma:
                comma = comma[0].strip()
            else:
                comma = None
            
            # Catch of set of laws not single
            if source == 'D. Lgs.':
                for elem in match:
                    if type(elem) is tuple:
                        reference = elem[0]
                        groupSource = elem[1].replace(' del ', '/')
                    elif type(elem) is str:
                        reference = None
                        groupSource = elem.replace(' del ', '/')                            
                    else:
                        raise Exception(f'Unexpected type {type(elem)}')
                    
                    extracted_data.append({
                        'Source': groupSource,
                        'Comma': comma,
                        'Reference': reference,
                        'Question id': row['Index'],
                    })
            else:
                reference = match[0]
                extracted_data.append({
                    'Source': source,
                    'Comma': comma,
                    'Reference': reference,
                    'Question id': row['Index'],
                })
        else:
            no_match_data.append(row.to_dict())

    df_cp = pd.DataFrame(extracted_data)
    df_no_match = pd.DataFrame(no_match_data)

    return df_cp, df_no_match

## Extraction of Penal Code's References

In [41]:
patterns = [
    r'[Aa]rtt?\.( \d+).{0,25}c\.p\.[^p^a]',
    r"(?:articolo|art\.)( ?\d+) del [Cc]odice [Pp]enale",
    r'[Cc]odice penale all\'art. (\d+)'
]

df_cp, df = extract_data(df, patterns, "c.p.")

print("CP found: ", df_cp.shape)
print("Still unmatched: ", df.shape)
df_cp.head()

CP found:  (153, 4)
Still unmatched:  (967, 5)


Unnamed: 0,Source,Comma,Reference,Question id
0,c.p.,,240,224
1,c.p.,,266,225
2,c.p.,,24,226
3,c.p.,,7,227
4,c.p.,,19,228


## Extraction of Administrative Penal Code's References

In [42]:
patterns = [
    r'[Aa]rtt?\.( \d+).{0,30}[Cc]\.p\.a[\.]?',
    r'(?:[Aa]rticolo|[Aa]rtt?\.)( \d+).{0,30} del [Cc]odice del [Pp]rocesso [Aa]mministrativo'
]

df_cpa, df = extract_data(df, patterns, "c.p.a.")

print("CPA found: ", df_cpa.shape)
print("Still unmatched: ", df.shape)
df_cpa.head()

CPA found:  (58, 4)
Still unmatched:  (909, 5)


Unnamed: 0,Source,Comma,Reference,Question id
0,c.p.a.,,80,2
1,c.p.a.,,42,3
2,c.p.a.,,133,4
3,c.p.a.,,7,9
4,c.p.a.,,34,10


## Extraction of Procedural Penal Code's References

In [43]:
patterns = [
    r'[Aa]rtt?\.( \d+).{0,25}[Cc]\.p\.p[\.]?',
    r'articolo( \d+).{0,25}[Cc]\.p\.p[\.]?',
    r'(?:[Aa]rticolo|[Aa]rtt?\.)( \d+).{0,30} del [Cc]odice di [Pp]rocedura [Pp]enale'
]

df_cpp, df = extract_data(df, patterns, "c.p.p.")

print("CPP found: ", df_cpp.shape)
print("Still unmatched: ", df.shape)
df_cpp.head()

CPP found:  (205, 4)
Still unmatched:  (704, 5)


Unnamed: 0,Source,Comma,Reference,Question id
0,c.p.p.,,57,270
1,c.p.p.,,57,271
2,c.p.p.,,59,272
3,c.p.p.,,71,273
4,c.p.p.,,71,274


## Extraction of Costitution's References

In [44]:
patterns = [
    r'[Aa]rtt?\.?( \d+).{0,25}[Cc]ost'
]

df_cost, df = extract_data(df, patterns, "constitution")

print("Costitution found: ", df_cost.shape)
print("Still unmatched: ", df.shape)
df_cost.head()

Costitution found:  (30, 4)
Still unmatched:  (674, 5)


Unnamed: 0,Source,Comma,Reference,Question id
0,constitution,,117,5
1,constitution,,25,461
2,constitution,,117,616
3,constitution,,118,619
4,constitution,,126,631


## Extraction of Legislative Decree's References (NEED TO CATCH ALSO THE REFERENCE ARTICLE)

In [45]:
patterns = [
    r'decreto legislativo n\.( ?\d+ del \d+)',
    r'[Aa]rtt?\.?( \d+).{0,25}D\.Lgs\.(?: n\.)?( \d+\/\d+)',
    r'[Aa]rtt?\.?( \d+).{0,25}D\.Lgs\. n\.( \d+ del \d+)',
    r'D\.Lgs\.(?: n\.)?( \d+\/ ?\d+)',
]

df_dlgs, df = extract_data(df, patterns, "D. Lgs.")

print("Laws found: ", df_dlgs.shape)
print("Still unmatched: ", df.shape)
df_dlgs.head()

Laws found:  (263, 4)
Still unmatched:  (421, 5)


Unnamed: 0,Source,Comma,Reference,Question id
0,33/2013,,5,6
1,50/2016,,23,14
2,165/2001,,34,24
3,165/2001,,33,26
4,165/2001,1.0,16,28


## Extraction of Laws References

In [46]:
patterns = [
    r'[Aa]rtt?\.?( \d+).{0,25}[lL]\. ?n\.( \d+\/\d+)',
    r'[Aa]rtt?\.?( \d+).{0,25}[lL]\.( \d+\/\d+)',
    r'n\.( \d+\/\d+)',
    r'[Ll]egge( \d+\/\d+)',
    r'l\.( \d+\/\d+)',
]

def custom_match(match):
    article = match.group(1)
    comma = match.group(2)
    return article, comma

df_laws, df = extract_data(df, patterns, "Legge")

print("Laws found: ", df_laws.shape)
print("Still unmatched: ", df.shape)
df_laws.head()

Laws found:  (263, 4)
Still unmatched:  (158, 5)


Unnamed: 0,Source,Comma,Reference,Question id
0,Legge,,241/1990,1
1,Legge,,241/1990,7
2,Legge,,"( 9, 400/1988)",8
3,Legge,,"( 16, 241/1990)",11
4,Legge,,"( 3, 241/1990)",12


## Check Missing Rows

In [47]:
# Print Question column of elements that were not matched
for i, row in df.iterrows():
    print(row['Question'])

Nell'espletamento delle procedure semplificate di cui all'art. 36, le stazioni appaltanti garantiscono l'effettiva contendibilità degli affidamenti da parte dei soggetti potenzialmente interessati in aderenza: 
Con riferimento al riesame con esito demolitorio del provvedimento adottato, l'annullamento d'ufficio (art. 21-nonies della l. n.241/1990): 
Svolgere le attività di organizzazione e gestione del personale e di gestione dei rapporti sindacali e di lavoro è una funzione che l'art. 16 del TUPI attribuisce: 
Quanto alla tempistica dell'azione avverso il silenzio delle P.A. (art. 31 Codice processo amm.vo) essa: 
Quale tra i seguenti è uno dei pilastri sui cui si fonda principalmente la riforma del Titolo V della Costituzione operata a partite dal 2001? 
Lo Stato ha competenza, che la Costituzione qualifica esclusiva, nelle materie enumerate al comma 2 dell'art. 117, tra le quali rientra/rientrano: 
Con riferimento alle pronunce giurisdizionali del giudice amministrativo (art. 33 Cod

## Export the data

In [48]:
# Merge the dataframes and clean up the data
df_merged = pd.concat([df_cp, df_cpa, df_cpp, df_cost, df_dlgs, df_laws], ignore_index=True)
df_merged.to_csv(DEFAULT_SAVE_DIR + 'references_merged.csv', index=False)

print(df_merged.shape)
df_merged.head()

(972, 4)


Unnamed: 0,Source,Comma,Reference,Question id
0,c.p.,,240,224
1,c.p.,,266,225
2,c.p.,,24,226
3,c.p.,,7,227
4,c.p.,,19,228
