In [3]:
import os
import re
import json
import pandas as pd

# Platform switch for the path handling below:
# True -> "/"-separated paths, False -> "\\"-separated paths.
isLinux = True

# Input folder: when the working directory contains a "Data" segment, swap it for
# "Documents/Downloaded"; otherwise fall back to <cwd>/Documents/Downloaded.
default_linux_path = os.getcwd().replace("/Data", "/Documents/Downloaded") if "/Data" in os.getcwd() else os.path.join(os.getcwd(), "Documents", "Downloaded")
default_windows_path = os.getcwd().replace("\\Data", "\\Documents\\Downloaded") if "\\Data" in os.getcwd() else os.path.join(os.getcwd(), "Documents", "Downloaded")
default_path = default_linux_path if isLinux else default_windows_path

# Output folder: same location with "Downloaded" replaced by "Generated".
DEFAULT_SAVE_DIR = default_path.replace("/Downloaded", "/Generated") if isLinux else default_path.replace("\\Downloaded", "\\Generated")
# Built from default_path (not default_linux_path) so the isLinux switch also
# governs where the input file is looked up.
SENTENZE_JSONL = os.path.join(default_path, 'corte_giacomo_rulings.jsonl')
OUTPUT_JSONL = os.path.join(DEFAULT_SAVE_DIR, 'output_corte_rulings.jsonl')
MISSING_JSONL = os.path.join(DEFAULT_SAVE_DIR, 'missing_corte_rulings.jsonl')

# Load every JSONL record into memory for a quick look at the data.
output_dataset = []
# Explicit UTF-8: the rulings contain accented characters, and the platform
# default encoding is not guaranteed to be UTF-8.
with open(SENTENZE_JSONL, 'r', encoding='utf-8') as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline), which json.loads rejects.
        if not line.strip():
            continue
        output_dataset.append(json.loads(line.strip()))

df = pd.DataFrame(output_dataset)
print(df.head())

    judg_id                                           epigrafe  \
0  161/1982  Nei giudizi di legittimità costituzionale dell...   
1    9/1965  Nei giudizi riuniti di legittimità costituzion...   
2    2/2008  Nei giudizi di legittimità costituzionale dell...   
3  176/1982  Nei giudizi riuniti di legittimità costituzion...   
4   73/1990  Nel giudizio di legittimità costituzionale del...   

                                   ritenuto_in_fatto  \
0  . 1.1. - Con ricorso, notificato il 21 gennaio...   
1  . 1. - Nel corso di un procedimento penale a c...   
2  1. – Il Tribunale di Perugia in composizione m...   
3  . 1. - Con tre ordinanze sostanzialmente coinc...   
4  . 1. - Nel corso di un giudizio amministrativo...   

                              considerato_in_diritto  \
0  . 16.1. - Nelle motivazioni delle ordinanze de...   
1  . 1. - Tanto l'ordinanza del Pretore di Lendin...   
2                                               None   
3  . 1. - Le tre ordinanze del Consiglio d

In [4]:
# Function to extract law references from text
def extract_law_references(text):
    """Collect the legal citations found in ``text``.

    A fixed list of regular expressions is applied one after another
    (case-sensitively). Results are grouped by pattern, in the order the
    patterns are listed, and within one pattern by position in the text.
    The same citation may therefore be reported by more than one pattern.

    Args:
        text: The string to scan. Any falsy value (``None``, ``""``) yields
            an empty list.

    Returns:
        A list of dicts with the keys ``art``, ``comma``, ``num``, ``anno``
        and ``organo`` (in that order). Parts a pattern does not capture are
        ``None``. ``anno`` holds the captured text as-is, so it is usually a
        full date such as ``"20 marzo 1975"`` rather than a bare year.
    """
    if not text:
        return []

    def _ref(art, comma, num, anno, organo):
        # Single place that fixes the key order of every emitted reference.
        return {'art': art, 'comma': comma, 'num': num, 'anno': anno, 'organo': organo}

    # Ordered (regex, builder) pairs. Each builder converts one match into the
    # list of reference dicts that match stands for.
    rules = (
        # Basic Law Reference (Article + Law Number + Year) DONT TOUCH
        (r'art\. (\d+).{0,15}legge (.{5,15}).{0,8}n\. (\d+)',
         lambda m: [_ref(m.group(1), None, m.group(3), m.group(2), 'stato')]),

        # Article + several "commi"; the act type is taken from the text itself
        (r'articolo\s*(\d+),\s*commi\s*(?:\w+)\s*(?:e\s*\w+),?\s*(?:del|della)?\s*(d\.?l\.?|legge|d\.?P\.?R\.?)\s+(\d{1,2}\s+\w+\s+\d{4})\s*n\.\s*(\d+)',
         lambda m: [_ref(m.group(1), 'Multiple', m.group(4), m.group(3), m.group(2))]),

        # Short form: article + law number only, no date
        (r'art\.?\s*(\d+)\s*(?:u\.c\.)?\s*legge\s*n\.\s*(\d+)',
         lambda m: [_ref(m.group(1), None, m.group(2), None, 'stato')]),

        # d.P.R. with date and number, no article
        (r'd\.?P\.?R\.?\s+(\d{1,2}\s+\w+\s+\d{4})\s*n\.\s*(\d+)',
         lambda m: [_ref(None, None, m.group(2), m.group(1), 'd.P.R.')]),

        # Article of the Constitution with two "commi"
        (r'art\.?\s*(\d+)\s*commi\s*(\d+)\s*e\s*(\d+)\s*Cost\.?',
         lambda m: [_ref(m.group(1), f'{m.group(2)} e {m.group(3)}', None, None, 'Cost')]),

        # d.l. converted into a law: emits the decree first, then the converting law
        (r'd\.?l\.?\s+(\d{1,2}\s+\w+\s+\d{4})\s*n\.\s*(\d+)\s*,\s*conv\.\s*in\s*legge\s+(\d{1,2}\s+\w+\s+\d{4})\s*n\.\s*(\d+)',
         lambda m: [_ref(None, None, m.group(2), m.group(1), 'd.l.'),
                    _ref(None, None, m.group(4), m.group(3), 'legge')]),

        # Several articles of one law; ``art`` keeps the raw list, e.g. "1, 2 e 3"
        (r'artt?\.?\s*(\d+(?:,\s*\d+)*\s*(?:e\s*\d+)?)\s*legge\s+(\d{1,2}\s+\w+\s+\d{4})\s*n\.\s*(\d+)',
         lambda m: [_ref(m.group(1), None, m.group(3), m.group(2), 'stato')]),
    )

    found = []
    for regex, build in rules:
        for hit in re.finditer(regex, text):
            found.extend(build(hit))
    return found

# Function to process each JSONL record
def process_record(record, missing_fields_list):
    """Replace the text sections of one ruling with their extracted references.

    Args:
        record: Mapping for one JSONL line. The sections ``epigrafe``,
            ``ritenuto_in_fatto``, ``considerato_in_diritto`` and
            ``decisione`` are read with ``.get``, so absent keys are allowed.
        missing_fields_list: List mutated in place. For every section that
            yields no references, a ``(judg_id, section_name)`` tuple is
            appended.

    Returns:
        A dict with ``judg_id`` followed by one key per section, each holding
        the list returned by ``extract_law_references`` for that section.
    """
    sections = ("epigrafe", "ritenuto_in_fatto", "considerato_in_diritto", "decisione")

    # Extract everything first, then report the empty sections in section order.
    refs_by_section = {
        name: extract_law_references(record.get(name, "")) for name in sections
    }

    for name in sections:
        if not refs_by_section[name]:
            missing_fields_list.append((record.get("judg_id"), name))

    return {"judg_id": record.get("judg_id"), **refs_by_section}

# Main function to read JSONL, process, and write output
def process_jsonl_file(input_file, output_file, missing_file):
    """Run ``process_record`` over a JSONL file and write the results.

    Args:
        input_file: Path of the UTF-8 JSONL file to read, one JSON object per
            line. Blank or whitespace-only lines are skipped.
        output_file: Path of the JSONL file to write; one processed record per
            line. An existing file is overwritten.
        missing_file: Path of the report listing the entries collected in the
            missing-fields list, one per line. An existing file is overwritten.

    Returns:
        The list of ``(judg_id, field_name)`` tuples that ``process_record``
        appended while processing the input.

    Raises:
        OSError: If the input cannot be opened or an output cannot be created.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    missing_fields_list = []

    # Make sure the destination folders exist; open(..., 'w') does not create them.
    for target in (output_file, missing_file):
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line = line.strip()
            if not line:
                # A blank line (e.g. trailing newline at EOF) is not a record.
                continue
            record = json.loads(line)
            processed_record = process_record(record, missing_fields_list)
            json.dump(processed_record, outfile)
            outfile.write("\n")  # Write each processed record as a line

    # Save the missing fields list to a file.
    # NOTE(review): each line is the Python repr of a (judg_id, field) tuple,
    # not JSON; the format is kept as-is in case something already parses it.
    with open(missing_file, 'w', encoding='utf-8') as missing_outfile:
        for missing_entry in missing_fields_list:
            missing_outfile.write(f"{missing_entry}\n")

    # Returned so the caller can inspect the entries without re-reading the file
    return missing_fields_list

# Usage
# Bind the notebook-level paths to the names used for this run
input_file, output_file, missing_file = SENTENZE_JSONL, OUTPUT_JSONL, MISSING_JSONL

# Execute the extraction and keep the returned missing-field entries for inspection
missing_fields = process_jsonl_file(input_file, output_file, missing_file)
