In [14]:
import os
import json
from bs4 import BeautifulSoup

# Name of the input JSON file
input_file = "input_file.json"
# Output directory for the generated .txt files
output_dir = "output_tables"

# Create the output directory if it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# Introductory prompt text written before each HTML table.
# It is emitted verbatim into every output file, so any edit changes the prompt.
intro_text = """Consider the claims being in the form |{Specification, Specification, …}, Measure, Outcome|

Claims must be extracted according to the following format:
|{Specification, Specification, …}, Measure, Outcome|
Specification: |name, value| pair describing the details of an experiment
E.g.: |dataset, Spider|, or |LLM, Llama27b|
Measure: metric or measure used to evaluate the experiment
E.g.: F1-measure
Outcome: outcome value related to metric 
E.g.: 0.89

The format have to follows this examples:
Claim 0: |{|Model Type, General LLM|, |Model Name, ChatGPT-3.5-turbo|, |Parameter Size, 175B|, |Dataset, Spider dev|, |Difficulty Level, 1|}, Execution Match , 0.760|
Claim 1:| .... |

Extract all the claims, even with empty value for the measure in that exact form from the following html table:
"""

# Sentence that introduces the table caption as context
caption_context_text = "\nKnow that the context where the table was mentioned is the following:\n\n"

# NOTE: no lead-in sentence is defined for the references; they are written after a blank line with no introduction.

def build_clean_html_table(data):
    """Build a minimal HTML table from rows of HTML cell fragments.

    Each fragment is parsed on its own and only the stripped text of its
    first ``<td>`` or ``<th>`` element is kept; attributes and nested markup
    are discarded. Header cells stay ``<th>`` so the header/data distinction
    survives the clean-up.

    Args:
        data: Iterable of rows, each row an iterable of HTML cell strings
            (for example ``'<td class="x">0.76</td>'``).

    Returns:
        The rebuilt table serialized as one HTML string (not prettified).
        Cells that are not markup, or that contain neither ``<td>`` nor
        ``<th>``, are emitted as ``<td>N/A</td>``.
    """
    soup = BeautifulSoup("<table></table>", "html.parser")
    table = soup.table

    for row in data:
        tr = soup.new_tag("tr")
        for cell in row:
            source_cell = None
            # Only markup can be parsed: a null or numeric cell is treated as
            # an invalid cell instead of raising inside the parser.
            if isinstance(cell, (str, bytes)):
                # Accept header cells too; looking only for <td> turned every
                # <th> into the "N/A" placeholder and lost the header text.
                source_cell = BeautifulSoup(cell, "html.parser").find(["td", "th"])

            if source_cell is not None:
                # Fresh tag with the same name: keeps the text, drops attributes.
                clean_cell = soup.new_tag(source_cell.name)
                clean_cell.string = source_cell.get_text(strip=True)
            else:
                # Placeholder for invalid cells so the row keeps its column count.
                clean_cell = soup.new_tag("td")
                clean_cell.string = "N/A"
            tr.append(clean_cell)
        table.append(tr)

    return str(soup)  # plain serialization, no prettify

def process_json_tables(input_path, output_path):
    """Write one prompt file per table found in a JSON file.

    The JSON document is expected to be an object mapping a table id to an
    entry with a ``"table"`` list and optional ``"caption"`` and
    ``"references"`` fields. For each valid entry, ``<table_id>.txt`` is
    written into ``output_path`` containing the intro text, the cleaned HTML
    table, the caption (if any) and the references (if any).

    Args:
        input_path: Path of the JSON file to read.
        output_path: Directory receiving the ``.txt`` files; created if it
            does not exist.

    Returns:
        None. Progress and warnings are printed.

    Note:
        ``json.JSONDecodeError`` and ``KeyError`` are reported with a message
        and stop processing; file-system errors (for example a missing input
        file) are not caught and propagate to the caller.
    """
    try:
        # Read the original JSON file
        with open(input_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        # Create the destination here so the function does not depend on the
        # directory having been prepared elsewhere.
        os.makedirs(output_path, exist_ok=True)

        for table_id, table_content in data.items():
            # Guard clause: skip entries without a usable "table" list
            if not (
                isinstance(table_content, dict)
                and isinstance(table_content.get("table"), list)
            ):
                print(f"Avviso: Nessuna tabella valida trovata per ID {table_id}.")
                continue

            html_table = build_clean_html_table(table_content["table"])

            # A missing, null or non-text caption is treated as "no caption"
            caption = table_content.get("caption")
            caption = caption.strip() if isinstance(caption, str) else ""

            # Normalize references: null -> empty, a bare string -> one entry
            # (iterating a string would otherwise yield single characters),
            # then drop non-text and blank items.
            raw_references = table_content.get("references") or []
            if isinstance(raw_references, str):
                raw_references = [raw_references]
            references = [
                ref.strip()
                for ref in raw_references
                if isinstance(ref, str) and ref.strip()
            ]
            references_text = "\n".join(references)

            # Output file for this table (the id is used verbatim as file name)
            output_file = os.path.join(output_path, f"{table_id}.txt")

            with open(output_file, "w", encoding="utf-8") as outfile:
                outfile.write(intro_text + "\n" + html_table + "\n")
                if caption:  # add the caption only when present
                    outfile.write(caption_context_text + caption)
                if references:  # add the references only when present
                    outfile.write("\n\n" + references_text)

            print(f"Tabella elaborata e salvata in: {output_file}")
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Errore durante l'elaborazione del file JSON: {e}")

# Run the conversion using the input file and output directory configured above
process_json_tables(input_file, output_dir)


Tabella elaborata e salvata in: output_tables\2405.17129_S4.T1.txt
Tabella elaborata e salvata in: output_tables\2405.17129_A6.T4.txt
Tabella elaborata e salvata in: output_tables\2405.17129_A7.T5.txt
