# LLM Prompt Generator
This script reads tables from `input_file.json` and generates, for each table, a prompt to send to GPT/Gemini/Llama/\<insert LLM\> ...

In [25]:
import os
import json

# Name of the input JSON file
input_file = "input_file.json"
# Output directory for the generated .txt files
output_dir = "prompts"

# Create the output directory if it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# Introductory text written before each table in the generated prompt.
# It defines the claim format, shows one worked example (context, HTML table,
# first extracted claim) and ends with the extraction instructions.
intro_text = """Consider the claims being in the form |{Specification, Specification, …}, Measure, Outcome|

Claims must be extracted according to the following format:
|{Specification, Specification, …}, Measure, Outcome|
Specification: |name, value| pair describing the details of an experiment
E.g.: |dataset, Spider|, or |LLM, Llama27b|
Measure: metric or measure used to evaluate the experiment
E.g.: F1-measure
Outcome: outcome value related to metric 
E.g.: 0.89

The format have to follows this examples:

Suppose the table is mentioned in this context:
Table 1.Benchmark Results of Execution Match of all Models we tested on the "dev" SPIDER dataset
In our experimentation, we organized the models into three distinct groups as illustrated in Table 1: general purpose LLMs, Code-Specific LLMs, and Sequence-to-Sequence models. Table 1 further presents the Execution Match score on the SPIDER dataset for each studied LLM and for each of the four difficulty levels. Note that for all LLMs, we run our experiments with both Type I and Type II prompts (cf. 4.5.2), and we always select best performance. The overall winner is the GPT-4 + DIN approach which emerged as the most effective choice across all General LLMs. Furthermore, when focusing on models with fewer than 7 billion parameters, ALPACA stood out as the top-performing option following prompt optimization.

So for the following html table:
<table border="1"> <thead> <tr> <th>Model Type</th> <th>Model Name</th> <th>Parameter Size</th> <th>Level 1</th> <th>Level 2</th> <th>Level 3</th> <th>Level 4</th> <th>All</th> </tr> </thead> <tbody> <tr> <td rowspan="9">General LLM</td> <td>ChatGPT-3.5-turbo</td> <td>175B</td> <td>0.760</td> <td>0.799</td> <td>0.408</td> <td>0.493</td> <td>0.623</td> </tr> <tr> <td>DIN-SQL+GPT-4</td> <td>1.76T</td> <td>0.861</td> <td>0.866</td> <td>0.700</td> <td>0.654</td> <td><b>0.762</b></td> </tr> <tr> <td>CodeX-Davinci-3</td> <td>175B</td> <td>0.730</td> <td>0.799</td> <td>0.392</td> <td>0.382</td> <td>0.570</td> </tr> <tr> <td>MPT-7B-instruct</td> <td>7B</td> <td>0.262</td> <td>0.381</td> <td>0.117</td> <td>0.091</td> <td>0.205</td> </tr> <tr> <td>ALPACA</td> <td>7B</td> <td>0.311</td> <td>0.460</td> <td>0.192</td> <td>0.083</td> <td><b>0.242</b></td> </tr> <tr> <td>KOALA</td> <td>7B</td> <td>0.195</td> <td>0.218</td> <td>0.017</td> <td>0.071</td> <td>0.131</td> </tr> <tr> <td>OpenAssistant-pythia</td> <td>12B</td> <td>0.202</td> <td>0.322</td> <td>0.025</td> <td>0.069</td> <td>0.157</td> </tr> <tr> <td>ORCA-mini</td> <td>7B</td> <td>0.243</td> <td>0.280</td> <td>0.101</td> <td>0.076</td> <td>0.169</td> </tr> <tr> <td>LLaMA-2</td> <td>7B</td> <td>0.225</td> <td>0.393</td> <td>0.101</td> <td>0.081</td> <td>0.192</td> </tr> <tr> <td rowspan="4">Code Specific LLM</td> <td>CodeGen2</td> <td>7B</td> <td>0.375</td> <td>0.498</td> <td>0.167</td> <td>0.066</td> <td>0.257</td> </tr> <tr> <td>Starcoder</td> <td>15.5B</td> <td>0.584</td> <td>0.628</td> <td>0.275</td> <td>0.208</td> <td>0.410</td> </tr> <tr> <td>Vicuna</td> <td>7B</td> <td>0.060</td> <td>0.134</td> <td>0.008</td> <td>0.042</td> <td>0.064</td> </tr> <tr> <td>nsql</td> <td>6B</td> <td>0.772</td> <td>0.732</td> <td>0.608</td> <td>0.277</td> <td><b>0.548</b></td> </tr> <tr> <td rowspan="3">Seq-to-Seq Model</td> <td>T5(tscholak/cxmefzzi)</td> <td>3B</td> <td>0.828</td> <td>0.782</td> <td>0.650</td> <td>0.434</td> 
<td>0.641</td> </tr> <tr> <td>PICARD+T5</td> <td>3B</td> <td>0.790</td> <td>0.799</td> <td>0.558</td> <td>0.502</td> <td>0.652</td> </tr> <tr> <td>RESDSQL</td> <td>3B</td> <td>0.872</td> <td>0.857</td> <td>0.666</td> <td>0.696</td> <td><b>0.775</b></td> </tr> </tbody></table>

The claims are:
Claim 0: |{|Model Type, General LLM|, |Model Name, ChatGPT-3.5-turbo|, |Parameter Size, 175B|, |Dataset, Spider dev|, |Difficulty Level, 1|}, Execution Match , 0.760|
Claim 1:| .... |

Extract all the claims presented in the table and its associated context (including references, captions, and footnotes) for the following html table. Use the provided references, captions, and footnotes to deduce the meanings of acronyms wherever possible. If the meanings are unclear or conflicting, include a note such as '[Unresolved]' instead of making assumptions. For fields that are missing or empty, include them with a placeholder such as 'N/A' while maintaining the exact format.
If multiple claims seem to refer to the same data point but differ slightly, include both claims and add a comment field noting the ambiguity.
"""

# Sentence that introduces the caption context in the prompt
caption_context_text = "\n\nKnow that the context where the table was mentioned is the following:\n"

# Sentence that introduces the references in the prompt
references_context_text = "\n\nThe table is referenced in the paper as follows:\n"

# Closing instruction appended at the end of every prompt: requested output
# format plus a claim-count sanity check for the model.
claims_output_format_text = "\n\nProvide the results in a file in .txt format. And remember! To check if you did correct, there should be N claims where N is the number of numeric values in the table."

In [None]:
def _collapse_whitespace(text):
    """Return *text* with every run of whitespace reduced to one space."""
    return " ".join(text.split())


def _flatten_entries(entries):
    """Merge optional text snippets into one whitespace-normalized line.

    Accepts a list/tuple of strings, a bare string (treated as a single
    entry) or anything else (e.g. ``None`` from a JSON ``null``), which
    yields an empty string. Non-string items and blank items are ignored.
    """
    if isinstance(entries, str):
        entries = [entries]
    elif not isinstance(entries, (list, tuple)):
        return ""
    return _collapse_whitespace(
        " ".join(entry for entry in entries if isinstance(entry, str))
    )


def process_json_tables(input_path, output_path):
    """Write one prompt file per table found in a JSON file.

    The JSON root is expected to be an object mapping a table id to an
    object with a ``"table"`` HTML string and optional ``"caption"``,
    ``"references"`` and ``"footnotes"`` fields. For every entry with a
    string ``"table"`` value, ``<output_path>/<table_id>.txt`` is written
    with, in order: the module-level ``intro_text``, the table, the
    caption (after ``caption_context_text``), the references (after
    ``references_context_text``), the footnotes, and
    ``claims_output_format_text``. All whitespace runs in the table,
    caption, references and footnotes are collapsed to single spaces.

    Entries without a usable table are reported and skipped; a missing or
    malformed optional field only omits that section instead of aborting
    the whole run.

    Args:
        input_path: Path of the JSON file to read.
        output_path: Directory receiving the ``.txt`` files; created if
            it does not exist.

    Returns:
        None. Progress, warnings and JSON decoding errors are printed.

    Raises:
        OSError: If the input file cannot be opened or an output file
            cannot be written.
    """
    # Keep the try body minimal: only parsing can raise JSONDecodeError.
    try:
        with open(input_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except json.JSONDecodeError as e:
        print(f"Errore durante l'elaborazione del file JSON: {e}")
        return

    if not isinstance(data, dict):
        print(
            "Errore durante l'elaborazione del file JSON: "
            "l'elemento radice deve essere un oggetto."
        )
        return

    # Do not rely on the caller having created the directory beforehand.
    os.makedirs(output_path, exist_ok=True)

    for table_id, table_content in data.items():
        raw_table = (
            table_content.get("table")
            if isinstance(table_content, dict)
            else None
        )
        if not isinstance(raw_table, str):
            print(f"Avviso: Nessuna tabella valida trovata per ID {table_id}.")
            continue

        html_table = _collapse_whitespace(raw_table)

        # The caption is optional: a missing one must not stop the run.
        raw_caption = table_content.get("caption")
        caption = (
            _collapse_whitespace(raw_caption)
            if isinstance(raw_caption, str)
            else ""
        )
        references_text = _flatten_entries(table_content.get("references"))
        footnotes_text = _flatten_entries(table_content.get("footnotes"))

        # Assemble the prompt first so the file is written in one call.
        sections = [intro_text + "\n" + html_table + "\n"]
        if caption:
            sections.append(caption_context_text + caption)
        if references_text:
            sections.append(references_context_text + references_text)
        if footnotes_text:
            sections.append("\n\n" + footnotes_text)
        sections.append(claims_output_format_text)

        output_file = os.path.join(output_path, f"{table_id}.txt")
        with open(output_file, "w", encoding="utf-8") as outfile:
            outfile.write("".join(sections))

        print(f"Tabella elaborata e salvata in: {output_file}")

# Generate one prompt file per table of the configured input JSON.
process_json_tables(input_file, output_dir)