In [1]:
import json
from collections import defaultdict
from bs4 import BeautifulSoup
import pandas as pd

input_file = 'input_file.json'

In [2]:
# Salva tutte le tabelle del JSON di input in un dizionario 'tables'
tables = defaultdict(dict)

with open(input_file, 'r') as file:
    data = json.load(file)
    for key, value in data.items():
        tables[key] = value
print(tables.keys())

dict_keys(['2304.04370_1', '2304.04370_2', '2304.04370_3', '2308.12519_1', '2308.12519_2', '2308.12519_3', '2310.01444_2', '2310.01444_4', '2310.03965_1', '2310.03965_2', '2402.10890_1', '2402.10890_2', '2402.10890_3', '2403.04783_1', '2403.04783_2', '2403.04783_4', '2405.17129_1', '2405.17129_4', '2405.17129_5', '2406.03075_2', '2406.03075_3', '2406.03075_4', '2406.12707_1', '2406.12707_2', '2406.12707_3', '2407.16667_1', '2407.16667_3', '2407.16667_4', '2407.17115_2', '2407.17115_4', '2407.17115_5'])


In [None]:
def extract_claims_from_table(html):
    # Parsing del contenuto HTML
    soup = BeautifulSoup(html, 'html.parser')
    
    # Trova la tabella
    table = soup.find('table')
    
    # Trova tutte le righe della tabella
    rows = table.find_all('tr')
    
    # Estrai il valore della cella in alto a sinistra (prima riga, prima colonna)
    table_specification_name = rows[0].find_all('td')[0].get_text(strip=True)
    
    # Estrai i nomi delle colonne dalla prima riga (escludendo la prima cella)
    headers = [header.get_text(strip=True) for header in rows[0].find_all('td')[1:]]
    
    claims = []
    claim_counter = 0
    
    # Itera sulle righe, a partire dalla seconda riga (escludi la riga di intestazione)
    for row in rows[1:]:
        cells = row.find_all('td')
        if len(cells) < 2:  # Salta le righe con meno di due celle
            continue
        
        # Estrai il valore della prima cella della riga corrente
        row_specification = cells[0].get_text(strip=True)
        
        # Crea una claim per ogni misura
        for i, value_cell in enumerate(cells[1:]):  # Escludi la prima cella
            measure_name = headers[i]  # Nome della misura dalla prima riga
            value = value_cell.get_text(strip=True)
            claims.append(f"Claim {claim_counter}: |{{|{table_specification_name}, {row_specification}|}} {measure_name}, {value}|")
            claim_counter += 1

    return claims

#|{Specification, Specification, …}, Measure, Outcome|

# HTML di esempio
table_html = tables['2405.17129_1']['table']  # Inserisci qui il tuo HTML completo

claims = extract_claims_from_table(table_html)

# Stampa le claims
for claim in claims:
    print(claim)


In [10]:
html_table = tables['2304.04370_1']['table']
soup = BeautifulSoup(html_table, 'html.parser')
rows = soup.find_all('tr')

headers_cell = rows[0].find_all(['th', 'td'])

# Intestazioni
headers = []
for cell in headers_cell:
    if 'colspan' in cell.attrs:
        for subheader in rows[1].find_all(['th','td'])[:int(cell['colspan'])]:
            headers.append(f"{cell.text.strip()} {subheader.text.strip()}")
    else:
        headers.append(cell.text.strip())

# Dati
data = []
for row in rows[2:]:
    data.append([cell.text.strip() for cell in row.find_all('td')])

# Creazione DataFrame
df = pd.DataFrame(data, columns=headers)

# Stampa il DataFrame
print(df.columns)


Index(['Metrics', 'GPT-3.5-turbo Zero', 'GPT-3.5-turbo Few', 'Claude-2 Zero',
       'Claude-2 Few', 'GPT-4 Zero', 'GPT-4 Few'],
      dtype='object')


In [12]:
def extract_claims_from_dataframe(df):
    claims = []
    claim_counter = 0
    
    # Assumiamo che la prima colonna contenga la Specification
    # e le colonne successive contengano le misure
    table_specification_name = df.columns[0]  # Il nome della prima colonna
    
    for idx, row in df.iterrows():
        row_specification = row[0]  # La prima colonna è la Specification per ogni riga
        
        # Per ogni misura nella riga, creiamo una claim
        for i, measure_value in enumerate(row[1:]):  # Escludiamo la prima colonna (Specification)
            measure_name = df.columns[i + 1]  # Il nome della misura (dalla colonna del DataFrame)
            value = measure_value
            claims.append(f"Claim {claim_counter}: |{{|{table_specification_name}, {row_specification}|}} {measure_name}, {value}|")
            claim_counter += 1
    
    return claims


# Estrai le claims dal DataFrame
claims = extract_claims_from_dataframe(df)

# Stampa le claims
for claim in claims:
    print(claim)

Claim 0: |{|Metrics, CLIP Score|} GPT-3.5-turbo Zero, 0.0|
Claim 1: |{|Metrics, CLIP Score|} GPT-3.5-turbo Few, 0.0|
Claim 2: |{|Metrics, CLIP Score|} Claude-2 Zero, 0.0|
Claim 3: |{|Metrics, CLIP Score|} Claude-2 Few, 0.2543|
Claim 4: |{|Metrics, CLIP Score|} GPT-4 Zero, 0.0|
Claim 5: |{|Metrics, CLIP Score|} GPT-4 Few, 0.3055|
Claim 6: |{|Metrics, BERT Score|} GPT-3.5-turbo Zero, 0.1914|
Claim 7: |{|Metrics, BERT Score|} GPT-3.5-turbo Few, 0.3820|
Claim 8: |{|Metrics, BERT Score|} Claude-2 Zero, 0.2111|
Claim 9: |{|Metrics, BERT Score|} Claude-2 Few, 0.5038|
Claim 10: |{|Metrics, BERT Score|} GPT-4 Zero, 0.2076|
Claim 11: |{|Metrics, BERT Score|} GPT-4 Few, 0.6307|
Claim 12: |{|Metrics, ViT Score|} GPT-3.5-turbo Zero, 0.2437|
Claim 13: |{|Metrics, ViT Score|} GPT-3.5-turbo Few, 0.7497|
Claim 14: |{|Metrics, ViT Score|} Claude-2 Zero, 0.4082|
Claim 15: |{|Metrics, ViT Score|} Claude-2 Few, 0.5416|
Claim 16: |{|Metrics, ViT Score|} GPT-4 Zero, 0.5058|
Claim 17: |{|Metrics, ViT Score|} 

  row_specification = row[0]  # La prima colonna è la Specification per ogni riga
