In [4]:
import json 
import os
import random
from pprint import pprint
import re

JSON_DIR = '../lucenehw/urls_htmls_tables/all_tables'
DEST_DIR = 'improved_json/'

### Empty JSON remover
This function deletes useless empty json file.
It checks if the file is empty or if the only content is a field `INFO` that usually says `"There are no tables in this article"`  

### Define JSON to sample here
Random sampling

Rerun multiple times this section to change sampled files, otherwise never run again this section if you're working with this first specific sampled files

In [54]:
all_files = [file for file in os.listdir(JSON_DIR) if os.path.isfile(os.path.join(JSON_DIR, file))]

sampled_files = random.sample(all_files, min(100, len(all_files)))
sampled_files

['2407.17788v1.json',
 '2410.04960.json',
 '2409.01887v1.json',
 '2306.03013.json',
 '2409.11219.json',
 '2306.05706.json',
 '2010.12229.json',
 '2010.07427.json',
 '2209.00564.json',
 '2409.16016.json',
 '2409.09515.json',
 '2304.04312v1.json',
 '226_arXiv2407.04925.json',
 '2108.06912.json',
 '2010.13058.json',
 '1806.03724v1.json',
 '2408.09545.json',
 '2304.14663.json',
 '2202.04311.json',
 '2403.07444.json',
 '2406.17475v1.json',
 '2409.13146.json',
 '287_arXiv2406.03714.json',
 '2410.04263.json',
 '2405.03152.json',
 '2204.09746.json',
 '2409.12626.json',
 '2308.11730.json',
 '2108.10568v1.json',
 '2402.09795.json',
 '2401.12438.json',
 '2101.00191.json',
 '2406.03423v1.json',
 '2404.19310.json',
 '1905.13648v2.json',
 '2009.00081.json',
 '2105.02584v1.json',
 '144_arXiv2406.19215.json',
 '2408.11271.json',
 '2403.03075.json',
 '2406.11580.json',
 '2409.07793.json',
 '2405.16472.json',
 '2312.15608.json',
 '2312.17029.json',
 '2111.01322v1.json',
 '2406.08396.json',
 '1908.10423v

In [156]:
def delete_empty_json(input_path):
    try:
        with open(input_path, 'r') as infile:
            data = json.load(infile)
        
        if isinstance(data, dict):
            if not data or set(data.keys()) == {"INFO"}:
                os.remove(input_path)
                
                print(f"Deleted empty file: {input_path}")
                return 
    except json.JSONDecodeError as e:
        print(f"Unexpected error in decoding JSON {input_path}, {e}")
    except FileNotFoundError:
        print(f"File not found: {input_path}")
    except Exception as e:
        print(f"Unexpected error in {input_path}, {e}")

In [157]:
# CHANGE THIS TO DESIRED FOLDER
for filename in all_files:
    delete_empty_json(os.path.join(JSON_DIR, filename))

### JSON Fixxer 
This function correct the bad formatted json, some json has this problem:
```json
{
    ... data here
}
{
    ... data here
}
{
    ...
}
```

when the correct formatting should be:

```json
{
    ... data here
    ,
    ... data here
    ,
    ... data here
}
```

So the replacement from `'}{'` to `','` is exactly what the function does

In [None]:
# Function to easy fix bad formatting of json file
def fix_json(input_path, output_path):
    try:
        with open(input_path, 'r') as file:
            content = file.read()
        fixed_content = content.replace('}{', ',')
        
        with open(output_path, 'w') as file:
            file.write(fixed_content)
        
        print(f"Fixed JSON saved to: {output_path}")
    except Exception as e:
        print(f"Unexpected error in {input_path}, {e}")


In [None]:
for filename in all_files:
    input_file = os.path.join(JSON_DIR, filename)
    output_file = os.path.join(JSON_DIR, filename)
    try:
        with open(input_file, 'rb') as i:
            data = json.load(i)
    except json.JSONDecodeError as e:
        print(f"error decoding this JSON file: {filename}, {e} ")
        fix_json(input_file, output_path=output_file)

### JSON Cleaner

In [71]:
pattern1 = r"^(S|A|Ch)[a-zA-Z0-9]*\d+\.T[a-zA-Z0-9]*\d+(\..*)?$|^global_footnotes$"
pattern2 = r"^id_table_\d+$"
pattern3 = r'id="(S|A|Ch)[a-zA-Z0-9]*\d+\.T[a-zA-Z0-9]*\d+(\..*)?"'
pattern4 = r"^PAPER'S NUMBER OF TABLES$"
number_of_key_removed = 0
occ = 0


# this function unrolls recursively lists, used for case when data[key]['table'] is a nested list
def process_element(element):
    if isinstance(element, str):
        if not (re.match(pattern1, element) or re.findall(pattern3, element)):
            return True
    elif isinstance(element, list):
        for sub_element in element:
            if process_element(sub_element):
                return True
    return False

def process_json(data, filename):
    global number_of_key_removed
    keys_to_remove = []

    for key in data.keys():
        if not re.match(pattern1, key) and not re.match(pattern2, key):
            keys_to_remove.append(key)
            
        if re.match(pattern4, key):
            keys_to_remove.append(key)

        if re.match(pattern2, key):
            if isinstance(data[key]['table'], str):
                if not re.match(pattern1, data[key]['table']) and not re.findall(pattern3, data[key]['table']):
                    keys_to_remove.append(key)

            elif isinstance(data[key]['table'], list):
                # same thing as before but you're searching in a list
                if process_element(data[key]['table']):
                    keys_to_remove.append(key)
                    
    if len(keys_to_remove) > 0:
        number_of_key_removed += len(keys_to_remove)
        for key in keys_to_remove:
            if key in data:
                
                # !!use this to debug!!
                # if key == "PAPER'S NUMBER OF TABLES" or key == "INFO":
                #     del data[key]
                #     continue
                
                # print(f"file: {filename}")
                # if isinstance(data[key], dict):
                #     print(f"Key to delete: {key}")
                #     print(f"Value: {data[key]['table']}")
                # else:
                #     print(f"Key to delete: {key}")
                #     print(f"Value: {data[key]}")

                # input("Press enter to delete")
                del data[key]
        return True
    else:
        return False
        

In [None]:
for filename in all_files:
    input_file = os.path.join(JSON_DIR, filename)
    output_file = os.path.join(DEST_DIR, filename)
    with open(input_file, 'rb') as i:
        data = json.load(i)

        if process_json(data, filename):
            occ += 1
            if len(data.keys()) > 0:
                with open(output_file, 'w') as o:
                    json.dump(data, o, indent=2)
            else:
                os.remove(filename) # ?? check this 
print(f"{occ} files were cleaned, which is {occ/len(all_files) * 100}% of total files")
print(f"{number_of_key_removed} keys were removed")

4859 files were cleaned, which is f55.28501536010923% of total files
15926 keys were removed


### Test

In [52]:
import re

string1 = ['S2.E1', 'S1.F1', 'A3.E5', 'A3.F4.E1', 'S12.T1', 'S1.T3.1.2', 'A1.T1', 'Sx4.T5.1.m.2']
string2 = ['id_table_1', 'id_figure_1', 'id_table_12']
string3 = ['id="S1.T1"', 'id="A1.T2"', 'id="S2.T1.1"', "<table id=\"S5.T2.1\"", "id=\"Sx1.T1.4.4\"", "id=\"S1.F1.5.1.1\"", "id=\"S3.Ex3.m1.1.2.3.2.1\""]
string4 = ["<table class=\"ltx_tabular ltx_centering ltx_figure_panel ltx_align_middle\" id=\"S1.F1.5\">\n<tr class=\"ltx_tr\" id=\"S1.F1.5.1\">\n<td class=\"ltx_td ltx_align_center\"",
           "<table id=\"S1.T1.1\" class=\"ltx_tabular ltx_guessed_headers ltx_align_middle",
           "S2.T1.1.1", "S4.E7", "A6.EGx1"
           ]

matches1 = [s for s in string1 if not re.match(pattern1, s)]
matches2 = [s for s in string2 if not re.match(pattern2, s)]
matches3 = [s for s in string3 if not re.findall(pattern3, s)]
matches4 = [s for s in string4 if not re.match(pattern1, s) and not re.findall(pattern3, s)]

print(matches1)
print(matches2)
print(matches3)
print(matches4)

['S2.E1', 'S1.F1', 'A3.E5', 'A3.F4.E1']
['id_figure_1']
['id="S1.F1.5.1.1"', 'id="S3.Ex3.m1.1.2.3.2.1"']
['<table class="ltx_tabular ltx_centering ltx_figure_panel ltx_align_middle" id="S1.F1.5">\n<tr class="ltx_tr" id="S1.F1.5.1">\n<td class="ltx_td ltx_align_center"', 'S4.E7', 'A6.EGx1']


In [53]:
test_cases = [
    ("Simple Valid String", "S5.T2.1", False),
    ("Simple Invalid String", "<table id=\"A4.E2.2.m.11\"", True),
    ("Valid Nested List", ["S5.T2.1", "S6.T3.1"], False),
    ("Nested List with Invalid Element", ["S5.T2.1", "invalid_element"], True),
    ("Deeply Nested List with Invalid Element", ["S5.T2.1", ["S6.T3.1", "invalid_element"]], True),
    ("Valid Element Matching pattern3", 'id="S5.T2.1"', False),
    ("Mixed List with Invalid Element", ['id="S5.T2.1"', "invalid_element"], True),
    ("Empty List", [], False),
    ("List with Only Valid Nested Lists", [["S5.T2.1", "S6.T3.1"], ["S7.T4.2", "S8.T5.3"]], False),
    ("List with Deeply Nested Invalid String", [["S5.T2.1", "S6.T3.1"], ["S7.T4.2", "invalid_element"]], True),
]

for name, element, expected in test_cases:
    result = process_element(element)
    assert result == expected, f"Test '{name}' failed: expected {expected}, got {result}"
    print(f"Test '{name}' passed.")

Test 'Simple Valid String' passed.
Test 'Simple Invalid String' passed.
Test 'Valid Nested List' passed.
Test 'Nested List with Invalid Element' passed.
Test 'Deeply Nested List with Invalid Element' passed.
Test 'Valid Element Matching pattern3' passed.
Test 'Mixed List with Invalid Element' passed.
Test 'Empty List' passed.
Test 'List with Only Valid Nested Lists' passed.
Test 'List with Deeply Nested Invalid String' passed.


Empty improved_json folder

In [10]:
!rm improved_json/*