In [2]:
import pandas as pd 
import json 
from pathlib import Path 
from collections import defaultdict

In [3]:
EXPERIMENT_DIR='parsing_experiments/28_09_2024v2/'

In [4]:
def read_json_file(file_path):
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
            return data
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except json.JSONDecodeError:
        print(f"Error decoding JSON from file: {file_path}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

In [6]:
df = pd.read_csv("/Users/Michal/Dokumenty_mac/MasterThesis/master_thesis/parsing_experiments/28_12_2024_gpt-4o/2305.14336v3/manual_extracted_tables/df_2.csv")

In [7]:
df

Unnamed: 0,Unnamed: 1,Model (GPU hours),.1,Token-Level F1,.2,.3,EM,.4
0,,,P,R,F1,P,R,F1
1,Teacher,code-davinci-002,74.1,71.8,72.3,59.4,56.9,57.6
2,Student,LLaMA-7B (50h),74.1,67.6,69.1,56.8,53.4,54.3
3,,Alpaca-7B (50h),72.7,64.8,67.5,56.1,50.0,52.0
4,,T5-11B (380h),75.8,71.4,73.2,60.3,56.7,58.1


In [15]:
paper_output_dict = {}
for paper_dir in Path(EXPERIMENT_DIR).iterdir():
    if paper_dir.is_dir():
        result_dict = {}
        dfs = []
        for result_file in paper_dir.iterdir():
            if result_file.suffix == '.csv':
                dfs.append(pd.read_csv(result_file))
            elif result_file.suffix == '.json':
                extracted_text = read_json_file(result_file)
                result_dict.update({"text": extracted_text})
        result_dict.update({"dfs":dfs})
        paper_output_dict.update({paper_dir.name:result_dict})
                           

In [16]:
len(paper_output_dict)

30

In [17]:
# Papers names
paper_output_dict.keys()

dict_keys(['2409.03710v1', '2409.03225v1', '2409.03735v1', '2402.07745v1', '2409.03171v1', '2409.01854v1', '2409.02864v1', '2409.03757v1', '2408.16284v1', '2409.03438v1', '2409.03659v1', '2305.14336v3', '2409.03291v1', '2409.03618v1', 'applsci-13-10918', '2406.04383v2', '2409.03516v1', '2409.02813v1', '2409.01195v1', '2409.03542v1', '2409.03669v1', '2409.01685v1', '2409.02130v1', '2409.03245v1', '2409.03755v1', '2409.03715v1', '2001.01469v1', '2408.11381v1', '2409.02141v1', '2409.03745v1'])

## Sections text extraction analysis

In [18]:
paper_empty_section_dict = defaultdict(list)
for paper in paper_output_dict.keys():
    sections_text = paper_output_dict[paper]['text']
    for section in sections_text.keys():
        if len(sections_text[section]) == 0:
            paper_empty_section_dict[paper].append(section)

In [21]:
len(paper_empty_section_dict) # Due to wrong parsing of tables of content in .md file

1

In [20]:
for paper in paper_empty_section_dict.keys():
    print(f"Paper with empty sections: {paper}")
    for section in paper_empty_section_dict[paper]:
        print(section)
    print("#"*100)

Paper with empty sections: 2409.02864v1
2.1.2 Rag Assessment (Ragas) Evaluation Metrics
####################################################################################################


### Verifying end and the beginning of extracted sections given the example of 2001.01469v1 paper

In [34]:
for section in paper_output_dict['2001.01469v1']['text'].keys():
    print(section)
    section_text = paper_output_dict['2001.01469v1']['text'][section]
    print(len(section_text))
    print(section_text[:100])
    print(section_text[-100:])
    print("#"*100)

Table Ii.
1801
## Table Ii.

Similarly, in Experiment 2, we used the modified Marmot data-set where, the words in e
 made with richer semantic knowledge, and additional branches for learning row based segmentation.


####################################################################################################
Iv. Table Row Extraction
1664
## Iv. Table Row Extraction

After processing the documents using TableNet, masks for table and colu
mpletely filled and there are no line demarcations, each line (level) can be seen as a unique row.


####################################################################################################
I. Introduction
4379
## I. Introduction

With the proliferation of mobile devices equipped with cameras, an increasing nu
riment details and results. Finally, the conclusions and future work are presented in Section VII.


####################################################################################################
Vii. Conclusion
5202
## V

In [35]:
dfs = paper_output_dict['2001.01469v1']['dfs']

In [40]:
dfs[0]

Unnamed: 0.1,Model,Recall,Precision,F1-Score,The content in,each,cell was,Unnamed: 0,normalized;,white,spaces,Unnamed: 1
0,,,,,"were removed,",,special characters,,,were replaced,,with
1,TableNet + Semantic Features,,,,,,,,,,,
2,,0.9001,0.9307,0.9151,,,,,,,,
3,(fine-tuned on ICDAR),,,,underscores and,,lowercase,letters,,with uppercase.,,This
4,,,,,1D-tuple can then be compared with the ground ...,,,,,,,
5,TableNet + Semantic Features,0.8994,0.9255,0.9122,,,,,,,,
6,,,,,using precision and recall.,,,,,,,
7,TableNet,0.8987,0.9215,0.9098,,,,,,,,
8,DeepDeSRT [8],0.8736,0.9593,0.9144,TableNet requires,both,table,and,structure,annotated,,data


## Checking lama approach for extracting tables 

In [23]:
LAMA_EXPERIMENT_DIR='parsing_experiments/29_09_2024_llama_parse'

In [27]:
dfs_dict = {}
for paper_output_dir in Path(LAMA_EXPERIMENT_DIR).iterdir():
    dfs_dict[paper_output_dir.name] = []
    if paper_output_dir.is_dir():
        for result_file in paper_output_dir.iterdir():
            if result_file.suffix == '.csv':
                dfs_dict[paper_output_dir.name].append(pd.read_csv(result_file))


In [28]:
dfs_dict.keys()

dict_keys(['.DS_Store', '2305.14336v3', 'applsci-13-10918', '2406.04383v2', '2001.01469v1'])

In [29]:
dfs = dfs_dict['2305.14336v3']

In [44]:
dfs[10]

Unnamed: 0,Unnamed: 1,ML (ours),Chemistry (ours),DISCOMAT (2022),SWDE (2011)
0,Textual format,LATEX,XML,CSV,HTML
1,# cell types,4,6,2,8
2,# attr. types,11,4,4,32
3,# papers (web.),25,16,2536,80
4,# tables (pages),122,26,5883,1600
5,# anno. records,3792,1498,58481,1600
6,# records / table,31.1,57.6,9.9,1


## Comparing results of various approaches for extraction of complex tables 

#### 1. Tabula

In [5]:
tabula_table_path = '/Users/Michal/Dokumenty_mac/MasterThesis/master_thesis/various_tables_extraction_approaches_summary_dir/1603.01354/tabula_extracted_files/df_5.csv'
tabula_df = pd.read_csv(tabula_table_path)

In [6]:
tabula_df

Unnamed: 0.1,Unnamed: 0,POS,Unnamed: 1,Unnamed: 2,NER,Unnamed: 3,Unnamed: 4
0,,Dev Test,,Dev,,Test,
1,Model,Acc. Acc.,Prec.,Recall,F1 Prec.,Recall,F1
2,BRNN,96.56 96.76,92.04,89.13,90.56 87.05,83.88,85.44
3,BLSTM,96.88 96.93,92.31,90.85,91.57 87.77,86.23,87.00
4,BLSTM-CNN,97.34 97.33,92.52,93.64,93.07 88.53,90.21,89.36
5,BRNN-CNN-CRF,97.46 97.55,94.85,94.63,94.74 91.35,91.06,91.21


#### 2. Marker + Regex 

In [7]:
marker_table_path = '/Users/Michal/Dokumenty_mac/MasterThesis/master_thesis/various_tables_extraction_approaches_summary_dir/1603.01354/manual_extracted_tables/df_2.csv'
marker_df = pd.read_csv(marker_table_path)

In [8]:
marker_df

Unnamed: 0,Unnamed: 1,POS,.1,.2,.3,.4,NER,.5,.6
0,,Dev,Test,,Dev,,,Test,
1,Model,Acc.,Acc.,Prec.,Recall,F1,Prec.,Recall,F1
2,BRNN,96.56,96.76,92.04,89.13,90.56,87.05,83.88,85.44
3,BLSTM,96.88,96.93,92.31,90.85,91.57,87.77,86.23,87.00
4,BLSTM-CNN,97.34,97.33,92.52,93.64,93.07,88.53,90.21,89.36
5,BRNN-CNN-CRF,97.46,97.55,94.85,94.63,94.74,91.35,91.06,91.21


#### 3. LLM section based approach (GPT-4o) 

In [9]:
section_based_llm_approach_table_path = '/Users/Michal/Dokumenty_mac/MasterThesis/master_thesis/various_tables_extraction_approaches_summary_dir/1603.01354/llm_extracted_tables/4.2 Main Resultslen_23100.csv'
section_based_df = pd.read_csv(section_based_llm_approach_table_path)

In [10]:
section_based_df

Unnamed: 0,Model,POS Dev Acc.,POS Test Acc.,POS Prec.,POS Recall,POS F1,NER Dev Prec.,NER Dev Recall,NER Dev F1
0,BRNN,96.56,96.76,92.04,89.13,90.56,87.05,83.88,85.44
1,BLSTM,96.88,96.93,92.31,90.85,91.57,87.77,86.23,87.0
2,BLSTM-CNN,97.34,97.33,92.52,93.64,93.07,88.53,90.21,89.36
3,BRNN-CNN-CRF,97.46,97.55,94.85,94.63,94.74,91.35,91.06,91.21


#### 4. Entire file LLM approach (GPT-4o)

In [21]:
from io import StringIO

# json_file_path = '/Users/Michal/Dokumenty_mac/MasterThesis/master_thesis/entire_pdf_approach_results-redefined-prompt-csv/1603.01354/1603.01354_4.json'
# # from src.utils import read_json
# with open(json_file_path, "r") as json_file:
#         data= json.load(json_file)
json_file_path = '/Users/Michal/Dokumenty_mac/MasterThesis/master_thesis/various_tables_extraction_approaches_summary_dir/1603.01354/llm_extracted_tables/4.2 Main Resultslen_2310.json'
with open(json_file_path, "r") as json_file:
        data= json.load(json_file)
entire_df = pd.read_csv(StringIO(data['Table 3']), sep=",")

In [22]:
entire_df

Unnamed: 0,Model,POS Dev Acc.,POS Test Acc.,POS Prec.,POS Recall,POS F1,NER Dev Prec.,NER Dev Recall,NER Dev F1
0,BRNN,96.56,96.76,92.04,89.13,90.56,87.05,83.88,85.44
1,BLSTM,96.88,96.93,92.31,90.85,91.57,87.77,86.23,87.0
2,BLSTM-CNN,97.34,97.33,92.52,93.64,93.07,88.53,90.21,89.36
3,BRNN-CNN-CRF,97.46,97.55,94.85,94.63,94.74,91.35,91.06,91.21


#### 5. Docling

In [3]:
docling_df = pd.read_csv('/Users/Michal/Dokumenty_mac/MasterThesis/docling_tryout/results/1603.01354/2.csv')

In [4]:
docling_df

Unnamed: 0.1,Unnamed: 0,Model,POS.Dev.Acc.,POS.Test.Acc.,NER.Dev.Prec.,NER.Dev.Recall,NER.Dev.F1,NER.Test.Prec.,NER.Test.Recall,NER.Test.F1
0,0,BRNN,96.56,96.76,92.04,89.13,90.56,87.05,83.88,85.44
1,1,BLSTM,96.88,96.93,92.31,90.85,91.57,87.77,86.23,87.0
2,2,BLSTM-CNN,97.34,97.33,92.52,93.64,93.07,88.53,90.21,89.36
3,3,BRNN-CNN-CRF,97.46,97.55,94.85,94.63,94.74,91.35,91.06,91.21


### 6. Unstructured approach

In [5]:
unstructured_json_file = read_json_file('/Users/Michal/Dokumenty_mac/MasterThesis/master_thesis/tables_with_captions_dir_unstructured/1603.01354_tables_with_captions.json')
html_tables = unstructured_json_file['table_html']
tables = []
for hml_table in html_tables:
    tables.append(pd.read_html(hml_table))

In [14]:
df = tables[2][0]

In [15]:
df

Unnamed: 0_level_0,Model,POS,POS,NER,NER,NER,NER,NER,NER,NER,Unnamed: 10_level_0,Unnamed: 11_level_0
Unnamed: 0_level_1,Model,Unnamed: 1_level_1,Dev,Test,Dev,Dev,Dev,| Test,| Test,| Test,| Test,| Test
Unnamed: 0_level_2,Model,Unnamed: 1_level_2,Acc.,Acc.,| Prec.,Recall,F1,:,Prec.,Recall,F1,Unnamed: 11_level_2
0,BRNN,96.56,96.76,| 92.04,89.13,90.56,',87.05,83.88,85.44,,
1,BLSTM,96.88,96.93,| 92.31,90.85,91.57,",",87.77,8623.0,87.0,,
2,BLSTM-CNN,97.34,9733.0,| 9252,93.64,93.07,|,88.53,9021.0,89.36,,
3,BRNN-CNN-CRF,| 97.46,97.55,| 94.85,94.63,94.74,,19135.0,91.06,91.21,,
