In [1]:
import os 
os.chdir('..')

In [2]:
import pandas as pd
from pathlib import Path
from src.utils import read_json

tdmr_data = pd.read_csv("leaderboard-generation/tdm_annotations.tsv", sep="\t")

In [3]:
def change_to_float(value):
    try:
        return float(value)
    except:
        print(value)
        return value


def read_and_process_result_data(result_file_path: str, result_file_column: str = '') -> pd.DataFrame:
    result_json_file = read_json(Path(result_file_path))
    df = pd.DataFrame.from_dict(result_json_file)
    df['PaperName'] = df['PaperName'] + ".pdf"
    return df 

def handle_list_as_values_within_result_column(df: pd.DataFrame, result_column: str) -> pd.DataFrame:
    if result_column in df.columns:
        df_exploded = df.explode(result_column, ignore_index=True)
        df_exploded[result_column] = df_exploded[result_column].apply(lambda x: change_to_float(x))
        df_exploded['float'] = df_exploded[result_column].apply(lambda x: True if type(x) == float else False)
        df_exploded = df_exploded.drop("float", axis=1)
        return df_exploded
    else:
        print(f"There is no result column:{result_column} within these columns: {df.columns}")
        return df

In [4]:
df = read_and_process_result_data("tdmr_extraction/gpt-4o/with_captions_updated_tables_24_03_new_table_representation/processed_tdmr_extraction.json")

In [5]:
common_papers = list(set(df['PaperName'].unique()).intersection(tdmr_data['PaperName'].unique()))
len(common_papers)

3

In [6]:
common_papers

['1906.05012.pdf', '1909.02188.pdf', '1811.09242.pdf']

In [7]:
df[df['PaperName'].isin(common_papers)][['Task', 'Dataset', 'Metric', 'Result']].head(3)

Unnamed: 0,Task,Dataset,Metric,Result
0,Intent Detection and Slot Filling,ATIS,Accuracy,"[92.6, 91.1, 93.6, 94.1, 96.8, 96.4, 95.0, 96...."
1,Intent Detection and Slot Filling,ATIS,Accuracy,"[96.2, 96.1, 96.7, 95.2, 96, 96.6, 96.9]"
2,Intent Detection and Slot Filling,ATIS,Accuracy,"[96.9, 96.5, 97.5, 97.5]"


In [8]:
tdmr_data[tdmr_data['PaperName'].isin(common_papers[:3])][['Task', 'Dataset', 'Metric', 'Result']].head(3)

Unnamed: 0,Task,Dataset,Metric,Result
79,Summarization,Gigaword,ROGUE-1,39.11
80,Summarization,Gigaword,ROGUE-2,19.78
81,Summarization,Gigaword,ROGUE-L,36.87


### Comparing results on 3 random papers 

In [24]:
papers_to_compare = common_papers[:3]
papers_to_compare

['1811.09242.pdf', '1909.02188.pdf', '1906.05012.pdf']

In [25]:
compare_df = df[df['PaperName'].isin(papers_to_compare)]
compare_df.shape

(60, 5)

In [22]:
compare_tdmr_data = tdmr_data[tdmr_data['PaperName'].isin(papers_to_compare)]
compare_tdmr_data.shape

(15, 7)

#### Extracted datasets comparison

In [27]:
compare_df['Dataset'].unique()

array(['ATIS', 'Gigaword', "'Gigaword'", 'SemEval 2013 Task 13'],
      dtype=object)

In [28]:
compare_tdmr_data['Dataset'].unique()

array(['Gigaword', 'SemEval 2013 Task 13', 'SemEval 2010 Task 14', 'ATIS',
       'SNIPS'], dtype=object)

### First paper

In [29]:
compare_df[compare_df['PaperName'] == papers_to_compare[0]]

Unnamed: 0,Task,Dataset,Metric,Result,PaperName
48,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),"[6.5, 6.0, 6.96, 1.6, 3.5, 7.14, 7.62, 7.69, 7...",1811.09242.pdf
49,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),"[41.4, 13.3, 8.9, 9, 42.5, 44.1, 41.6, 41.3, 4...",1811.09242.pdf
50,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),"[6.5, 6.0, 6.96, 1.6, 3.5, 7.14, 7.62, 7.69, 7...",1811.09242.pdf
51,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),"[{'Model': 'LDA', 'S = 5': 41.4, 'S = 25': 13....",1811.09242.pdf
52,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),"[6.5, 6.0, 6.96, 1.6, 3.5, 7.14, 7.62, 7.69, 7...",1811.09242.pdf
53,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),"[41.4, 13.3, 8.9, 9, 42.5, 44.1, 41.6, 41.3, 4...",1811.09242.pdf
54,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),"[6.5, 6.0, 6.96, 1.6, 3.5, 7.14, 7.62, 7.69, 7...",1811.09242.pdf
55,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),"[{'Model': 'LDA', 'S = 5': 41.4, 'S = 25': 13....",1811.09242.pdf
56,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),"[6.5, 6.0, 6.96, 1.6, 3.5, 7.14, 7.62, 7.69, 7...",1811.09242.pdf
57,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),"[41.4, 13.3, 8.9, 9, 42.5, 44.1, 41.6, 41.3, 4...",1811.09242.pdf


In [37]:
compare_tdmr_data.columns

Index(['PaperURL', 'PaperName', 'Task', 'Dataset', 'Metric', 'Result',
       'Comment'],
      dtype='object')

In [36]:
compare_tdmr_data[compare_tdmr_data['PaperName'] == papers_to_compare[0]][['Task', 'Dataset', 'Metric','PaperName', 'Result']]

Unnamed: 0,Task,Dataset,Metric,PaperName,Result
79,Summarization,Gigaword,ROGUE-1,1906.05012.pdf,39.11
80,Summarization,Gigaword,ROGUE-2,1906.05012.pdf,19.78
81,Summarization,Gigaword,ROGUE-L,1906.05012.pdf,36.87


### Second paper 

In [39]:
compare_df[compare_df['PaperName'] == papers_to_compare[1]]

Unnamed: 0,Task,Dataset,Metric,PaperName,Result
228,Word Sense Induction,SemEval 2010,F1-score,1811.09242.pdf,62.9
229,Word Sense Induction,SemEval 2013,F1-score,1811.09242.pdf,24.37
230,Word sense induction,SemEval 2010,V-measure (V-M),1811.09242.pdf,4.4
231,Word sense induction,SemEval 2010,F-score (F-S),1811.09242.pdf,62.9
232,Word sense induction,SemEval 2013,Fuzzy B-cubed (F-BC),1811.09242.pdf,24.37
233,Word sense induction,SemEval 2013,Fuzzy normalized mutual information (F-NMI),1811.09242.pdf,24.37


In [41]:
papers_to_compare

['1906.05012.pdf', '1811.09242.pdf', '1909.02188.pdf']

In [40]:
compare_tdmr_data[compare_tdmr_data['PaperName'] == papers_to_compare[1]][['Task', 'Dataset', 'Metric','PaperName', 'Result']]


Unnamed: 0,Task,Dataset,Metric,PaperName,Result
246,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),1811.09242.pdf,9.55
247,Word Sense Induction,SemEval 2013 Task 13,Fuzzy B-Cubed (FBC),1811.09242.pdf,62.2
248,Word Sense Induction,SemEval 2013 Task 13,AVG,1811.09242.pdf,24.37
249,Word Sense Induction,SemEval 2010 Task 14,F-Score (F-S),1811.09242.pdf,62.9
250,Word Sense Induction,SemEval 2010 Task 14,V-Measure (V-M),1811.09242.pdf,10.1
251,Word Sense Induction,SemEval 2010 Task 14,AVG,1811.09242.pdf,25.2


### Third paper

In [90]:
papers_to_compare[2]

'1909.02188.pdf'

In [89]:
compare_df[compare_df['PaperName'] == papers_to_compare[2]]

Unnamed: 0,Task,Dataset,Metric,PaperName,Result
78,Intent Prediction,ATIS,Accuracy,1909.02188.pdf,86.5
79,Sentence-level Semantic Frame Parsing,ATIS,Overall Accuracy,1909.02188.pdf,86.5
80,Intent Detection,ATIS,Accuracy,1909.02188.pdf,86.5


In [91]:
compare_tdmr_data[compare_tdmr_data['PaperName'] == papers_to_compare[2]][['Task', 'Dataset', 'Metric','PaperName', 'Result']]

Unnamed: 0,Task,Dataset,Metric,PaperName,Result
266,Intent Detection and Slot Filling,ATIS,F1,1909.02188.pdf,95.9
267,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,96.9
268,Intent Detection and Slot Filling,ATIS,Overall-Accuracy,1909.02188.pdf,86.5
269,Intent Detection and Slot Filling,SNIPS,F1,1909.02188.pdf,94.2
270,Intent Detection and Slot Filling,SNIPS,Accuracy,1909.02188.pdf,98.0
271,Intent Detection and Slot Filling,SNIPS,Overall-Accuracy,1909.02188.pdf,86.9


## Updated approach on 07.02


In [15]:
# result_json_file = 
read_json_new = read_json('tdmr_extraction/gpt-4o/with_captions_updated_tables_24_03_new_table_representation/processed_tdmr_extraction.json')
papers_to_compare = common_papers[:3]
updated_df = pd.DataFrame.from_dict(read_json_new)
updated_df['PaperName'] = updated_df['PaperName'] + ".pdf"
updated_df_exploded = updated_df.explode('Result', ignore_index=True)


In [16]:
updated_df_exploded

Unnamed: 0,Task,Dataset,Metric,Result,PaperName
0,Intent Detection and Slot Filling,ATIS,Accuracy,92.6,1909.02188.pdf
1,Intent Detection and Slot Filling,ATIS,Accuracy,91.1,1909.02188.pdf
2,Intent Detection and Slot Filling,ATIS,Accuracy,93.6,1909.02188.pdf
3,Intent Detection and Slot Filling,ATIS,Accuracy,94.1,1909.02188.pdf
4,Intent Detection and Slot Filling,ATIS,Accuracy,96.8,1909.02188.pdf
...,...,...,...,...,...
467,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),41.9,1811.09242.pdf
468,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),44.4,1811.09242.pdf
469,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),45.5,1811.09242.pdf
470,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),46.6,1811.09242.pdf


### First paper

In [17]:
updated_df_exploded[updated_df_exploded['PaperName'] == papers_to_compare[0]]

Unnamed: 0,Task,Dataset,Metric,Result,PaperName
0,Intent Detection and Slot Filling,ATIS,Accuracy,92.6,1909.02188.pdf
1,Intent Detection and Slot Filling,ATIS,Accuracy,91.1,1909.02188.pdf
2,Intent Detection and Slot Filling,ATIS,Accuracy,93.6,1909.02188.pdf
3,Intent Detection and Slot Filling,ATIS,Accuracy,94.1,1909.02188.pdf
4,Intent Detection and Slot Filling,ATIS,Accuracy,96.8,1909.02188.pdf
...,...,...,...,...,...
215,Intent Detection and Slot Filling,ATIS,Accuracy,96.9,1909.02188.pdf
216,Intent Detection and Slot Filling,ATIS,Accuracy,96.9,1909.02188.pdf
217,Intent Detection and Slot Filling,ATIS,Accuracy,96.5,1909.02188.pdf
218,Intent Detection and Slot Filling,ATIS,Accuracy,97.5,1909.02188.pdf


In [19]:
updated_df_exploded['Result'] = updated_df_exploded['Result'].apply(lambda x: change_to_float(x))
updated_df_exploded['float'] = updated_df_exploded['Result'].apply(lambda x: True if type(x) == float else False)
updated_df_exploded_float = updated_df_exploded[updated_df_exploded['float']]

{'Model': 'LDA', 'S = 5': 41.4, 'S = 25': 13.3, 'S = 50': 8.9, 'S = 100': 9}
{'Model': 'HC', 'S = 5': 42.5, 'S = 25': 44.1, 'S = 50': 41.6, 'S = 100': 41.3}
{'Model': 'STM', 'S = 5': 44.9, 'S = 25': 44.4, 'S = 50': 44.9, 'S = 100': 41.9}
{'Model': 'AutoSense', 'S = 5': 44.4, 'S = 25': 45.5, 'S = 50': 46.6, 'S = 100': 46.5}
{'Model': 'LDA', 'S = 5': 41.4, 'S = 25': 13.3, 'S = 50': 8.9, 'S = 100': 9}
{'Model': 'HC', 'S = 5': 42.5, 'S = 25': 44.1, 'S = 50': 41.6, 'S = 100': 41.3}
{'Model': 'STM', 'S = 5': 44.9, 'S = 25': 44.4, 'S = 50': 44.9, 'S = 100': 41.9}
{'Model': 'AutoSense', 'S = 5': 44.4, 'S = 25': 45.5, 'S = 50': 46.6, 'S = 100': 46.5}


In [42]:
updated_df_exploded_float[updated_df_exploded_float['PaperName'] == papers_to_compare[0]][['Task', 'Dataset', 'Metric','PaperName', 'Result']]

Unnamed: 0,Task,Dataset,Metric,PaperName,Result
0,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,92.6
1,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,91.1
2,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,93.6
3,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,94.1
4,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,96.8
...,...,...,...,...,...
215,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,96.9
216,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,96.9
217,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,96.5
218,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,97.5


In [48]:
updated_df_exploded_float[updated_df_exploded_float['PaperName'] == papers_to_compare[1]][['Task', 'Dataset', 'Metric','PaperName', 'Result']]


Unnamed: 0,Task,Dataset,Metric,PaperName,Result
310,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),1811.09242.pdf,6.5
311,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),1811.09242.pdf,6.0
312,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),1811.09242.pdf,6.96
313,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),1811.09242.pdf,1.6
314,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),1811.09242.pdf,3.5
...,...,...,...,...,...
467,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),1811.09242.pdf,41.9
468,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),1811.09242.pdf,44.4
469,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),1811.09242.pdf,45.5
470,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),1811.09242.pdf,46.6


In [41]:
updated_df_exploded_float[updated_df_exploded_float['PaperName'] == papers_to_compare[0]][['Task', 'Dataset', 'Metric','PaperName', 'Result']].groupby(['Task', 'Dataset', 'Metric']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PaperName,Result
Task,Dataset,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1
Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,97.5


In [36]:
compare_tdmr_data[compare_tdmr_data['PaperName'] == papers_to_compare[0]][['Task', 'Dataset', 'Metric','PaperName', 'Result']]

Unnamed: 0,Task,Dataset,Metric,PaperName,Result
266,Intent Detection and Slot Filling,ATIS,F1,1909.02188.pdf,95.9
267,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,96.9
268,Intent Detection and Slot Filling,ATIS,Overall-Accuracy,1909.02188.pdf,86.5
269,Intent Detection and Slot Filling,SNIPS,F1,1909.02188.pdf,94.2
270,Intent Detection and Slot Filling,SNIPS,Accuracy,1909.02188.pdf,98.0
271,Intent Detection and Slot Filling,SNIPS,Overall-Accuracy,1909.02188.pdf,86.9


### Difference caused: 

In the other table within this paper there's a higher value extracted than the one reported by authors

### Second paper

In [75]:
papers_to_compare[1]

'1811.09242.pdf'

In [80]:
updated_df_exploded['float'] = updated_df_exploded['Result'].apply(lambda x: True if type(x) == float else False)

In [58]:
updated_df_exploded[(updated_df_exploded['PaperName'] == papers_to_compare[1])].groupby(['Task', 'Dataset', 'Metric'])['Result'].tail()# .groupby(['Task', 'Dataset', 'Metric'])

ValueError: Cannot subset columns with a tuple with more than one element. Use a list instead.

In [83]:
compare_tdmr_data[(compare_tdmr_data['PaperName'] == papers_to_compare[1]) & (updated_df_exploded['float'] == True)].groupby(['Task', 'Dataset', 'Metric']).max()

  compare_tdmr_data[(compare_tdmr_data['PaperName'] == papers_to_compare[1]) & (updated_df_exploded['float'] == True)].groupby(['Task', 'Dataset', 'Metric']).max()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PaperURL,PaperName,Result,Comment
Task,Dataset,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Word Sense Induction,SemEval 2010 Task 14,AVG,https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,25.2,
Word Sense Induction,SemEval 2010 Task 14,F-Score (F-S),https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,62.9,
Word Sense Induction,SemEval 2010 Task 14,V-Measure (V-M),https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,10.1,
Word Sense Induction,SemEval 2013 Task 13,AVG,https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,24.37,
Word Sense Induction,SemEval 2013 Task 13,Fuzzy B-Cubed (FBC),https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,62.2,
Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,9.55,


In [85]:
compare_tdmr_data[compare_tdmr_data['PaperName'] == papers_to_compare[1]][['Task', 'Dataset', 'Metric','PaperName', 'Result']]

Unnamed: 0,Task,Dataset,Metric,PaperName,Result
246,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),1811.09242.pdf,9.55
247,Word Sense Induction,SemEval 2013 Task 13,Fuzzy B-Cubed (FBC),1811.09242.pdf,62.2
248,Word Sense Induction,SemEval 2013 Task 13,AVG,1811.09242.pdf,24.37
249,Word Sense Induction,SemEval 2010 Task 14,F-Score (F-S),1811.09242.pdf,62.9
250,Word Sense Induction,SemEval 2010 Task 14,V-Measure (V-M),1811.09242.pdf,10.1
251,Word Sense Induction,SemEval 2010 Task 14,AVG,1811.09242.pdf,25.2


### Third paper

In [92]:
third_paper_json_file_path = '/Users/Michal/Dokumenty_mac/MasterThesis/master_thesis/tdmr_extraction/gpt-4o/with_captions_updated_tables_07_02_new_table_representation/1909.02188/1909.02188_tdmr_extraction.json'
third_paper_json_file = read_json(Path(third_paper_json_file_path))
thrid_papeR_df = pd.DataFrame.from_dict(third_paper_json_file)

In [98]:
thrid_paper_df = thrid_papeR_df.explode('Result', ignore_index=True)

In [87]:
updated_df_exploded[updated_df_exploded['PaperName'] == papers_to_compare[2]].groupby(['Task', 'Dataset', 'Metric']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Result,PaperName,float
Task,Dataset,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Intent Detection,ATIS,Accuracy,88.6,1909.02188.pdf,True
Intent Prediction,ATIS,Accuracy,88.6,1909.02188.pdf,True
Sentence-level Semantic Frame Parsing,ATIS,Overall Accuracy,88.6,1909.02188.pdf,True
Slot Filling,ATIS,F1,,1909.02188.pdf,True


In [104]:
thrid_paper_df['float'] = thrid_paper_df['Result'].apply(lambda x: True if type(x) == float else False)
thrid_paper_df[thrid_paper_df['float'] == True].groupby(['Task', 'Dataset', 'Metric']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Result,float
Task,Dataset,Metric,Unnamed: 3_level_1,Unnamed: 4_level_1
Intent Detection,ATIS,Accuracy,97.5,True
Intent Detection,SNIPS,Accuracy,99.0,True
Intent Prediction,ATIS,Accuracy,97.5,True
Intent Prediction,SNIPS,Accuracy,99.0,True
Sentence-level Semantic Frame Parsing,ATIS,Overall Accuracy,88.6,True
Sentence-level Semantic Frame Parsing,SNIPS,Overall Accuracy,92.9,True
Slot Filling,ATIS,F1,96.1,True
Slot Filling,ATIS,F1 Score,96.1,True
Slot Filling,SNIPS,F1,97.0,True
Slot Filling,SNIPS,F1 Score,97.0,True


In [88]:
compare_tdmr_data[compare_tdmr_data['PaperName'] == papers_to_compare[2]][['Task', 'Dataset', 'Metric','PaperName', 'Result']]


Unnamed: 0,Task,Dataset,Metric,PaperName,Result
266,Intent Detection and Slot Filling,ATIS,F1,1909.02188.pdf,95.9
267,Intent Detection and Slot Filling,ATIS,Accuracy,1909.02188.pdf,96.9
268,Intent Detection and Slot Filling,ATIS,Overall-Accuracy,1909.02188.pdf,86.5
269,Intent Detection and Slot Filling,SNIPS,F1,1909.02188.pdf,94.2
270,Intent Detection and Slot Filling,SNIPS,Accuracy,1909.02188.pdf,98.0
271,Intent Detection and Slot Filling,SNIPS,Overall-Accuracy,1909.02188.pdf,86.9


In [105]:
compare_tdmr_data

Unnamed: 0,PaperURL,PaperName,Task,Dataset,Metric,Result,Comment
79,https://arxiv.org/pdf/1906.05012.pdf,1906.05012.pdf,Summarization,Gigaword,ROGUE-1,39.11,
80,https://arxiv.org/pdf/1906.05012.pdf,1906.05012.pdf,Summarization,Gigaword,ROGUE-2,19.78,
81,https://arxiv.org/pdf/1906.05012.pdf,1906.05012.pdf,Summarization,Gigaword,ROGUE-L,36.87,
246,https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,Word Sense Induction,SemEval 2013 Task 13,Fuzzy normalized mutual information (FNMI),9.55,
247,https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,Word Sense Induction,SemEval 2013 Task 13,Fuzzy B-Cubed (FBC),62.2,
248,https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,Word Sense Induction,SemEval 2013 Task 13,AVG,24.37,
249,https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,Word Sense Induction,SemEval 2010 Task 14,F-Score (F-S),62.9,
250,https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,Word Sense Induction,SemEval 2010 Task 14,V-Measure (V-M),10.1,
251,https://arxiv.org/pdf/1811.09242.pdf,1811.09242.pdf,Word Sense Induction,SemEval 2010 Task 14,AVG,25.2,
266,https://arxiv.org/pdf/1909.02188.pdf,1909.02188.pdf,Intent Detection and Slot Filling,ATIS,F1,95.9,


### Experiments based on fixed normalization process

In [7]:
fixed_df_json_file_path = 'tdmr_extraction/gpt-4o/with_captions_updated_tables_05_04_new_table_representation/processed_tdmr_extraction_test_papers.json'
fixed_data_dict = read_json(fixed_df_json_file_path)

In [8]:
df = pd.DataFrame.from_dict(fixed_data_dict)
df['PaperName'] = df['PaperName'] + ".pdf"
df = df.explode('Result', ignore_index=True)

In [9]:
papers_to_compare = df['PaperName'].unique()

In [10]:
df[df['PaperName'] == papers_to_compare[0]].groupby(["Task", "Dataset", "Metric"])['Result']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x116bd7cd0>

In [14]:
grouped_df = df[df['PaperName'] == papers_to_compare[0]].groupby(['PaperName', 'Task', 'Metric', 'Dataset'])[
    'Result'].max()
grouped_df

PaperName       Task                                   Metric            Dataset
1909.02188.pdf  Intent Detection and Slot Filling      'Accuracy'        SNIPS      99.0
                                                       Accuracy          ATIS       97.5
                                                                         SNIPS      99.0
                                                       F1                ATIS       96.1
                                                                         SNIPS      97.0
                                                       State-of-the-art  SNIPS      98.0
                Sentence-level Semantic Frame Parsing  Overall-Accuracy  ATIS       88.6
                                                                         SNIPS      92.9
Name: Result, dtype: object

In [12]:
grouped_df = df[df['PaperName'] == papers_to_compare[0]].groupby(['PaperName', 'Task', 'Metric', 'Dataset'])['Result'].apply(lambda x: ', '.join(map(str, x))).reset_index()
grouped_df

Unnamed: 0,PaperName,Task,Metric,Dataset,Result
0,1909.02188.pdf,Intent Detection and Slot Filling,'Accuracy',SNIPS,"96.9, 96.7, 97.0, 96.8, 97.5, 97.2, 97.3, 97.0..."
1,1909.02188.pdf,Intent Detection and Slot Filling,Accuracy,ATIS,"92.6, 91.1, 93.6, 94.1, 96.8, 96.4, 95.0, 96.6..."
2,1909.02188.pdf,Intent Detection and Slot Filling,Accuracy,SNIPS,"96.9, 96.7, 97.0, 96.8, 97.5, 97.2, 97.3, 97.0..."
3,1909.02188.pdf,Intent Detection and Slot Filling,F1,ATIS,"94.3, 94.2, 94.8, 95.2, 95.1, 95.5, 95.2, 95.6..."
4,1909.02188.pdf,Intent Detection and Slot Filling,F1,SNIPS,"87.3, 87.8, 88.8, 88.3, 90.0, 93.5, 91.8, 90.5..."
5,1909.02188.pdf,Intent Detection and Slot Filling,State-of-the-art,SNIPS,"94.2, 98.0, 86.9, 98, 92.8, 92.9"
6,1909.02188.pdf,Sentence-level Semantic Frame Parsing,Overall-Accuracy,ATIS,"80.7, 78.9, 82.2, 82.6, 82.2, 85.7, 83.4, 86.0..."
7,1909.02188.pdf,Sentence-level Semantic Frame Parsing,Overall-Accuracy,SNIPS,"73.2, 74.1, 75.5, 74.6, 81.0, 83.8, 80.9, 78.4..."


In [13]:
tdmr_data[tdmr_data['PaperName'] == papers_to_compare[0]].groupby(["Task", "Dataset", "Metric"])['Result'].max()

Task                               Dataset  Metric          
Intent Detection and Slot Filling  ATIS     Accuracy            96.9
                                            F1                  95.9
                                            Overall-Accuracy    86.5
                                   SNIPS    Accuracy            98.0
                                            F1                  94.2
                                            Overall-Accuracy    86.9
Name: Result, dtype: float64

### Results analysis when extending experiment with model/approach/alogrithm information

In [33]:
combined_results_file_path = "extending_results_extraction_with_author_approach/gpt-4o/tdmr_extraction_with_author_data_combined.json"

In [34]:
author_model_results_df = read_and_process_result_data(combined_results_file_path)

In [25]:
author_model_results_df.head(3)

Unnamed: 0,Task,Dataset,Metric,Result,Model,PaperName
0,Intent Detection and Slot Filling,ATIS,F1,[86.5],Stack-Propagation,1909.02188.pdf
1,Intent Detection and Slot Filling,ATIS,Accuracy,[86.5],Stack-Propagation,1909.02188.pdf
2,Intent Detection and Slot Filling,ATIS,Accuracy,86.5,Stack-Propagation,1909.02188.pdf


In [35]:
author_model_results_df_with_no_list = handle_list_as_values_within_result_column(author_model_results_df, result_column="Result")

In [36]:
author_model_results_df_with_no_list = author_model_results_df_with_no_list

In [37]:
author_model_results_df_with_no_list.head(3)

Unnamed: 0,Task,Dataset,Metric,Result,Model,PaperName
0,Intent Detection and Slot Filling,ATIS,F1,86.5,Stack-Propagation,1909.02188.pdf
1,Intent Detection and Slot Filling,ATIS,Accuracy,86.5,Stack-Propagation,1909.02188.pdf
2,Intent Detection and Slot Filling,ATIS,Accuracy,86.5,Stack-Propagation,1909.02188.pdf


In [38]:
grouped_df = author_model_results_df_with_no_list[author_model_results_df_with_no_list['PaperName'] == common_papers[0]].groupby(['PaperName', 'Task', 'Metric', 'Dataset', 'Model'])[
    'Result'].max()
grouped_df

PaperName       Task           Metric   Dataset     Model                                                        
1906.05012.pdf  Summarization  ROGUE-1  'Gigaword'  Bi-Directional Selective Encoding with Template model (BiSET)    39.11
                                                    Bi-directional Selective Encoding with Template (BiSET)          39.11
                                                    Bi-selective layer                                               39.11
                                                    BiSET                                                            39.11
                                                    Fast Rerank                                                      39.11
                                                    N-Optimal                                                        40.49
                                                    Retrieve-top                                                     23.46
                         

In [39]:
tdmr_data[tdmr_data['PaperName'] == common_papers[0]].groupby(["Task", "Dataset", "Metric"])['Result'].max()


Task           Dataset   Metric 
Summarization  Gigaword  ROGUE-1    39.11
                         ROGUE-2    19.78
                         ROGUE-L    36.87
Name: Result, dtype: float64

##### Second paper

In [40]:
grouped_df = \
author_model_results_df_with_no_list[author_model_results_df_with_no_list['PaperName'] == common_papers[1]].groupby(
    ['PaperName', 'Task', 'Metric', 'Dataset', 'Model'])[
    'Result'].max()
grouped_df

PaperName       Task                                   Metric            Dataset  Model                      
1909.02188.pdf  Intent Detection and Slot Filling      Accuracy          ATIS     Our model                      88.6
                                                                                  Our model + BERT               88.6
                                                                                  Stack-Propagation              88.6
                                                                                  Stack-Propagation framework    88.6
                                                                                  lstm+token-level               88.6
                                                       F1                ATIS     Our model                      88.6
                                                                                  Our model + BERT               88.6
                                                                

In [41]:
tdmr_data[tdmr_data['PaperName'] == common_papers[1]].groupby(["Task", "Dataset", "Metric"])['Result'].max()


Task                               Dataset  Metric          
Intent Detection and Slot Filling  ATIS     Accuracy            96.9
                                            F1                  95.9
                                            Overall-Accuracy    86.5
                                   SNIPS    Accuracy            98.0
                                            F1                  94.2
                                            Overall-Accuracy    86.9
Name: Result, dtype: float64

In [46]:
grouped_df = author_model_results_df_with_no_list[author_model_results_df_with_no_list['PaperName'] == common_papers[2]].groupby(['PaperName', 'Task', 'Metric', 'Dataset', 'Model'])['Result'].apply(lambda x: ', '.join(map(str, x))).reset_index()
grouped_df

Unnamed: 0,PaperName,Task,Metric,Dataset,Model,Result
0,1811.09242.pdf,Word Sense Induction,F-Score (F-S),SemEval 2010 Task 14,AutoSense,"61.7, 62.9, 61.2, 56.2, 56.4, 57.9, 58.8, 44.4..."
1,1811.09242.pdf,Word Sense Induction,F-Score (F-S),SemEval 2010 Task 14,AutoSenses=100,"61.2, 58.8, 46.5"
2,1811.09242.pdf,Word Sense Induction,F-Score (F-S),SemEval 2010 Task 14,AutoSenses=5,"62.9, 56.2, 44.4"
3,1811.09242.pdf,Word Sense Induction,F-Score (F-S),SemEval 2010 Task 14,AutoSenses=7,"61.7, 62.9, 61.2, 56.2, 56.4, 57.9, 58.8, 44.4..."
4,1811.09242.pdf,Word Sense Induction,F-Score (F-S),SemEval 2010 Task 14,AutoSense−sw,"61.1, 44.4, 45.5, 46.6, 46.5"
5,1811.09242.pdf,Word Sense Induction,F-Score (F-S),SemEval 2010 Task 14,AutoSense−wp,"59.3, 44.4, 45.5, 46.6, 46.5"
6,1811.09242.pdf,Word Sense Induction,F1,SemEval 2010 Task 14,AutoSense,"61.7, 62.9, 61.2, 56.2, 56.4, 57.9, 58.8, 44.4..."
7,1811.09242.pdf,Word Sense Induction,F1,SemEval 2010 Task 14,AutoSenses=100,"61.2, 58.8, 46.5"
8,1811.09242.pdf,Word Sense Induction,F1,SemEval 2010 Task 14,AutoSenses=5,"62.9, 56.2, 44.4"
9,1811.09242.pdf,Word Sense Induction,F1,SemEval 2010 Task 14,AutoSenses=7,"62.9, 56.2, 56.4, 57.9, 58.8, 46.6"


In [43]:
tdmr_data[tdmr_data['PaperName'] == common_papers[2]].groupby(["Task", "Dataset", "Metric"])['Result'].max()


Task                  Dataset               Metric                                    
Word Sense Induction  SemEval 2010 Task 14  AVG                                           25.20
                                            F-Score (F-S)                                 62.90
                                            V-Measure (V-M)                               10.10
                      SemEval 2013 Task 13  AVG                                           24.37
                                            Fuzzy B-Cubed (FBC)                           62.20
                                            Fuzzy normalized mutual information (FNMI)     9.55
Name: Result, dtype: float64