In [1]:
import os
import pandas as pd
from ipywidgets import Layout, Button, Box, VBox
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output
import pickle
from IPython.core.display import display, HTML
import qgrid
from sklearn.metrics import precision_recall_fscore_support
import re

In [2]:
# val_tes_data = pickle.load(open('./val_test_data.pkl', 'rb'))
# print(type(val_tes_data))
# print(val_tes_data[0])

In [3]:
'''
1. Current val_test_data.pkl contains list of dictionaries, where each dict contains the whole table along with its info.
2. Create a folder 'val_test_data' (or any other desired name), in which every table's data will be saved under the
folder name obtained from its pii + '__' + t_idx. Save the table info as 'init.pkl' inside the respective folder.
Eg - home/val_test_data/S0022309300000053__2/init.pkl
3. With the help of qgrid in this jupyter notebook, we are creating an interactive grid for manually annotating the tables
(dataframes) and saving the updated table as 'new.pkl' in the same folder. Eg - home/val_test_data/S0022309300000053__2/new.pkl
'''
# Folder in which pickles of individual tables were dumped as init.pkl (one sub-folder per table).
annotated_dir = './nominal_analyzed_annotated_piis/'

# Keep only sub-folders: a stray file in this directory (e.g. '.DS_Store') would otherwise be
# counted as a table and fail later when 'init.pkl' is opened inside it.
paths = sorted(
    os.path.join(annotated_dir, name)
    for name in os.listdir(annotated_dir)
    if os.path.isdir(os.path.join(annotated_dir, name))
)
tot = len(paths)  # number of table folders; the navigation callbacks wrap indices modulo this
tot

264

In [4]:
def get_json_dict(i):
    """Load the pickled table record stored in ``paths[i]``.

    Prefers the annotated 'new.pkl' and falls back to the original 'init.pkl'
    when no annotation has been saved in that folder yet.

    Args:
        i: Index into the module-level ``paths`` list.

    Returns:
        The unpickled object (callers in this notebook use it as a dict).

    Raises:
        IndexError: if ``i`` is outside ``paths``.
        FileNotFoundError: if neither pickle exists in the folder.
    """
    table_dir = paths[i]
    pkl_path = os.path.join(table_dir, 'new.pkl')
    if not os.path.exists(pkl_path):
        pkl_path = os.path.join(table_dir, 'init.pkl')
    # NOTE: unpickling can execute arbitrary code; only point this at trusted folders.
    # The context manager closes the handle, which a bare open() passed to pickle.load did not.
    with open(pkl_path, 'rb') as fh:
        return pickle.load(fh)

def show_comp_tables(i):
    """Show the paper link and table number for table ``i`` and build its labelled grid.

    The returned frame carries the column labels in row 0 and the row labels in
    column 0 (cell [0, 0] is an empty string); the remaining cells come from
    ``d['act_table']``. The loaded record itself is not modified.

    Args:
        i: Index of the table, forwarded to ``get_json_dict``.

    Returns:
        tuple: ``(pandas.DataFrame, str)`` - the labelled grid and the caption with
        newlines replaced by spaces ('' when the caption is empty or None).
    """
    d = get_json_dict(i)
    print(d.keys())  # lists the fields available in the loaded record
    # The anchor previously had a stray comma between its attributes (invalid HTML).
    display(HTML(f'<a href="https://www.sciencedirect.com/science/article/pii/{d["pii"]}" target="_blank">Paper Link</a>'))
    print('Table: ', d['t_idx']+1)
    n_rows, n_cols = d['num_rows'], d['num_cols']

    # Shallow copy of the outer list: rows are replaced by new lists, never mutated in place.
    rows = list(d['act_table'])
    # Distinct loop names so the parameter `i` is no longer shadowed.
    for r in range(n_rows):
        rows[r] = [d['row_label'][r]] + rows[r]
    header = [''] + [d['col_label'][c] for c in range(n_cols)]

    caption = d['caption'].replace('\n', ' ') if d['caption'] else ''
    return pd.DataFrame([header] + rows), caption

In [5]:
# Navigation state shared by the button callbacks.
i = 82                 # index into `paths` that Prev/Next step from
qgrid_widget_0 = None  # grid currently shown; assigned by the Prev/Next callbacks

# Output area the callbacks render into, and the "Prev" button.
buttons_info = widgets.Output()
prev_button = Button(description='Prev', button_style='danger')

def display_things(caption):
    """Print the labelling instructions and the caption, then show the current grid.

    Args:
        caption: Caption text printed after the instructions.
    """
    rule = '------------------------------------------------'
    instructions = (
        rule,
        'Instructions:',
        'row_col_label: 1->composition, 2->constituent, 3->ID, 0->others',
        rule,
    )
    for line in instructions:
        print(line)
    print(f'Caption: {caption}')
    # Reads the module-level grid; nothing is assigned here.
    display(qgrid_widget_0)

def prev_button_display(b):
    """Click handler for the Prev button: step one table back (wrapping) and redraw.

    Args:
        b: The clicked button (unused).
    """
    global i, qgrid_widget_0
    with buttons_info:
        clear_output()
        i = (i - 1) % tot  # wraps from 0 to tot - 1
        print('Index: ', i)
        frame, caption = show_comp_tables(i)
        qgrid_widget_0 = qgrid.show_grid(frame)
        display_things(caption)
        # Refresh both button captions with the indices they would jump to.
        upcoming, previous = (i + 1) % tot, (i - 1) % tot
        next_button.description = f'Next ({upcoming})'
        prev_button.description = f'Prev ({previous})'
        print("Table Type: 0-> NCT, 1-> SCC, 2->MCC-PI, 3->MCC-CI")

# Hook the Prev handler up, then build the "Next" button.
prev_button.on_click(prev_button_display)

next_button = Button(description='Next', button_style='danger')

def next_button_display(b):
    """Click handler for the Next button: step one table forward (wrapping) and redraw.

    Args:
        b: The clicked button (unused).
    """
    global i, qgrid_widget_0
    with buttons_info:
        clear_output()
        i = (i + 1) % tot  # wraps from tot - 1 to 0
        print('Index: ', i)
        frame, caption = show_comp_tables(i)
        qgrid_widget_0 = qgrid.show_grid(frame)
        display_things(caption)
        # Refresh both button captions with the indices they would jump to.
        upcoming, previous = (i + 1) % tot, (i - 1) % tot
        next_button.description = f'Next ({upcoming})'
        prev_button.description = f'Prev ({previous})'
        print("Table Type: 0-> NCT, 1-> SCC, 2->MCC-PI, 3->MCC-CI")
        
# Hook the Next handler up, then build the "Save" button.
next_button.on_click(next_button_display)

save_button = Button(description='Save', button_style='warning')

def save_button_display(b):
    """Click handler for the Save button: persist the labels edited in the grid as 'new.pkl'.

    Row labels are read from column 0 and column labels from row 0 of the displayed
    grid, converted to ints, stored together with the selected table type, and the
    record is pickled into the current table's folder. Nothing is written when no
    grid is shown yet, when the table body differs from the stored table, or when a
    label cannot be converted to an integer; a message is printed instead.

    Args:
        b: The clicked button (unused).
    """
    with buttons_info:
        if qgrid_widget_0 is None:
            # Save pressed before Prev/Next ever displayed a table.
            print('Nothing to save: press Prev/Next to load a table first.')
            return

        edited = qgrid_widget_0.get_changed_df()  # fetched once instead of three times
        d = get_json_dict(i)

        # Only the label row/column may be edited; the table body must match the stored
        # table. This replaces a bare `assert`, which is skipped under `python -O`.
        if not (d['act_table'] == edited.iloc[1:, 1:].values.tolist()):
            print('Not saved: table cells were modified; only row/column labels may be edited.')
            return

        try:
            row_label = [int(v) for v in edited.iloc[1:, 0].values.tolist()]
            col_label = [int(v) for v in edited.iloc[0, 1:].values.tolist()]
        except (TypeError, ValueError) as err:
            print(f'Not saved: every row/column label must be an integer ({err}).')
            return

        d['row_label'], d['col_label'] = row_label, col_label
        d['new_table_type'] = T.value
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(os.path.join(paths[i], 'new.pkl'), 'wb') as fh:
            pickle.dump(d, fh)
        print('saved!')

save_button.on_click(save_button_display)

# Button row; the same HBox instance is listed first and last in the VBox.
H = widgets.HBox(children=[prev_button, save_button, next_button])
# Table-type selector whose value the Save callback stores as 'new_table_type'.
T = widgets.Dropdown(description='Table Type:', options=[0, 1, 2, 3], value=2, disabled=False)
U = widgets.VBox(children=[H, buttons_info, T, H])

In [6]:
# Render the annotation UI: button row, output area, table-type dropdown, button row.
display(U)

VBox(children=(HBox(children=(Button(button_style='danger', description='Prev', style=ButtonStyle()), Button(b…

In [None]:
# Annotations complete? A table counts as done once its folder contains 'new.pkl'.
remaining_idxs = [
    idx
    for idx, p in enumerate(paths)
    if not os.path.exists(os.path.join(p, 'new.pkl'))
]
if remaining_idxs:
    print('Not so fast!')
    print(f'{len(remaining_idxs)}/{tot} annotations left. :(')
    print(f'Remaining idxs: {remaining_idxs}')
else:
    print('Good job!')

In [None]:
# Per-side label check on every saved annotation. A table is flagged when, in either
# row_label or col_label:
#   * some label is not one of 0, 1, 2, 3 (e.g. an accidental 4, 5, 6), or
#   * labels 1 (composition) and 2 (constituent) both occur on the same side.
valid_labels = (0, 1, 2, 3)
to_correct_idxs = []
for idx, p in enumerate(paths):
    new_pkl = os.path.join(p, 'new.pkl')
    if not os.path.exists(new_pkl):
        continue  # not annotated yet
    with open(new_pkl, 'rb') as fh:  # closes the handle that a bare open() left dangling
        d = pickle.load(fh)
    # The per-table debug prints (index and dict keys, twice per table) were removed:
    # they buried the summary printed at the end.
    for k in ('row_label', 'col_label'):
        labels = d[k]
        has_unknown_label = any(v not in valid_labels for v in labels)
        mixes_one_and_two = 1 in labels and 2 in labels
        if has_unknown_label or mixes_one_and_two:
            to_correct_idxs.append(idx)
            break

if len(to_correct_idxs) == 0:
    print('Good job!')
else:
    print('Not so fast!')
    print(f'{len(to_correct_idxs)}/{tot} corrections required. :(')
    print(f'Idxs: {to_correct_idxs}')

In [None]:
# Cross-axis constraint on every saved annotation: label 1 must not appear in both
# row_label and col_label, and neither must label 2.
to_correct_idxs = []
for idx, p in enumerate(paths):
    new_pkl = os.path.join(p, 'new.pkl')
    if not os.path.exists(new_pkl):
        continue  # not annotated yet
    with open(new_pkl, 'rb') as fh:  # closes the handle that a bare open() left dangling
        d = pickle.load(fh)
    rows, cols = d['row_label'], d['col_label']
    if any(label in rows and label in cols for label in (1, 2)):
        to_correct_idxs.append(idx)

if len(to_correct_idxs) == 0:
    print('Good job!')
else:
    print('Not so fast!')
    print(f'{len(to_correct_idxs)}/{tot} corrections required. :(')
    print(f'Idxs: {to_correct_idxs}')