In [4]:
from nltk import PorterStemmer
import json
import os
from unidecode import unidecode
import re


input_dir = '../gym/json/'

### JSON example
# {
#     "PAPER'S NUMBER OF TABLES": 5,   to be removed
#     "S1.T1": {                       the table_id
#         "caption": "Table 1. Existing",
#         "table": "<table id=\"S1.T1.1.1\" +</tr>\n</table>\n\n",
#         "footnotes": [
#             "1"
#         ],
#         "references": [
#             "A",
#         ],
#     },
# }

def read_json(file):
    with open(file, 'r') as f:
        data = json.load(f)
    return data


def clean_text(text):
    # Remove leading and trailing whitespaces
    #cast to string
    text = str(text)

    text = text.strip()

    # Remove HTML tags and attributes
    text = re.sub(r'<[^>]*>', '', text)

    # De-accent
    text = unidecode(text)
    
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Stemming
    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])

    return text

In [5]:
# Creating a table with table_id, caption, footnotes, references

import pandas as pd

df = pd.DataFrame(columns=['id, table_id', 'table_body', 'caption', 'references'])

for file in os.listdir(input_dir):
    data = read_json(input_dir + file)
    for table_id in data:
        if table_id == "PAPER'S NUMBER OF TABLES":
            continue
        table = data[table_id]
        table_body = clean_text(table['table'])
        caption = clean_text(table['caption'])
        references = clean_text(' '.join(table['references']))
        df.loc[len(df)] = [table_id, table_body, caption, references]

df.head()

Unnamed: 0,"id, table_id",table_body,caption,references
0,S4.T1,dataset testa testb recal precis recal precis ...,tabl 1 perform comparison among train dataset ...,we evalu the perform of the propos framework o...
1,S4.T2,random weight 3dwreal weight 3dwsyn weight rec...,tabl 2 train on 3dwreal 3dwsyn with three set ...,final we studi the effect of pretrain weight u...
2,S2.T1,notat descript nknknk total number of edg node...,tabl i the notat frequent use in thi paper,in thi paper we consid a typic mec network whe...
3,S3.T1,categori symbol type descript feder learn fram...,tabl i variabl in feder learn framework,in thi section the architectur and implement d...
4,S4.T2,model type train method recal mean recal highe...,tabl ii predict accuraci of differ train model,four motor data set refer to section iva belon...


In [6]:
df.to_csv('data.csv', index=True)

In [7]:
df[54:55]

Unnamed: 0,"id, table_id",table_body,caption,references
54,A4.T4,dataset cifar10 domainnet xgluenc qa model cli...,tabl 4 implement detail,
