In [1]:
import pandas as pd

df = pd.read_csv('../../dataset/tables.csv')
df

Unnamed: 0,id,caption,table,footnotes,references,id_file
0,S3.T1,Table 1: Table of Collected Labels & Definitio...,"<table class=""ltx_tabular ltx_centering ltx_al...",[],[],./extraction/2407.16895v1.json
1,S4.T1,Table 1. Lift in podcast listening time and ov...,"<table class=""ltx_tabular ltx_centering ltx_gu...",[],['We A/B-tested multinomial blending on an Ama...,./extraction/2408.09168v1.json
2,S3.T1,Table 1. Top 10 most streamed music tracks by ...,"<table class=""ltx_tabular ltx_align_middle"" id...",[],"['To go further, we present in Table\xa01 the ...",./extraction/2408.16430v1.json
3,S3.T2,"Table 2. Percentages of (i) labeled streams, (...","<table class=""ltx_tabular ltx_align_middle"" id...",[],['Table\xa02 presents the proportions of label...,./extraction/2408.16430v1.json
4,S3.T1,Table 1: Results of two two-way ANOVAs for the...,"<table class=""ltx_tabular ltx_align_middle"" id...",[],['While our analysis found no significant diff...,./extraction/2409.15998v1.json
...,...,...,...,...,...,...
3354,S4.T8,"['<figcaption class=""ltx_caption ltx_centering...","['<table class=""ltx_tabular ltx_guessed_header...",,[],./extraction/2410.07654v1.json
3355,S4.T1,"['<figcaption class=""ltx_caption ltx_centering...","['<table class=""ltx_tabular ltx_align_middle"" ...",,[],./extraction/2410.07671v1.json
3356,S5.T2,"['<figcaption class=""ltx_caption ltx_centering...","['<table class=""ltx_tabular ltx_align_middle"" ...",,[],./extraction/2410.07671v1.json
3357,S5.T3,"['<figcaption class=""ltx_caption ltx_centering...","['<table class=""ltx_tabular ltx_align_middle"" ...",,[],./extraction/2410.07671v1.json


# Preprocessing
splitting the data into 2 dataframes:
- tables
- tables_columns

In [2]:
import re
from lxml import html
from nltk.stem import PorterStemmer
from unidecode import unidecode

def get_table_columns(html_input):
    # Clean the HTML input
    html_input = html_input.replace('\n', '')
    tree = html.fromstring(html_input)

    # Extract the first row (header)
    header_row = tree.xpath('//tr[1]')

    if header_row:
        columns = []
        # Find all cells (<th> or <td>) in the first row
        cells = header_row[0].xpath('.//th | .//td')

        for cell in cells:
            # Extract regular text or the alternative text from <math> elements
            cell_text = ''.join(cell.xpath('.//text()')).strip()
            alttext = cell.xpath('.//math/@alttext')

            if alttext:
                cell_text = alttext[0]

            # Add the cell text to columns if it's not empty
            if cell_text:
                columns.append(clean_text(cell_text))
        return columns
    else:
        return []

def get_table_rows(html_input):
    # Analizza l'input HTML
    tree = html.fromstring(html_input)
    
    # Utilizza XPath per estrarre le righe della tabella
    rows = []
    for row in tree.xpath('//tr')[1:]:
        # Estrae il testo da tutte le celle (sia th che td) in una riga
        cells = row.xpath('.//th//text() | .//td//text()')

        # Aggiunge la riga pulita alla lista delle righe
        if cells:
            rows.append(clean_text(cells))
    
    return rows

def clean_text(text):
    # Remove leading and trailing whitespaces
    #cast to string
    text = str(text)

    text = text.strip()
    
    # Remove HTML tags and attributes
    text = re.sub(r'<[^>]*>', '', text)
    
    # De-accent
    text = unidecode(text)
    
    # sub dash underscore  with pipe
    text = re.sub(r'[-_]', '|', text)
    
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Stemming
    stemmer = PorterStemmer()
    text = '|'.join([stemmer.stem(word) for word in text.split()])
    
    return text

def get_id_file(path):
    file = path.split('/')[-1]
    return '.'.join(file.split('.')[:2])



In [3]:
# df_tables is the dataframe that contains the tables df

df_tables = df.copy()

#clean the id_file column
df_tables['id_file'] = df_tables['id_file'].apply(get_id_file)

df_tables['columns'] = df_tables['table'].apply(lambda x: get_table_columns(x))
df_tables['n_columns'] = df_tables['columns'].apply(len)

df_tables['rows'] = df_tables['table'].apply(lambda x: get_table_rows(x))
df_tables['n_rows'] = df_tables['rows'].apply(len)

df_tables['n_cells'] = df_tables['n_columns'] * df_tables['n_rows']
df_tables = df_tables[df_tables['n_cells'] > 0]

df_tables['caption'] = df_tables['caption'].apply(clean_text)
df_tables['caption_len'] = df_tables['caption'].apply(lambda x: len(x) if type(x) == str else 0)

df_tables['n_references'] = df_tables['references'].apply(lambda x: len([item for item in x[1:-1].split(', ') if item]) if isinstance(x, str) else 0)
df_tables['references'] = df_tables['references'].apply(lambda x: [clean_text(y) for y in x[1:-1].split(', ')] if isinstance(x, str) else [])

df_tables['n_footnotes'] = df_tables['footnotes'].apply(lambda x: len([item for item in x[1:-1].split(', ') if item]) if isinstance(x, str) else 0)
df_tables['footnotes'] = df_tables['footnotes'].apply(lambda x: [clean_text(y) for y in x[1:-1].split(', ')] if isinstance(x, str) else [])

df_tables = df_tables.drop(columns=['table'])

df_tables

Unnamed: 0,id,caption,footnotes,references,id_file,columns,n_columns,rows,n_rows,n_cells,caption_len,n_references,n_footnotes
0,S3.T1,tabl|1|tabl|of|collect|label|definit|we|pregen...,[],[],2407.16895v1,"[collect|label, definit]",2,"[n|n|paper|titl|n|n|n|n|as|provid|n|n, n|n|aut...",27,54,153,0,0
1,S4.T1,tabl|1|lift|in|podcast|listen|time|and|overal|...,[],[we|abtest|multinomi|blend|on|an|amazon|music|...,2408.09168v1,"[algorithm, podcast|listen|time, overal|engag]",3,"[mmr|1357|276, mb|1882|223]",2,6,83,17,0
2,S3.T1,tabl|1|top|10|most|stream|music|track|by|frenc...,[],"[to|go|further, we|present|in|tablexa01|the|to...",2408.16430v1,"[dataset, artistband, titl, countrylabel, sing...",7,[lfm2b|portishead|glori|box|gb|en|1994|trip|ho...,20,140,144,29,0
3,S3.T2,tabl|2|percentag|of|i|label|stream|ii|local|st...,[],[tablexa02|present|the|proport|of|label|stream...,2408.16430v1,"[countri, label|sourc, labeledstream, local|st...",5,"[franc|deezer|activ|76|50|38, deezer|origin|75...",9,45,339,23,0
4,S3.T1,tabl|1|result|of|two|twoway|anova|for|the|depe...,[],[while|our|analysi|found|no|signific|differ|in...,2409.15998v1,"[transpar, satisfact]",2,"[f|p|f|p, creator|1915|0167|creator|0302|0583,...",4,8,177,13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3354,S4.T8,tabl|viii|contribut|of|differ|modal|and|kg|inf...,[],[],2410.07654v1,"[ba, ka, va, ta, set, r20, m20, n20, h20, p20]",10,"[cold|036|005|011|048|002, warm|1281|576|655|1...",12,120,56,0,0
3355,S4.T1,n|tabl|i|n|statist|of|all|experiment|datasetsnnn,[],[],2410.07671v1,"[statist, technolog, servic, edurec]",4,"[candid|4726|10022|61567, item|34962|23866|208...",6,24,48,0,0
3356,S5.T2,n|tabl|ii|n|perform|of|disco|embed|in|four|bas...,[],[],2410.07671v1,"[dataset, technolog, servic]",3,[n|n|n|n|base|model|n|n|method|auc|hr5|ndcg5|h...,21,63,474,0,0
3357,S5.T3,n|tabl|iii|n|perform|of|disco|and|baselin|on|t...,[],[],2410.07671v1,"[dataset, edurec]",2,[n|n|n|n|base|n|n|model|n|n|method|auc|hr5|ndc...,21,42,358,0,0


In [4]:
# df_tables_columns is from parsing html of th df['table'] column
# df_tables_columns structure: table_id, column_name, file_id

df_tables_columns = pd.DataFrame(columns=['table_id', 'column_name', 'id_file'])

for index, row in df.iterrows():
    columns = get_table_columns(row['table'])
    for col in columns:
        df_tables_columns = pd.concat([df_tables_columns, pd.DataFrame({
            'table_id': row['id'],
            'column_name': [col],
            'id_file': [get_id_file(row['id_file'])]
        })])
    
df_tables_columns

Unnamed: 0,table_id,column_name,id_file
0,S3.T1,collect|label,2407.16895v1
0,S3.T1,definit,2407.16895v1
0,S4.T1,algorithm,2408.09168v1
0,S4.T1,podcast|listen|time,2408.09168v1
0,S4.T1,overal|engag,2408.09168v1
...,...,...,...
0,S5.T2,servic,2410.07671v1
0,S5.T3,dataset,2410.07671v1
0,S5.T3,edurec,2410.07671v1
0,S5.T4,dataset,2410.07671v1


In [5]:
# Save the dataframes to csv files
df_tables.to_csv('../../dataset/tables_clean.csv', index=False)
df_tables_columns.to_csv('../../dataset/tables_columns_clean.csv', index=False)