In [3]:
import pandas as pd

df = pd.read_csv('../../dataset/tables.csv')
df

Unnamed: 0,id,caption,table,footnotes,references,id_file
0,S5.T1,Table 1: Instruction tuning data with differen...,"<table class=""ltx_tabular ltx_guessed_headers ...",[],[],extraction/2406.12243v1.json
1,S5.T2,Table 2: Statistics of the datasets.,"<table class=""ltx_tabular ltx_guessed_headers ...",[],['Our model’s evaluation was conducted on dive...,extraction/2406.12243v1.json
2,S5.T3,Table 3: Overall performance comparison betwe...,"<table class=""ltx_tabular ltx_guessed_headers ...",[],['We have assessed the recommendation performa...,extraction/2406.12243v1.json
3,S4.T1,Table 1. Statistical Information of Adopted Da...,"<table class=""ltx_tabular ltx_guessed_headers ...",[],['Statistical information of the three dataset...,extraction/2406.10244v1.json
4,S4.T2,Table 2. Overall performance comparison betwee...,"<table class=""ltx_tabular ltx_guessed_headers ...",['Recommendation performance of GLINT-RU and e...,"['In this subsection, we compare the performan...",extraction/2406.10244v1.json
...,...,...,...,...,...,...
2473,A4.T7,Table 7. Configurations,"<table class=""ltx_tabular ltx_centering ltx_al...",[],"['Model Configurations\nFor all baselines, we ...",extraction/2408.06966v1.json
2474,A4.T8,Table 8. Sequence Length Settings,"<table class=""ltx_tabular ltx_centering ltx_al...",[],"['Then, we perform the grid search to find the...",extraction/2408.06966v1.json
2475,A5.T9,Table 9. AUC-ROC for transductive dynamic link...,"<table class=""ltx_tabular ltx_align_middle"" id...",[],['We show the AUC-ROC for transductive dynamic...,extraction/2408.06966v1.json
2476,A8.T10,Table 10. AP for inductive dynamic link predic...,"<table class=""ltx_tabular ltx_align_middle"" id...",[],['Performance on Dynamic Link Prediction.\nTo ...,extraction/2408.06966v1.json


# Preprocessing
splitting the data into 2 dataframes:
- tables
- tables_columns

In [4]:
import re
from lxml import html
from nltk.stem import PorterStemmer
from unidecode import unidecode

def get_table_columns(html_input):
    # Clean the HTML input
    html_input = html_input.replace('\n', '')
    tree = html.fromstring(html_input)

    # Extract the first row (header)
    header_row = tree.xpath('//tr[1]')

    if header_row:
        columns = []
        # Find all cells (<th> or <td>) in the first row
        cells = header_row[0].xpath('.//th | .//td')

        for cell in cells:
            # Extract regular text or the alternative text from <math> elements
            cell_text = ''.join(cell.xpath('.//text()')).strip()
            alttext = cell.xpath('.//math/@alttext')

            if alttext:
                cell_text = alttext[0]

            # Add the cell text to columns if it's not empty
            if cell_text:
                columns.append(clean_text(cell_text))
        return columns
    else:
        return []

def get_table_rows(html_input):
    # Analizza l'input HTML
    tree = html.fromstring(html_input)
    
    # Utilizza XPath per estrarre le righe della tabella
    rows = []
    for row in tree.xpath('//tr')[1:]:
        # Estrae il testo da tutte le celle (sia th che td) in una riga
        cells = row.xpath('.//th//text() | .//td//text()')

        # Aggiunge la riga pulita alla lista delle righe
        if cells:
            rows.append(clean_text(cells))
    
    return rows

def clean_text(text):
    # Remove leading and trailing whitespaces
    #cast to string
    text = str(text)

    text = text.strip()
    
    # Remove HTML tags and attributes
    text = re.sub(r'<[^>]*>', '', text)
    
    # De-accent
    text = unidecode(text)
    
    # sub dash underscore  with pipe
    text = re.sub(r'[-_]', '|', text)
    
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Stemming
    stemmer = PorterStemmer()
    text = '|'.join([stemmer.stem(word) for word in text.split()])
    
    return text

def get_id_file(path):
    file = path.split('/')[-1]
    return '.'.join(file.split('.')[:2])



In [5]:
# df_tables is the dataframe that contains the tables df

df_tables = df.copy()

#clean the id_file column
df_tables['id_file'] = df_tables['id_file'].apply(get_id_file)

df_tables['columns'] = df_tables['table'].apply(lambda x: get_table_columns(x))
df_tables['n_columns'] = df_tables['columns'].apply(len)

df_tables['rows'] = df_tables['table'].apply(lambda x: get_table_rows(x))
df_tables['n_rows'] = df_tables['rows'].apply(len)

df_tables['n_cells'] = df_tables['n_columns'] * df_tables['n_rows']
df_tables = df_tables[df_tables['n_cells'] > 0]

df_tables['caption'] = df_tables['caption'].apply(clean_text)
df_tables['caption_len'] = df_tables['caption'].apply(lambda x: len(x) if type(x) == str else 0)

df_tables['n_references'] = df_tables['references'].apply(lambda x: len([item for item in x[1:-1].split(', ') if item]) if isinstance(x, str) else 0)
df_tables['references'] = df_tables['references'].apply(lambda x: [clean_text(y) for y in x[1:-1].split(', ')] if isinstance(x, str) else [])

df_tables['n_footnotes'] = df_tables['footnotes'].apply(lambda x: len([item for item in x[1:-1].split(', ') if item]) if isinstance(x, str) else 0)
df_tables['footnotes'] = df_tables['footnotes'].apply(lambda x: [clean_text(y) for y in x[1:-1].split(', ')] if isinstance(x, str) else [])

df_tables = df_tables.drop(columns=['table'])

df_tables

Unnamed: 0,id,caption,footnotes,references,id_file,columns,n_columns,rows,n_rows,n_cells,caption_len,n_references,n_footnotes
0,S5.T1,tabl|1|instruct|tune|data|with|differ|rec|task...,[],[],2406.12243v1,[task|recommend|base|on|user|interest|and|role...,1,[n|n|task|recommend|base|on|domain|focusn|inst...,3,3,65,0,0
1,S5.T2,tabl|2|statist|of|the|dataset,[],[our|model|evalu|wa|conduct|on|divers|dataset|...,2406.12243v1,"[dataset, languag, user, new, click, new|inform]",6,[mind|22|n|english|1000000|161013|24155470|tit...,3,18,29,5,0
2,S5.T3,tabl|3|overal|perform|comparison|between|the|b...,[],[we|have|assess|the|recommend|perform|of|cherr...,2406.12243v1,"[method, mind, yahoo, adressa]",4,[r|r|bmr|bolditalicr|2|r|r|bmr|bolditalicr|2|r...,2,8,186,6,0
3,S4.T1,tabl|1|statist|inform|of|adopt|dataset,[],[statist|inform|of|the|three|dataset|is|shown|...,2406.10244v1,"[dataset, user, item, interact, avglength, spa...",6,"[ml1m|6041|3707|1000209|16560|9553, beauti|223...",3,18,38,24,0
4,S4.T2,tabl|2|overal|perform|comparison|between|glint...,[recommend|perform|of|glintru|and|exist|stateo...,"[in|thi|subsect, we|compar|the|perform|of|glin...",2406.10244v1,"[model, ml1m, amazonbeauti, amazonvideogam]",4,[recall10|mrr10|ndcg10|recall10|mrr10|ndcg10|r...,9,36,60,9,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,A4.T7,tabl|7|configur,[],"[model|configurationsnfor|all|baselin, we|foll...",2408.06966v1,"[configur, set]",2,[xa0xa0xa0xa0xa0xa0learn|rate|xa0xa0xa0xa0xa0x...,12,24,15,6,0
2474,A4.T8,tabl|8|sequenc|length|set,[],"[then, we|perform|the|grid|search|to|find|the|...",2408.06966v1,"[dataset, sequenc|length]",2,[xa0xa0xa0xa0xa0xa0xa0xa0xa0wikipedia|xa0xa0xa...,12,24,25,11,0
2475,A5.T9,tabl|9|aucroc|for|transduct|dynam|link|predict...,[],[we|show|the|aucroc|for|transduct|dynam|link|p...,2408.06966v1,"[nss, dataset, jodi, dyrep, tgat, tgn, cawn, e...",13,[rnd|wikipedia|9633|plusorminu|pm|007|9437|plu...,39,507,95,5,0
2476,A8.T10,tabl|10|ap|for|induct|dynam|link|predict|with|...,[],[perform|on|dynam|link|predictionnto|demonstr|...,2408.06966v1,"[nss, dataset, jodi, dyrep, tgat, tgn, cawn, t...",12,[rnd|wikipedia|9482|plusorminu|pm|020|9243|plu...,39,468,89,10,0


In [6]:
# df_tables_columns is from parsing html of th df['table'] column
# df_tables_columns structure: table_id, column_name, file_id

df_tables_columns = pd.DataFrame(columns=['table_id', 'column_name', 'id_file'])

for index, row in df.iterrows():
    columns = get_table_columns(row['table'])
    for col in columns:
        df_tables_columns = pd.concat([df_tables_columns, pd.DataFrame({
            'table_id': row['id'],
            'column_name': [col],
            'id_file': [get_id_file(row['id_file'])]
        })])
    
df_tables_columns



Unnamed: 0,table_id,column_name,id_file
0,S5.T1,task|recommend|base|on|user|interest|and|role|...,2406.12243v1
0,S5.T2,dataset,2406.12243v1
0,S5.T2,languag,2406.12243v1
0,S5.T2,user,2406.12243v1
0,S5.T2,new,2406.12243v1
...,...,...,...
0,A8.T11,tcl,2408.06966v1
0,A8.T11,graphmix,2408.06966v1
0,A8.T11,freedyg,2408.06966v1
0,A8.T11,dygform,2408.06966v1


In [7]:
# find column name with most length
df_tables_columns['column_name_len'] = df_tables_columns['column_name'].apply(lambda x: len(x))
df_tables_columns

Unnamed: 0,table_id,column_name,id_file,column_name_len
0,S5.T1,task|recommend|base|on|user|interest|and|role|...,2406.12243v1,274
0,S5.T2,dataset,2406.12243v1,7
0,S5.T2,languag,2406.12243v1,7
0,S5.T2,user,2406.12243v1,4
0,S5.T2,new,2406.12243v1,3
...,...,...,...,...
0,A8.T11,tcl,2408.06966v1,3
0,A8.T11,graphmix,2408.06966v1,8
0,A8.T11,freedyg,2408.06966v1,7
0,A8.T11,dygform,2408.06966v1,7


In [8]:
# Save the dataframes to csv files
df_tables.to_csv('../../dataset/tables_clean.csv', index=False)
df_tables_columns.to_csv('../../dataset/tables_columns_clean.csv', index=False)