In [1]:
import ratransformers
import pandas as pd
from transformers import BartTokenizer


ratransformer = ratransformers.RATransformer(
    "nielsr/tapex-large-finetuned-tabfact", 
    relation_kinds=['is_value_of_column', 'is_from_same_row'],
)
model = ratransformer.model
tokenizer = ratransformer.tokenizer

# create table
data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
table = pd.DataFrame.from_dict(data)

# turn into dict
table_dict = {"header": list(table.columns), "rows": [list(row.values) for i,row in table.iterrows()]}

In [2]:
table

Unnamed: 0,Actors,Number of movies
0,Brad Pitt,87
1,Leonardo Di Caprio,53
2,George Clooney,69


In [3]:
table_dict

{'header': ['Actors', 'Number of movies'],
 'rows': [['Brad Pitt', '87'],
  ['Leonardo Di Caprio', '53'],
  ['George Clooney', '69']]}

In [4]:
from collections import defaultdict 
import itertools


class IndexedRowTableLinearize:
    # adapted from https://github.com/microsoft/Table-Pretraining/blob/main/tapex/processor/table_linearize.py
    """
    FORMAT: col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
    """

    def process_input(self, sentence, table_content):
        """
        Given a sentence+ table, converts it into a flatten sequence with special symbols.
        Also returns the word relations
        """
        assert "header" in table_content and "rows" in table_content
        
        input_text = sentence
        word_relations = defaultdict(dict)
        
        # process header
        input_text += "col : "
        col_id_to_span = {}
        for i, col in enumerate(table_content["header"]):
            col_id_to_span[i] = (len(input_text), len(input_text) + len(col))
            input_text += f"{col} | " 
                    
        # process rows
        for row_index, row in enumerate(table_content["rows"]):
            input_text += f"row {row_index + 1} : "
            
            all_cell_spans = []
            for i, cell_value in enumerate(row):
                cell_value = str(cell_value)
                cell_span = (len(input_text), len(input_text) + len(cell_value))
                all_cell_spans.append(cell_span)
                
                # save word relation - row value belong to specific column
                word_relations[cell_span][col_id_to_span[i]] = "is_value_of_column"

                input_text += f"{cell_value} | "
                
            # save word relation - all values belong to same row
            for (span_i, span_j) in itertools.permutations(all_cell_spans, 2):
                 word_relations[span_i][span_j] = "is_from_same_row"
        
        if input_text.endswith(' | '): # remove trailing characters
            input_text = input_text[:-len(' | ')]
        
        return input_text, word_relations

In [5]:
linearizer = IndexedRowTableLinearize()

sentence = "George Clooney has 69 movies"
joint_input, word_relations = linearizer.process_input(sentence, table_dict)

# encode 
encoding = tokenizer(joint_input, return_tensors="pt", input_relations=word_relations)

# forward pass
outputs = model(**encoding)

# print prediction
logits = outputs.logits
print(logits.argmax(-1))

tensor([0])


In [6]:
joint_input

'George Clooney has 69 moviescol : Actors | Number of movies | row 1 : Brad Pitt | 87 | row 2 : Leonardo Di Caprio | 53 | row 3 : George Clooney | 69'

In [7]:
word_relations

defaultdict(dict,
            {(70, 79): {(34, 40): 'is_value_of_column',
              (82, 84): 'is_from_same_row'},
             (82, 84): {(43, 59): 'is_value_of_column',
              (70, 79): 'is_from_same_row'},
             (95, 113): {(34, 40): 'is_value_of_column',
              (116, 118): 'is_from_same_row'},
             (116, 118): {(43, 59): 'is_value_of_column',
              (95, 113): 'is_from_same_row'},
             (129, 143): {(34, 40): 'is_value_of_column',
              (146, 148): 'is_from_same_row'},
             (146, 148): {(43, 59): 'is_value_of_column',
              (129, 143): 'is_from_same_row'}})

**Your model is now ready to be trained with relational information in the input!**

Check the standard procedure to train HuggingFace 🤗 models in [here](https://huggingface.co/docs/transformers/training).