In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the HiTab dev samples: a JSON Lines file with one JSON object per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# default locale.
with open("HiTab/dev_samples.jsonl", "r", encoding="utf-8") as jsonl_file:
    # Blank lines (e.g. an extra empty line at the end of the file) are skipped;
    # json.loads would raise on an empty string.
    data = [json.loads(line) for line in jsonl_file if line.strip()]

# Keep the first record under its own name for inspection.
table = data[0]

In [3]:
# Display the first loaded record to see which fields a sample provides.
table

{'id': 'e3ea71e57afb31f0b257426444523c3e',
 'table_id': '0_1_nsf21326-tab001',
 'table_source': 'nsf',
 'sentence_id': '1',
 'sub_sentence_id': '1',
 'sub_sentence': 'if the pre-production development activities were to be included, the fy 2017 r&d budget authority would have been $155.0 billion instead of the $125.3 billion in actual budget authority.',
 'question': 'if the pre-production development activities were to be included, how many dollars would the fy 2017 r&d budget authority have been?',
 'answer': [154983.0],
 'aggregation': ['none'],
 'linked_cells': {'entity_link': {'top': {'the fy 2017 r&d budget': {'(0, 1)': '2017 actual'}},
   'left': {'pre-production development activities': {'(18, 0)': 'total'}},
   'top_left_corner': {}},
  'quantity_link': {'125.3 billion': {'(17, 1)': 125289.0},
   '[ANSWER]': {'(18, 1)': 154983.0}}},
 'answer_formulas': ['=B21'],
 'reference_cells_map': {'B21': '(18, 1)'}}

In [14]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers checkpoint to load.
MODEL_NAME = 'all-MiniLM-L6-v2'

# Build the encoder once at module level; encode_text() reads this global.
model = SentenceTransformer(MODEL_NAME)

# Helper functions for serializing and embedding record fields

def encode_text(text):
    """Embed ``text`` with the module-level ``model``.

    Args:
        text: Value handed unchanged to ``model.encode``. Every caller in this
            notebook passes a single string.

    Returns:
        Whatever ``model.encode(text)`` returns.
    """
    embedding = model.encode(text)
    return embedding

def serialize_dict(d):
    """Render ``d`` as a JSON string with keys sorted at every nesting level.

    Sorting the keys makes the output independent of the insertion order of
    the input mapping.

    Args:
        d: A JSON-serializable object (in this notebook, a dict).

    Returns:
        The JSON text as a ``str``.
    """
    sorted_key_encoder = json.JSONEncoder(sort_keys=True)
    return sorted_key_encoder.encode(d)

def serialize_list(lst):
    """Join the ``str()`` form of every element of ``lst`` with ``', '``.

    Args:
        lst: Iterable of values; each one is converted with ``str``.

    Returns:
        A single string, empty when ``lst`` has no elements.
    """
    pieces = [str(element) for element in lst]
    return ', '.join(pieces)

def get_embeddings(table):
    """Embed each component of one record and combine the results into an array.

    Args:
        table: Mapping for a single record. It is indexed with the keys
            ``id``, ``table_id``, ``table_source``, ``sentence_id``,
            ``sub_sentence_id``, ``sub_sentence``, ``question``,
            ``linked_cells``, ``answer_formulas``, ``reference_cells_map``,
            ``answer`` and ``aggregation``.

    Returns:
        ``np.array`` built from eight embeddings, in this order: metadata,
        sub-sentence, question, linked cells, formulas, reference map,
        answers, aggregations. Assuming the encoder returns equal-length
        vectors, the result is 2-D with one row per component.

    Raises:
        KeyError: If ``table`` is a dict missing one of the keys above.
    """
    # The identifier fields are flattened into one "key: value | key: value" line.
    identifier_fields = ('id', 'table_id', 'table_source', 'sentence_id', 'sub_sentence_id')
    metadata_text = " | ".join(f"{name}: {table[name]}" for name in identifier_fields)

    # Text form of every component, listed in the row order of the result.
    # Dicts become sorted-key JSON; lists (formulas, answers, aggregation
    # names) become comma-separated strings, so numeric answers are embedded
    # as text.
    component_texts = [
        metadata_text,
        table['sub_sentence'],
        table['question'],
        serialize_dict(table['linked_cells']),
        serialize_list(table['answer_formulas']),
        serialize_dict(table['reference_cells_map']),
        serialize_list(table['answer']),
        serialize_list(table['aggregation']),
    ]

    # Each text is embedded separately; the embeddings are kept as individual
    # rows rather than being concatenated or averaged.
    return np.array([encode_text(text) for text in component_texts])

def encode_data(data):
    """Embed every record in ``data`` and stack the results along a new first axis.

    Args:
        data: Iterable of records, each accepted by ``get_embeddings``.

    Returns:
        ``np.ndarray`` whose first axis indexes the records of ``data`` in
        order.

    Raises:
        ValueError: Raised by ``np.stack`` when ``data`` yields no records.
    """
    per_record_embeddings = []
    for record in data:
        per_record_embeddings.append(get_embeddings(record))
    return np.stack(per_record_embeddings)

In [16]:
# Embed every loaded record; the first axis of X indexes the records of `data`.
X = encode_data(data)

In [18]:
# Write the embeddings as a compressed archive, matching the file name
# (plain np.savez stores the array uncompressed). The array is passed
# positionally, so it is saved under the default key "arr_0":
#     np.load('compressedHiTab.npz')['arr_0']
np.savez_compressed('compressedHiTab.npz', X)