In [2]:
import json
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from torch.utils.data import Dataset
import torch

In [3]:
import json
import pandas as pd

class Header:
    """Column names of a table together with the type declared for each column."""

    def __init__(self, names: list, types: list):
        # Two parallel lists: names[i] is described by types[i].
        self.names, self.types = names, types

    def __len__(self):
        # The header is as long as its list of column names.
        return len(self.names)

    def __getitem__(self, idx):
        # A column is addressed as a (name, type) pair.
        name = self.names[idx]
        col_type = self.types[idx]
        return name, col_type

    def __repr__(self):
        # Rendered as "name(type) | name(type) | ...".
        cells = []
        for name, col_type in zip(self.names, self.types):
            cells.append('{}({})'.format(name, col_type))
        return ' | '.join(cells)

class Table:
    """A single table: identifying metadata, a header and the raw rows.

    The rows are exposed as a pandas DataFrame through the `df` property,
    which is built on first access and cached afterwards.
    """

    def __init__(self, id, name, title, header: Header, rows, **kwargs):
        # Any additional keyword arguments are accepted and ignored.
        self.id, self.name, self.title = id, name, title
        self.header = header
        self.rows = rows
        self._df = None  # populated lazily by the `df` property

    @property
    def df(self):
        """DataFrame view of the rows, with header names as columns and dtype str."""
        if self._df is not None:
            return self._df
        frame = pd.DataFrame(data=self.rows,
                             columns=self.header.names,
                             dtype=str)
        self._df = frame
        return frame

    def _repr_html_(self):
        # Rich notebook display is delegated to the DataFrame.
        frame = self.df
        return frame._repr_html_()

class Tables:
    """Collection of tables keyed by their `id` attribute."""

    table_dict = None

    def __init__(self, table_list: list = None, table_dict: dict = None):
        self.table_dict = {}
        # Only a real list is consumed; each table is registered under table.id.
        if isinstance(table_list, list):
            self.table_dict.update((table.id, table) for table in table_list)
        # Entries of an explicit mapping are merged in afterwards.
        if isinstance(table_dict, dict):
            self.table_dict.update(table_dict)

    def push(self, table):
        """Register `table` under its id, replacing any table already stored there."""
        self.table_dict.update({table.id: table})

    def __len__(self):
        return len(self.table_dict)

    def __add__(self, other):
        """Return a new collection holding the tables of both operands."""
        merged = list(self.table_dict.values())
        merged.extend(other.table_dict.values())
        return Tables(table_list=merged)

    def __getitem__(self, id):
        return self.table_dict[id]

    def __iter__(self):
        # Iteration yields (table_id, table) pairs.
        yield from self.table_dict.items()

class Question:
    """Thin wrapper around the text of a natural-language question."""

    def __init__(self, text):
        self.text = text

    def __len__(self):
        # Length of the wrapped text.
        return len(self.text)

    def __getitem__(self, idx):
        # Indexing and slicing are forwarded to the wrapped text.
        return self.text[idx]

    def __repr__(self):
        # The representation is the raw text itself.
        return self.text

class SQL:
    """Structured form of a query label: selections, aggregations and conditions.

    Selections and conditions are stored in a canonical order so that two
    instances describing the same query serialise to the same JSON string,
    which is what the `equal_*` comparison modes rely on.
    """

    # Integer code -> keyword, used when rendering a readable representation.
    op_sql_dict = {0: ">", 1: "<", 2: "==", 3: "!="}
    agg_sql_dict = {0: "", 1: "AVG", 2: "MAX", 3: "MIN", 4: "COUNT", 5: "SUM"}
    conn_sql_dict = {0: "NULL", 1: "AND", 2: "OR"}

    def __init__(self, cond_conn_op: int, agg: list, sel: list, conds: list, **kwargs):
        """Store the query parts in canonical order.

        Args:
            cond_conn_op: key of `conn_sql_dict`.
            agg: aggregation codes, parallel to `sel`.
            sel: selected column ids.
            conds: conditions, each indexable as [column, operator, value, ...].
            **kwargs: accepted and ignored.
        """
        self.cond_conn_op = cond_conn_op
        # Sort on (column, aggregation), not on the column alone: with a
        # column-only key, two selections of the same column kept their input
        # order, so equivalent queries could serialise differently.
        sel_agg_pairs = sorted(zip(sel, agg))
        self.sel = [col_id for col_id, _ in sel_agg_pairs]
        self.agg = [agg_op for _, agg_op in sel_agg_pairs]
        # Same reasoning for conditions that share a column.
        self.conds = sorted(conds, key=self._cond_sort_key)

    @staticmethod
    def _cond_sort_key(cond):
        """Total ordering key for a condition: column id, then remaining parts as text."""
        # Remaining parts are compared as strings so values of mixed types
        # never raise a TypeError during sorting.
        return (cond[0], *(str(part) for part in cond[1:]))

    @staticmethod
    def _projected_json(sql, cond_conn_op, conds):
        """Serialise `sql` with its connector and conditions replaced by the given ones."""
        return SQL(cond_conn_op=cond_conn_op, agg=sql.agg, sel=sql.sel, conds=conds).to_json()

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance from a dict whose keys match the constructor arguments."""
        return cls(**data)

    def keys(self):
        # Together with __getitem__ this lets dict(self) work.
        return ['cond_conn_op', 'sel', 'agg', 'conds']

    def __getitem__(self, key):
        return getattr(self, key)

    def to_json(self):
        """Return the canonical JSON string (sorted keys, non-ASCII kept as is)."""
        return json.dumps(dict(self), ensure_ascii=False, sort_keys=True)

    def equal_all_mode(self, other):
        """True when every part, including condition values, matches."""
        return self.to_json() == other.to_json()

    def equal_agg_mode(self, other):
        """True when selections and aggregations match; connector and conditions are ignored."""
        return self._projected_json(self, 0, []) == self._projected_json(other, 0, [])

    def equal_conn_and_agg_mode(self, other):
        """True when selections, aggregations and the connector match; conditions are ignored."""
        self_json = self._projected_json(self, self.cond_conn_op, [])
        other_json = self._projected_json(other, other.cond_conn_op, [])
        return self_json == other_json

    def equal_no_val_mode(self, other):
        """True when everything matches except the condition values.

        Only the first two entries of each condition (column, operator) are compared.
        """
        self_json = self._projected_json(
            self, self.cond_conn_op, [cond[:2] for cond in self.conds])
        other_json = self._projected_json(
            other, other.cond_conn_op, [cond[:2] for cond in other.conds])
        return self_json == other_json

    def __eq__(self, other):
        # `==` is deliberately unsupported: callers must pick an equal_* mode.
        raise NotImplementedError('compare mode not set')

    def __repr__(self):
        """Multi-line readable form with codes translated through the class dicts."""
        lines = [
            "sel: {}".format(self.sel),
            "agg: {}".format([self.agg_sql_dict[a] for a in self.agg]),
            "cond_conn_op: '{}'".format(self.conn_sql_dict[self.cond_conn_op]),
            "conds: {}".format(
                [[cond[0], self.op_sql_dict[cond[1]], cond[2]] for cond in self.conds]),
        ]
        return '\n'.join(lines)

    def _repr_html_(self):
        # Notebook display: the text form with newlines turned into <br>.
        return self.__repr__().replace('\n', '<br>')

class Query:
    """One example: a question, the table it refers to and, optionally, its SQL label."""

    def __init__(self, question: Question, table: Table, sql: SQL = None):
        self.question, self.table, self.sql = question, table, sql

    def _repr_html_(self):
        """Notebook display: table, question and SQL (if any) separated by <br>."""
        table_html = self.table._repr_html_()
        question_text = self.question.__repr__()
        # An unlabeled query contributes an empty third section.
        if self.sql is None:
            sql_html = ''
        else:
            sql_html = self.sql._repr_html_()
        return '{}<br>{}<br>{}'.format(table_html, question_text, sql_html)

def read_tables(table_file):
    """Read a JSON-lines table file into a `Tables` collection.

    Each non-blank line is parsed as one JSON object. Its 'header' and 'types'
    entries form the `Header`; all remaining entries are passed to `Table` as
    keyword arguments.

    Args:
        table_file: path of the UTF-8 encoded JSON-lines file.

    Returns:
        A `Tables` collection with one entry per parsed line.
    """
    tables = Tables()
    with open(table_file, encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. a trailing newline) is not valid JSON; skip it
            # instead of failing in json.loads.
            if not line.strip():
                continue
            tb = json.loads(line)
            header = Header(tb.pop('header'), tb.pop('types'))
            tables.push(Table(header=header, **tb))
    return tables

def read_data(data_file, tables: Tables):
    """Read a JSON-lines data file into a list of `Query` objects.

    Each non-blank line is parsed as one JSON object with a 'question', a
    'table_id' that is looked up in `tables`, and an optional 'sql' entry.

    Args:
        data_file: path of the UTF-8 encoded JSON-lines file.
        tables: collection used to resolve each record's 'table_id'.

    Returns:
        List of `Query` objects in file order; `sql` is None for records
        without an 'sql' entry.
    """
    queries = []
    with open(data_file, encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. a trailing newline) is not valid JSON; skip it
            # instead of failing in json.loads.
            if not line.strip():
                continue
            data = json.loads(line)
            question = Question(text=data['question'])
            table = tables[data['table_id']]
            sql = SQL.from_dict(data['sql']) if 'sql' in data else None
            queries.append(Query(question=question, table=table, sql=sql))
    return queries


In [4]:
# Table file and data file of the training split (paths relative to this notebook).
train_table_file = '../../TableQA-master/train/train.tables.json'
train_data_file = '../../TableQA-master/train/train.json'

In [5]:
# Tables are loaded first because read_data resolves each record's table_id against them.
train_tables = read_tables(train_table_file)
train_data = read_data(train_data_file, train_tables)

In [6]:
# Number of tables held by the training Tables collection.
print(len(train_tables))

5013


In [7]:
# Table file and data file of the test split (paths relative to this notebook).
test_table_file = '../../TableQA-master/test/test.tables.json'
test_data_file = '../../TableQA-master/test/test.json'

# Same two-step loading as for the training split: tables first, then queries.
test_tables = read_tables(test_table_file)
test_data = read_data(test_data_file, test_tables)

In [8]:
# read_data stores None when a record has no 'sql' entry; check the first test query.
test_data[0].sql is None

True

In [9]:
# Display the first training query (rendered through Query._repr_html_).
train_data[0]

Unnamed: 0,影片名称,周票房（万）,票房占比（%）,场均人次
0,死侍2：我爱我家,10637.3,25.8,5.0
1,白蛇：缘起,10503.8,25.4,7.0
2,大黄蜂,6426.6,15.6,6.0
3,密室逃生,5841.4,14.2,6.0
4,“大”人物,3322.9,8.1,5.0
5,家和万事惊,635.2,1.5,25.0
6,钢铁飞龙之奥特曼崛起,595.5,1.4,3.0
7,海王,500.3,1.2,5.0
8,一条狗的回家路,360.0,0.9,4.0
9,掠食城市,356.6,0.9,3.0


In [10]:
# Question.__repr__ returns the raw text, so str() gives the question string.
str(train_data[0].question)

'二零一九年第四周大黄蜂和密室逃生这两部影片的票房总占比是多少呀'

In [11]:
# Pick one training example and display its SQL label.
idx = 333
train_data[idx].sql


In [12]:
# Translate this example's cond_conn_op code into its connector keyword.
train_data[idx].sql.conn_sql_dict[train_data[idx].sql.cond_conn_op]

'NULL'

In [13]:
# '<connector>-<number of conditions>' string; CustomDataset.__getitem__ builds the same string as W_num_op_label.
train_data[idx].sql.conn_sql_dict[train_data[idx].sql.cond_conn_op]+'-'+str(len(train_data[idx].sql.conds))

'NULL-1'

In [14]:
# Number of entries in this example's conds list.
len(train_data[idx].sql.conds)

1

In [15]:
# Display the table of example 222 (rendered through Table._repr_html_, i.e. its DataFrame).
train_data[222].table

Unnamed: 0,时间,锂电池需求量,锂电池YoY,三元电池需求量,三元电池YoY,磷酸铁锂&钴酸锂需求量,磷酸铁锂&钴酸锂YoY
0,2016A,28.3,--,7.5,--,20.8,--
1,2017A,36.2,27.8%,15.8,111.9%,20.4,-2.3%
2,2018E,48.3,33.6%,27.5,74.2%,20.8,2.1%
3,2019E,73.8,52.7%,52.2,89.5%,21.6,4.0%
4,2020E,110.0,49.1%,88.6,69.9%,21.4,-1.2%


In [16]:
# Header.__repr__ renders 'name(type)' entries joined by ' | '.
train_data[0].table.header

影片名称(text) | 周票房（万）(real) | 票房占比（%）(real) | 场均人次(real)

In [17]:
# Indexing a Header returns a (name, type) tuple.
train_data[0].table.header[0]

('影片名称', 'text')

In [18]:
# Number of training queries returned by read_data.
len(train_data)

41522

In [19]:
# Join the column names with '|'. The result is used as sentence 2 in BertTokenizer,
# after which the token id of '|' is modified in token_ids.
# A generator expression replaces the former `for header, type in ...` loop so that
# the builtin `type` is no longer overwritten by a notebook-global loop variable.
header_sent = '|'.join(name for name, _ in train_data[0].table.header)
header_sent

'影片名称|周票房（万）|票房占比（%）|场均人次'

In [20]:
# Character offsets of every '|' separator inside header_sent.
res_list = [pos for pos in range(len(header_sent)) if header_sent[pos] == '|']
res_list

[4, 11, 19]

In [21]:

import numpy as np
import torch
from transformers import BertTokenizer
from torch.utils.data import Dataset

import config

class CustomDataset(Dataset):
    """Dataset that encodes each query as a (question, header) sentence pair.

    Implements __len__ and __getitem__. Column names are joined with a
    temporary separator before tokenization; the separator's token id is then
    replaced by the [SEP] token id, and the positions of all [SEP] tokens are
    returned with the encoded inputs.
    """

    def __init__(self, data, has_label=True, max_len=config.MAX_LEN, model_name=config.BASE_MODEL_PATH, SEP_temp='|', cls_token='[unused8]'):
        """
        Args:
            data: indexable collection of queries exposing `.question`,
                `.table.header` and, when labeled, `.sql`.
            has_label: whether `__getitem__` should also return label tensors.
            max_len: length every sequence is padded or truncated to.
            model_name: name or path handed to `BertTokenizer.from_pretrained`.
            SEP_temp: temporary separator placed between column names; it is
                expected to be a single character that maps to one token.
            cls_token: token passed to the tokenizer as its `cls_token`.
        """
        self.data = data
        self.max_len = max_len
        self.tokenizer = BertTokenizer.from_pretrained(model_name, cls_token=cls_token)
        # One index per sample; not used elsewhere in this class.
        self.indexes = np.arange(len(self.data))
        self.has_label = has_label
        self.SEP_temp = SEP_temp
        # encode() wraps the text in special tokens, so position 1 holds the
        # id of the token itself.
        self.SEP_temp_id = self.tokenizer.encode(SEP_temp)[1]
        self.SEP_id = self.tokenizer.encode('[SEP]')[1]
        self.XLS_id = self.tokenizer.encode(cls_token)[1]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Encode sample `idx`.

        Returns:
            Dict with 'token_ids', 'token_type_ids', 'attention_masks' and
            'SEP_position_list'; when `has_label` is True it also contains the
            label tensors 'S_num' and 'W_num_op'.
        """
        query = self.data[idx]
        question = str(query.question)

        # Column names separated by the temporary separator. join() adds no
        # trailing separator, so nothing has to be sliced off afterwards.
        header_sent = self.SEP_temp.join(name for name, _ in query.table.header)

        embeddings = self.tokenizer(
            question, header_sent,                    # sentence 1, sentence 2
            padding='max_length',                     # pad to max_length
            truncation=True,                          # truncate to max_length
            max_length=self.max_len,
            return_tensors='pt'                       # return torch.Tensor objects
        )

        # Drop only the leading batch dimension added by return_tensors='pt'.
        token_ids = embeddings['input_ids'].squeeze(0)
        # Every temporary separator becomes a [SEP] token.
        token_ids = torch.where(token_ids == self.SEP_temp_id, torch.tensor(self.SEP_id), token_ids)

        attention_masks = embeddings['attention_mask'].squeeze(0)     # 1 for real tokens, 0 for padding
        token_type_ids = embeddings['token_type_ids'].squeeze(0)      # 0 for sentence 1, 1 for sentence 2

        # Positions of every [SEP] token, found in one vectorised call rather
        # than a Python loop over each tensor element.
        sep_positions = torch.nonzero(token_ids == self.SEP_id, as_tuple=True)[0]

        sample = {
            ## X
            'token_ids': token_ids,
            'token_type_ids': token_type_ids,
            'attention_masks': attention_masks,
            'SEP_position_list': sep_positions,
        }

        # Labels are attached for training / validation data only.
        if self.has_label:
            sql = query.sql
            # '<connector>-<number of conditions>' and the number of selected columns.
            W_num_op_label = sql.conn_sql_dict[sql.cond_conn_op] + '-' + str(len(sql.conds))
            S_num_label = len(sql.sel)

            ## y
            sample['S_num'] = torch.tensor(config.S_num_label2id[S_num_label])
            sample['W_num_op'] = torch.tensor(config.W_num_op_label2id[W_num_op_label])

        return sample
            

In [22]:
# Wrap the training queries; has_label defaults to True, so each item also carries label tensors.
train_set = CustomDataset(train_data)

In [23]:
# Validation split, loaded from the paths defined in config.
val_tables = read_tables(config.val_table_file)
val_data = read_data(config.val_data_file, val_tables)
val_set = CustomDataset(val_data)


# Test split, loaded from the paths defined in config. This rebinds test_tables and
# test_data, which an earlier cell loaded from hard-coded relative paths.
test_tables = read_tables(config.test_table_file)
test_data = read_data(config.test_data_file, test_tables)
# has_label=False: items contain the model inputs only.
test_set = CustomDataset(test_data,has_label=False )

In [24]:

import numpy as np
import torch
from transformers import BertTokenizer
from torch.utils.data import Dataset

import config

class CustomDataset2(Dataset):
    """Dataset that encodes each query as a (question, header) sentence pair.

    Implements __len__ and __getitem__. Column names are joined with a
    temporary separator before tokenization; the separator's token id is then
    replaced by the [SEP] token id. Every returned value is a fixed-size
    tensor.
    """

    def __init__(self, data, has_label=True, max_len=config.MAX_LEN, model_name=config.BASE_MODEL_PATH, SEP_temp='|', cls_token='[unused8]'):
        """
        Args:
            data: indexable collection of queries exposing `.question`,
                `.table.header` and, when labeled, `.sql`.
            has_label: whether `__getitem__` should also return label tensors.
            max_len: length every sequence is padded or truncated to.
            model_name: name or path handed to `BertTokenizer.from_pretrained`.
            SEP_temp: temporary separator placed between column names; it is
                expected to be a single character that maps to one token.
            cls_token: token passed to the tokenizer as its `cls_token`.
        """
        self.data = data
        self.max_len = max_len
        self.tokenizer = BertTokenizer.from_pretrained(model_name, cls_token=cls_token)
        # One index per sample; not used elsewhere in this class.
        self.indexes = np.arange(len(self.data))
        self.has_label = has_label
        self.SEP_temp = SEP_temp
        # encode() wraps the text in special tokens, so position 1 holds the
        # id of the token itself.
        self.SEP_temp_id = self.tokenizer.encode(SEP_temp)[1]
        self.SEP_id = self.tokenizer.encode('[SEP]')[1]
        self.XLS_id = self.tokenizer.encode(cls_token)[1]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Encode sample `idx`.

        Returns:
            Dict with 'token_ids', 'token_type_ids' and 'attention_masks';
            when `has_label` is True it also contains the label tensors
            'S_num' and 'W_num_op'.
        """
        query = self.data[idx]
        question = str(query.question)

        # Column names separated by the temporary separator. join() adds no
        # trailing separator, so nothing has to be sliced off afterwards.
        header_sent = self.SEP_temp.join(name for name, _ in query.table.header)

        embeddings = self.tokenizer(
            question, header_sent,                    # sentence 1, sentence 2
            padding='max_length',                     # pad to max_length
            truncation=True,                          # truncate to max_length
            max_length=self.max_len,
            return_tensors='pt'                       # return torch.Tensor objects
        )

        # Drop only the leading batch dimension added by return_tensors='pt'.
        token_ids = embeddings['input_ids'].squeeze(0)
        # Every temporary separator becomes a [SEP] token.
        token_ids = torch.where(token_ids == self.SEP_temp_id, torch.tensor(self.SEP_id), token_ids)

        attention_masks = embeddings['attention_mask'].squeeze(0)     # 1 for real tokens, 0 for padding
        token_type_ids = embeddings['token_type_ids'].squeeze(0)      # 0 for sentence 1, 1 for sentence 2

        sample = {
            ## X
            'token_ids': token_ids,
            'token_type_ids': token_type_ids,
            'attention_masks': attention_masks,
        }

        # Labels are attached for training / validation data only.
        if self.has_label:
            sql = query.sql
            # '<connector>-<number of conditions>' and the number of selected columns.
            W_num_op_label = sql.conn_sql_dict[sql.cond_conn_op] + '-' + str(len(sql.conds))
            S_num_label = len(sql.sel)

            ## y
            sample['S_num'] = torch.tensor(config.S_num_label2id[S_num_label])
            sample['W_num_op'] = torch.tensor(config.W_num_op_label2id[W_num_op_label])

        return sample
            

In [1]:
# Slicing reminder: the end index is exclusive, so a[1:3] holds the items at index 1 and 2.
a = [1,2,3,4,13,4,3]
a[1:3]

[2, 3]

In [35]:
def _collate_features(batch_data):
    """Order a batch by number of [SEP] positions and gather the model inputs.

    Returns:
        (ordered, collated): the samples sorted by descending length of their
        'SEP_position_list', and a dict with the per-sample input tensors as
        lists plus the SEP positions padded with 0 into one tensor.
    """
    # sorted() returns a new list, so the caller's batch is left untouched.
    ordered = sorted(batch_data, key=lambda xi: len(xi['SEP_position_list']), reverse=True)

    collated = {
        'token_ids': [xi['token_ids'] for xi in ordered],
        'token_type_ids': [xi['token_type_ids'] for xi in ordered],
        'attention_masks': [xi['attention_masks'] for xi in ordered],
        # Samples have different numbers of SEP positions; pad to the longest.
        'SEP_position_list': torch.nn.utils.rnn.pad_sequence(
            [xi['SEP_position_list'] for xi in ordered],
            batch_first=True, padding_value=0),
    }
    return ordered, collated


def collate_fn(batch_data):
    """Collate labeled samples.

    Args:
        batch_data: list of sample dicts containing 'token_ids',
            'token_type_ids', 'attention_masks', 'SEP_position_list',
            'S_num' and 'W_num_op'.

    Returns:
        Dict with the same keys. 'SEP_position_list' is a padded tensor; every
        other value is a list of per-sample tensors, ordered by descending
        number of SEP positions.
    """
    ordered, collated = _collate_features(batch_data)
    collated['S_num'] = [xi['S_num'] for xi in ordered]
    collated['W_num_op'] = [xi['W_num_op'] for xi in ordered]
    return collated


def collate_fn_labelless(batch_data):
    """Collate samples that carry no labels.

    Same as `collate_fn` but without the 'S_num' and 'W_num_op' entries.
    """
    _, collated = _collate_features(batch_data)
    return collated

In [36]:
from torch.utils.data import DataLoader

# CustomDataset2 items contain no 'SEP_position_list', so no collate_fn is passed
# here and the DataLoader's default collation is used.
val_set2 = CustomDataset2(val_data)
val_dataloader2 = DataLoader(
    dataset=val_set2, 
    batch_size=2, 
    shuffle=True, 
    num_workers=1,
    pin_memory=True,
)


In [37]:
from torch.utils.data import DataLoader

# Labeled validation data: collate_fn pads the variable-length SEP position lists.
# NOTE(review): shuffle=True on a validation loader is unusual — confirm it is intended.
val_dataloader = DataLoader(
    dataset=val_set, 
    batch_size=2, 
    shuffle=True, 
    num_workers=1,
    pin_memory=True,
    collate_fn=collate_fn
)

# Unlabeled test data: collate_fn_labelless returns the inputs only.
# NOTE(review): with shuffle=True the batch order no longer follows test_data — confirm
# this is acceptable wherever predictions are matched back to queries.
test_dataloader = DataLoader(
    dataset=test_set, 
    batch_size=2, 
    shuffle=True, 
    num_workers=1,
    pin_memory=True,
    collate_fn=collate_fn_labelless
)

In [45]:
# Inspect the first batch only. collate_fn returns the labels as a list of
# per-sample tensors; torch.stack turns that list into a single tensor.
for data in val_dataloader:
    print(data['S_num'])
    print(torch.stack(data['S_num']))
    break

[tensor(0), tensor(0)]
tensor([0, 0])


In [46]:
# Inspect the first batch only. No collate_fn was given to this loader, and the
# labels come back as one tensor.
for data in val_dataloader2:
    print(data['S_num'])
    break

tensor([0, 0])


In [None]:
# Example index used by the inspection cells that follow.
idx = 222

In [None]:
# Positions of all [SEP] tokens in the encoded example.
train_set[idx]['SEP_position_list']

tensor([34, 37, 44, 50, 58, 65, 77, 88])

In [None]:
token_ids = train_set[idx]['token_ids']
# Compare against the dataset's own [SEP] id instead of a hard-coded vocabulary id,
# so the check stays valid for whichever tokenizer the dataset was built with.
res_list = [i for i, value in enumerate(token_ids) if value == train_set.SEP_id]
# Skip the first [SEP] position and show the remaining ones.
res_list[1:]

[37, 44, 50, 58, 65, 77, 88]

In [None]:
# Display the table that belongs to the example selected by idx.
train_data[idx].table

Unnamed: 0,时间,锂电池需求量,锂电池YoY,三元电池需求量,三元电池YoY,磷酸铁锂&钴酸锂需求量,磷酸铁锂&钴酸锂YoY
0,2016A,28.3,--,7.5,--,20.8,--
1,2017A,36.2,27.8%,15.8,111.9%,20.4,-2.3%
2,2018E,48.3,33.6%,27.5,74.2%,20.8,2.1%
3,2019E,73.8,52.7%,52.2,89.5%,21.6,4.0%
4,2020E,110.0,49.1%,88.6,69.9%,21.4,-1.2%


In [None]:
# Display the SQL label of the example selected by idx.
train_data[idx].sql

In [None]:
# Toy inputs for the span-slicing cells below: SEP positions and one random value per position.
indices = torch.tensor([2,4,8])    # SEP list
params  = torch.randn(10)          # params
params

# Layout the toy example is meant to mimic (token, then its position):
#  XLS, a, SEP, b, SEP, c, c, c, SEP, pad, pad
#  0  , 1, 2  , 3, 4  , 5, 6, 7, 8  , 9  , 10

tensor([-0.6639,  1.0065,  1.7779, -1.3565, -0.0894,  1.0683,  1.1525,  0.6744,
        -0.3458,  0.1946])

In [None]:
# Question span: from position 1 up to and including the first SEP position.
question_positions = params[1:indices[0]+1]
question_positions

tensor([1.0065, 1.7779])

In [None]:
# One slice of params per pair of consecutive SEP positions: each slice starts right
# after a SEP position and runs up to and including the next one.
# zip() pairs every position with its successor, which also avoids overwriting the
# notebook-global `idx` with a loop variable.
result = [params[start + 1:end + 1] for start, end in zip(indices[:-1], indices[1:])]
result

[tensor([-1.3565, -0.0894]), tensor([ 1.0683,  1.1525,  0.6744, -0.3458])]

In [None]:
def get_question_column_positions(indice, params):
    """Split `params` into a question span and one span per column.

    Args:
        indice: sequence (tensor or list) of separator positions in ascending
            order. The first position closes the question span; each following
            position closes one column span.
        params: sliceable sequence (e.g. a 1-D tensor) to split.

    Returns:
        (question_positions, column_positions): `params[1:first + 1]`, and a
        list with `params[prev + 1:cur + 1]` for every pair of consecutive
        positions. Position 0 is never included, and every span includes the
        separator position that closes it.

    Raises:
        IndexError: if `indice` is empty.
    """
    # Use the argument itself (the previous version read a notebook-global
    # `indices`), converted to plain ints so slicing behaves the same for
    # tensors and lists.
    positions = [int(p) for p in indice]
    question_positions = params[1:positions[0] + 1]
    column_positions = [
        params[start + 1:end + 1]
        for start, end in zip(positions[:-1], positions[1:])
    ]
    return question_positions, column_positions

In [1]:
import torch

tensor([[2],
        [4],
        [5]])

In [3]:
batch_size = 8
max_seq_len = 9
hidden_size = 6
# Test tensor whose values encode their own index: x[i, j, k] = i + 10*j + 100*k.
# The three index ranges are broadcast against each other and copied into x in one
# assignment instead of filling every element from nested Python loops.
x = torch.empty(batch_size, max_seq_len, hidden_size)
x[:] = (
    torch.arange(batch_size).view(-1, 1, 1)
    + 10 * torch.arange(max_seq_len).view(1, -1, 1)
    + 100 * torch.arange(hidden_size).view(1, 1, -1)
)

In [3]:
import numpy as np
max_len = 20
idx = 5
# Mask of length max_len: 1.0 for positions 0..idx, 0.0 for everything after.
ones = np.ones(idx + 1)
zeros = np.zeros(max_len - ones.size)
question_masks = np.concatenate((ones, zeros))
question_masks

array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [4]:
import torch 
max_len = 20
idx = 5
# Same mask as a torch tensor: 1.0 for positions 0..idx, 0.0 for everything after.
ones = torch.ones(idx + 1)
zeros = torch.zeros(max_len - ones.numel())
question_masks = torch.cat((ones, zeros))
question_masks

tensor([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])