# Setup

## github and weights

In [None]:
%cd /content/
!rm -r /content/FinTabParse
!git clone https://github.com/MeMihir/FinTabParse.git
!cd FinTabParse/ && git checkout trans2-test

In [None]:
# Download the transformers 2.6.0 source tarball into the current working directory.
# NOTE(review): no cell shown in this notebook extracts this archive, and the next cell
# installs from FinTabParse/utils/transformers-2.6.0/ instead — confirm this download is still needed.
!wget https://files.pythonhosted.org/packages/e9/45/6c28dccc896c5e4f2b219054b6c661228861699698c4aa8737efbc928915/transformers-2.6.0.tar.gz

In [None]:
# Editable (`-e`) install of the transformers-2.6.0 source tree located under FinTabParse/utils/.
!cd FinTabParse/utils/ && pip install -e ./transformers-2.6.0/

In [None]:
! mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json
!kaggle kernels output mihirpavuskar/tat-qa-train -p /content/FinTabParse/models/

In [None]:
# Download models.zip from the HybridQA bucket and unpack it inside models/HybridR
# (presumably the stage1/stage2/stage3 checkpoints referenced by the HybridR cells below — confirm).
# The two commented-out commands are an alternative that clones the upstream HybridQA repo
# and unpacks the same archive there.
# !cd /content/FinTabParse/models/ && git clone https://github.com/wenhuchen/HybridQA.git
# !cd /content/FinTabParse/models/HybridQA && wget https://hybridqa.s3-us-west-2.amazonaws.com/models.zip && unzip models.zip
!cd /content/FinTabParse/models/HybridR && wget https://hybridqa.s3-us-west-2.amazonaws.com/models.zip && unzip models.zip

In [None]:
# Copy the question-classifier weights from Drive to weights/questions_classifier,
# the path later passed as `model_path` to question_classifier_model().
! cp /content/drive/MyDrive/Capstone/Table_Parsing_QA/models/model_f-1_e-3.pt /content/FinTabParse/weights/questions_classifier

## installs

In [None]:
# Extra Python packages installed on top of the runtime; versions are not pinned here.
!pip install tensorboardX
!pip install tqdm
!pip install fuzzywuzzy
!pip install dateparser

In [None]:
# Install the repo's own requirements (the file is named requirement.txt, singular).
!cd ./FinTabParse/ && pip install -r requirement.txt
# Prebuilt torch_scatter 2.0.7 wheel; per its URL it targets torch 1.7.0+cu102 and CPython 3.7 (cp37) on linux x86_64.
!pip install https://data.pyg.org/whl/torch-1.7.0%2Bcu102/torch_scatter-2.0.7-cp37-cp37m-linux_x86_64.whl

  Attempting uninstall: sentencepiece
    Found existing installation: sentencepiece 0.1.96
    Uninstalling sentencepiece-0.1.96:
      Successfully uninstalled sentencepiece-0.1.96
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
  Attempting uninstall: pandas
    Found existing installation: pandas 1.3.5
    Uninstalling pandas-1.3.5:
      Successfully uninstalled pandas-1.3.5
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
  Attempting uninstall: h5py
    Found existing installation: h5py 3.1.0
    Uninstalling h5py-3.1.0:
      Successfully uninstalled h5py-3.1.0
  Attempting uninstall: gast
    Found existing installation: gast 0.5.3
    Uninstalling gast-0.5.3:
      Successfully uninstalled gast-0.5.3
  Attempting uninstall: filelock
    Found existing insta

In [None]:
import nltk
# Fetch the three NLTK resources used by this notebook. The final download is kept as the
# cell's last expression so its return value is still displayed.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Code

In [None]:
# Work from the repo root: the project imports (utils, models, preprocessing) and the
# relative paths used in later cells (e.g. 'models/HybridR/...', 'outputs/') depend on this.
%cd /content/FinTabParse

/content/FinTabParse


## main.py

In [None]:
from transformers import  BertTokenizer
import torch

from utils.file_handling import file_to_list, json_to_dict, dict_to_json
from models.questions_classifier import BertClassifier
from preprocessing import tagop_preprocessing, hybridr_preprocessing

def get_inputs(questions_path, paragraphs_path, table_path):
	"""Load the three pipeline inputs from disk.

	Args:
		questions_path: file read with ``file_to_list``.
		paragraphs_path: file read with ``file_to_list``.
		table_path: file read with ``json_to_dict``.

	Returns:
		A ``(questions, paragraphs, table)`` tuple.
	"""
	loaded_questions = file_to_list(questions_path)
	loaded_paragraphs = file_to_list(paragraphs_path)
	return loaded_questions, loaded_paragraphs, json_to_dict(table_path)


def question_classifier_model(questions, model_path):
	"""Predict a class index for every question with the saved BertClassifier weights.

	Args:
		questions: iterable of question strings.
		model_path: path to a ``state_dict`` loadable into ``BertClassifier`` (loaded on CPU).

	Returns:
		list[int]: one predicted class index per question, in input order.
	"""
	tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

	question_classifier = BertClassifier()
	question_classifier.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
	# Inference only: evaluation mode disables training-time behaviour (e.g. any dropout
	# layers the classifier contains), so repeated runs give the same predictions.
	question_classifier.eval()

	question_preds = []
	# No gradients are needed for prediction; skipping autograd saves memory and time.
	with torch.no_grad():
		for question in questions:
			encoded = tokenizer(question, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
			question_id = encoded['input_ids']
			question_mask = encoded['attention_mask'].squeeze(1)
			logits = question_classifier(question_id, question_mask)
			# .item() yields a plain Python int (JSON-serialisable) instead of a numpy scalar.
			question_preds.append(logits.argmax(dim=1)[0].item())

	return question_preds

## run code

In [None]:
# Stage the test3 fixture as the pipeline inputs; the next cell reads paragraphs.txt,
# questions.txt and table.json from the inputs/ directory.
!cp /content/FinTabParse/tests/test3/* /content/FinTabParse/inputs

### input and classification

In [None]:
import os

# Resolve the three input files inside the inputs directory, then load them.
inputs_path = '/content/FinTabParse/inputs/'
paragraphs_path, questions_path, table_path = (
    os.path.join(inputs_path, file_name)
    for file_name in ('paragraphs.txt', 'questions.txt', 'table.json')
)

questions, paragraphs, table = get_inputs(questions_path, paragraphs_path, table_path)

In [None]:
# Classify every question; the weights path is the file copied from Drive in the setup section.
question_preds = question_classifier_model(questions, '/content/FinTabParse/weights/questions_classifier')

In [None]:
# Label routing used by the cells below: 0 -> HybridR pipeline, 1 -> TagOp pipeline.
question_preds

[1, 1, 0, 0, 1, 0, 1, 1, 0]

### hybridR

In [None]:
# Questions labelled 0 by the classifier go through the HybridR pipeline.
hybridR_questions = [question for label, question in zip(question_preds, questions) if label == 0]
hybridr_paragraphs, hybridr_ques, hybridr_table = hybridr_preprocessing(table, paragraphs, hybridR_questions)

# Questions file consumed by the HybridR preprocessing step.
os.makedirs('models/HybridR/test_inputs/', exist_ok=True)
dict_to_json(hybridr_ques, 'models/HybridR/test_inputs/test.json')

# Paragraph payload.
os.makedirs('inputs/request_tok', exist_ok=True)
dict_to_json(hybridr_paragraphs, 'inputs/request_tok/table_0.json')

# Table payload.
os.makedirs('inputs/tables_tok', exist_ok=True)
dict_to_json(hybridr_table, 'inputs/tables_tok/table_0.json')

# The intermediates are only needed for the dumps above; drop them from the kernel namespace.
del hybridr_ques, hybridr_paragraphs, hybridr_table

In [None]:
# Run HybridR's own preprocessing entry point. The recorded output shows it printing the
# questions annotated with POS tags.
# NOTE(review): presumably this writes preprocessed_test/test_inputs.json, which the
# train_stage12.py cell below reads — confirm.
from models.HybridR.preprocessing import preprocessing_main
preprocessing_main()



[{'question_id': 'que_0', 'question': 'Which student got the lowest marks in FAT?', 'table_id': 'table_0', 'question_postag': 'JJ NN VBD DT JJS NNS IN NNP .'}, {'question_id': 'que_1', 'question': 'Which student has the highest total marks?', 'table_id': 'table_0', 'question_postag': 'JJ NN VBZ DT JJS JJ NNS .'}, {'question_id': 'que_2', 'question': 'How many students got higher than 50 in FAT?', 'table_id': 'table_0', 'question_postag': 'WRB JJ NNS VBD JJR IN CD IN NNP .'}, {'question_id': 'que_3', 'question': 'Which student has the highest marks in Quiz1 and Quiz2 combined?', 'table_id': 'table_0', 'question_postag': 'JJ NN VBZ DT JJS NNS IN NNP CC NNP VBD .'}]


In [None]:
!rm -r /content/FinTabParse/models/HybridR/WikiTables-WithLinks/
!mkdir /content/FinTabParse/models/HybridR/WikiTables-WithLinks/
!cp -r /content/FinTabParse/inputs/request_tok /content/FinTabParse/models/HybridR/WikiTables-WithLinks/
!cp -r /content/FinTabParse/inputs/tables_tok /content/FinTabParse/models/HybridR/WikiTables-WithLinks/
!cp -r /content/FinTabParse/inputs/tables_tmp /content/FinTabParse/models/HybridR/WikiTables-WithLinks/

In [None]:
# HybridR stages 1+2 in evaluation mode (--do_eval, --option stage12) on GPU 0, using the
# stage1/stage2 checkpoints and preprocessed_test/test_inputs.json as the prediction file.
!cd /content/FinTabParse/models/HybridR && CUDA_VISIBLE_DEVICES=0 python train_stage12.py --stage1_model stage1/2020_10_03_22_47_34/checkpoint-epoch2 --stage2_model stage2/2020_10_03_22_50_31/checkpoint-epoch2/ --do_lower_case --predict_file preprocessed_test/test_inputs.json --do_eval --option stage12 --model_name_or_path  bert-large-uncased

05/03/2022 06:59:16 - INFO - transformers2.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json from cache at /tmp/6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.788fed32bb8481a9b15ce726d41c53d5d5066b04c667e34ce3a7a3826d1573d8
05/03/2022 06:59:16 - INFO - transformers2.configuration_utils -   Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty"

In [None]:
# HybridR stage 3 on GPU 0: runs the stage3 checkpoint over predictions.intermediate.json.
# A later cell moves predictions.json out of this directory, so this step is presumably what writes it — confirm.
!cd /content/FinTabParse/models/HybridR && CUDA_VISIBLE_DEVICES=0 python train_stage3.py --model_name_or_path stage3/2020_10_03_22_51_12/checkpoint-epoch3/ --do_stage3   --do_lower_case  --predict_file predictions.intermediate.json --per_gpu_train_batch_size 12  --max_seq_length 384   --doc_stride 128 --threads 8

05/03/2022 07:00:38 - INFO - transformers2.configuration_utils -   loading configuration file stage3/2020_10_03_22_51_12/checkpoint-epoch3/config.json
05/03/2022 07:00:38 - INFO - transformers2.configuration_utils -   Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 16,
  "n

### tagOP

In [None]:
# Questions labelled 1 by the classifier go through the TagOp (TAT-QA) pipeline.
tagop_questions = [question for label, question in zip(question_preds, questions) if label == 1]
tag_op_input = tagop_preprocessing(table, paragraphs, tagop_questions)
# The dev-set file holds a list of entries; here it contains the single entry built above.
dict_to_json(
    [tag_op_input],
    '/content/FinTabParse/models/TAT-QA/dataset_tagop/tatqa_dataset_dev.json',
)

In [None]:
# Build the cached TagOp dev dataset from dataset_tagop/tatqa_dataset_dev.json
# (the recorded output shows it saved to tag_op/cache/tagop_roberta_cached_dev.pkl).
!cd /content/FinTabParse/models/TAT-QA && PYTHONPATH=$PYTHONPATH:$(pwd):$(pwd)/tag_op python tag_op/prepare_dataset.py --mode dev

==== NOTE ====: encoder:roberta, mode:dev
Reading file at %s ./dataset_tagop/tatqa_dataset_dev.json
Reading the tatqa dataset
  0% 0/1 [00:00<?, ?it/s]100% 1/1 [00:00<00:00,  8.95it/s]
{'Span-in-text': 0, 'Cell-in-table': 0, 'Spans': 0, 'Sum': 0, 'Count': 0, 'Average': 0, 'Multiplication': 0, 'Division': 0, 'Difference': 0, 'Change ratio': 0}
{'': 5, 'thousand': 0, 'million': 0, 'billion': 0, 'percent': 0}
0
Save data to ./tag_op/cache/tagop_roberta_cached_dev.pkl.


In [None]:
# Run the TagOp predictor (roberta encoder) over the cached dev data; results are saved under tag_op/.
# The EM/F1 figures it prints are not meaningful for this run — NOTE(review): presumably
# because the generated inputs carry no gold answers; confirm.
!cd /content/FinTabParse/models/TAT-QA && PYTHONPATH=$PYTHONPATH:$(pwd) python tag_op/predictor.py --data_dir tag_op/cache/ --test_data_dir tag_op/cache/ --save_dir tag_op/ --eval_batch_size 32 --model_path ./checkpoint --encoder roberta

Namespace(ablation_mode=0, bert_learning_rate=None, bert_weight_decay=None, cuda=True, data_dir='tag_op/cache/', encoder='roberta', eval_batch_size=32, gpu_num=1, log_file='train.log', mode=1, model_path='./checkpoint', op_mode=0, roberta_model='dataset_tagop/roberta.large', save_dir='tag_op/', test_data_dir='tag_op/cache/')
tag_op/cache/tagop_roberta_cached_dev.pkl
Load data from tagop_roberta_cached_dev.pkl.
Load data size 5.
05/03/2022 07:01:59 Below are the result on Dev set...
100% 1/1 [00:01<00:00,  1.32s/it]
raw matrix:            em
answer_from   
answer_type   
             5

detail em:              em
answer_from     
answer_type     
             0.0

detail f1:              f1
answer_from     
answer_type     
             0.0

global em:0.0

global f1:0.0

global scale:0.6

global op:0.2



## Output

In [None]:
!rm -r ./outputs

In [None]:
# Recreate the outputs directory that the following cells move and write result files into.
os.makedirs('outputs', exist_ok=True)
import shutil

In [None]:
# Read the TagOp predictions first, then relocate the raw file into outputs/.
tagop_pred = json_to_dict('models/TAT-QA/tag_op/pred_result_on_dev.json')
shutil.move('models/TAT-QA/tag_op/pred_result_on_dev.json', 'outputs/tag_op_pred.json')

# One record per TagOp question; the answer joins the first two prediction fields with a space.
tagop_answers = [
    {
        'id': que['uid'],
        'question': que['question'],
        'answer': '{} {}'.format(tagop_pred[que['uid']][0], tagop_pred[que['uid']][1]),
    }
    for que in tag_op_input['questions']
]
dict_to_json(tagop_answers, 'outputs/tagop_answers.json')

In [None]:
import shutil
# Moving into the 'outputs/' directory keeps the original file name (predictions.intermediate.json).
shutil.move('models/HybridR/predictions.intermediate.json', 'outputs/')
# The final predictions are renamed on the way, then loaded from their new location.
shutil.move('models/HybridR/predictions.json', 'outputs/hybridR_predictions.json')
hybridR_pred = json_to_dict('outputs/hybridR_predictions.json')

In [None]:
# Reduce each HybridR prediction to question / id / answer and save the list.
hybridR_answers = [
    {
        "question": entry["question"],
        'id': entry['question_id'],
        'answer': entry['pred'],
    }
    for entry in hybridR_pred
]
dict_to_json(hybridR_answers, 'outputs/hybridR_answers.json')

In [None]:
# Bundle everything under ./outputs/ into outputs.zip.
!zip -r outputs.zip ./outputs/

  adding: outputs/ (stored 0%)
  adding: outputs/hybridR_answers.json (stored 0%)
  adding: outputs/predictions.intermediate.json (stored 0%)
  adding: outputs/hybridR_predictions.json (stored 0%)
  adding: outputs/tagop_answers.json (deflated 54%)
  adding: outputs/tag_op_pred.json (deflated 38%)


# TEST

In [None]:
from transformers import  BertTokenizer
import torch

from utils.file_handling import file_to_list, json_to_dict
from models.questions_classifier import BertClassifier
from preprocessing import tagop_preprocessing

def get_inputs(questions_path, paragraphs_path, table_path):
	"""Load the pipeline inputs, wrapping each question in an id-tagged dict.

	Args:
		questions_path: file read with ``file_to_list``; one entry per question.
		paragraphs_path: file read with ``file_to_list``.
		table_path: file read with ``json_to_dict``.

	Returns:
		A ``(questions, paragraphs, table)`` tuple where ``questions`` is a list of
		``{"id": "que_<index>", "question": <text>}`` dicts in file order.
	"""
	raw_questions = file_to_list(questions_path)
	paragraphs = file_to_list(paragraphs_path)
	table = json_to_dict(table_path)

	questions = [
		{"id": f"que_{index}", "question": text}
		for index, text in enumerate(raw_questions)
	]
	return questions, paragraphs, table


def question_classifier_model(questions, model_path):
	"""Annotate each question dict with the class predicted by the saved BertClassifier.

	Note: this redefines the earlier notebook function of the same name, which takes
	plain question strings and returns a list of labels instead.

	Args:
		questions: list of dicts, each with a ``"question"`` string. The dicts are
			modified IN PLACE (a ``"category"`` key is added), so a shallow copy of
			the list passed by the caller still shares the updated dicts.
		model_path: path to a ``state_dict`` loadable into ``BertClassifier`` (loaded on CPU).

	Returns:
		The same ``questions`` list, each dict now carrying an int ``"category"``.
	"""
	tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

	question_classifier = BertClassifier()
	question_classifier.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
	# Inference only: evaluation mode disables training-time behaviour (e.g. any dropout
	# layers the classifier contains), so repeated runs give the same predictions.
	question_classifier.eval()

	preds = []
	# No gradients are needed for prediction; skipping autograd saves memory and time.
	with torch.no_grad():
		for entry in questions:
			encoded = tokenizer(entry["question"], padding='max_length', max_length=512, truncation=True, return_tensors="pt")
			question_id = encoded['input_ids']
			question_mask = encoded['attention_mask'].squeeze(1)
			logits = question_classifier(question_id, question_mask)
			# .item() yields a plain Python int (JSON-serialisable) instead of a numpy scalar.
			preds.append(logits.argmax(dim=1)[0].item())

	# Attach labels only after every question has been classified successfully.
	for entry, pred in zip(questions, preds):
		entry["category"] = pred

	return questions

In [None]:
# Stage the test3 fixture as the pipeline inputs (same fixture as in the "run code" section).
!cp /content/FinTabParse/tests/test3/* /content/FinTabParse/inputs


In [None]:
import os

# Resolve the three input files inside the inputs directory, then load them.
inputs_path = '/content/FinTabParse/inputs/'
paragraphs_path, questions_path, table_path = (
    os.path.join(inputs_path, file_name)
    for file_name in ('paragraphs.txt', 'questions.txt', 'table.json')
)

questions, paragraphs, table = get_inputs(questions_path, paragraphs_path, table_path)

In [None]:
# NOTE(review): `questions.copy()` is a shallow copy, so the dicts are shared with `questions`;
# the classifier's added 'category' key therefore also appears in `questions`
# (see the recorded output of the next cell). Use a deep copy if `questions` must stay untouched.
question_preds = question_classifier_model(questions.copy(), '/content/FinTabParse/weights/questions_classifier')

In [None]:
questions

[{'category': 1,
  'id': 'que_0',
  'question': 'what is the average payment volume per transaction for american express?'},
 {'category': 1,
  'id': 'que_1',
  'question': 'what is the average payment volume per transaction for JCB?'},
 {'category': 1,
  'id': 'que_2',
  'question': 'what is the difference between total volume and payment volume for american express?'},
 {'category': 1,
  'id': 'que_3',
  'question': 'what is the difference between the total volume for JCB and total volume for mastercard?'},
 {'category': 1,
  'id': 'que_4',
  'question': 'what is exclueded from the mastercard figures?'},
 {'category': 1,
  'id': 'que_5',
  'question': 'which companies are the largest operators of open-loop and closed-loop retail electronic payments networks?'}]