# Z1

Desplegar el modelo TAPAS en su versión ya entrenada en WikiTableQuestions y dejarlo preparado para hacer inferencias a mano (una función que reciba una pregunta en string y una tabla y que devuelva la respuesta).


In [1]:
from transformers import TapasTokenizer, TapasForQuestionAnswering
import pandas as pd

In [2]:
# Instantiate the TAPAS model and the TAPAS tokenizer
model_name = "google/tapas-base-finetuned-wtq"
model = TapasForQuestionAnswering.from_pretrained(model_name)
tokenizer = TapasTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/490 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/262k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

In [3]:
# given a table and a question, the function returns TAPAS model´s answer
def get_answer(table, question):

  inputs = tokenizer(table=table, queries=question, padding="max_length", return_tensors="pt")

  outputs = model(**inputs)
  predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
      inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()
  )

  id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
  aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

  answers = []
  for coordinates in predicted_answer_coordinates:
      if len(coordinates) == 1:
          # only a single cell:
          answers.append(table.iat[coordinates[0]])
      else:
          # multiple cells
          cell_values = []
          for coordinate in coordinates:
              cell_values.append(table.iat[coordinate])
          answers.append(", ".join(cell_values))

  display(table)
  print("")
  for query, answer, predicted_agg in zip(question, answers, aggregation_predictions_string):
      print(query)
      if predicted_agg == "NONE":
          print("Predicted answer: " + answer)
      else:
          print("Predicted answer: " + predicted_agg + " > " + answer)

In [4]:
data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
table = pd.DataFrame.from_dict(data)
question = ["how many movies have the 3 actors done in total?"]
get_answer(table, question)

Unnamed: 0,Actors,Number of movies
0,Brad Pitt,87
1,Leonardo Di Caprio,53
2,George Clooney,69



how many movies have the 3 actors done in total?
Predicted answer: SUM > 87, 53, 69


# Z2

Utilizar el modelo desplegado en Z1  y evaluar sus respuestas sobre el conjunto de datos de Test de WikiTableQuestions. Usar accuracy como métrica de evaluación y exponer los resultados en una tabla junto con los reportados en la publicación original del modelo. Se valorará la eficiencia de la implementación (uso de dataloaders, inferencia en batches etc…)

In [5]:
pip install datasets

Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [6]:
from transformers import TapasTokenizer, TapasForQuestionAnswering, DefaultDataCollator, TapasConfig
import pandas as pd
from datasets import load_dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on {}".format(device))
# Load the test dataset
WTQ_test = load_dataset("wikitablequestions","random-split-1", split="test")

Running on cuda


Downloading data:   0%|          | 0.00/12.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.04M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11321 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4344 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2831 [00:00<?, ? examples/s]

In [8]:
# Returns the table as a pandas dataframe
def table_to_df(table):
    return pd.DataFrame(table["rows"], columns=table["header"])

In [9]:
# Definition of the TAPAS classifier model y and the forward pass
class TAPASClassifier(nn.Module):
    def __init__(self, params):
        super().__init__()

        self.model_name = params["model_name"]
        self.tokenizer = TapasTokenizer.from_pretrained(self.model_name)
        self.tapas = TapasForQuestionAnswering.from_pretrained(self.model_name)

    def forward(self, batch_x):

        tapas_outputs = self.tapas(input_ids=batch_x["inputs"]["input_ids"].to(device),
                                   attention_mask=batch_x["inputs"]["attention_mask"].to(device),
                                   token_type_ids=batch_x["inputs"]["token_type_ids"].to(device))


        predicted_answer_coordinates, predicted_aggregation_indices = self.tokenizer.convert_logits_to_predictions(
        batch_x["inputs"],
        tapas_outputs.logits.detach().cpu(),
        tapas_outputs.logits_aggregation.detach().cpu(),
        )

        return predicted_answer_coordinates, predicted_aggregation_indices

In [10]:
# Definition of the TAPAS collator in order to build the batches that will be extracted from the dataloader and fed to the model
class TapasCollator(DefaultDataCollator):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        new_batch = {"ids": [], "questions": [], "tables": [], "answers": [], "inputs": []}
        for instance in args[0]:
            new_batch["ids"].append(instance["id"])
            new_batch["questions"].append(instance["question"])
            new_batch["tables"].append(instance["table"])
            new_batch["answers"].append(instance["answers"])

            # For some instances, the token_type_ids are larger than the max_vocab_size of the tokenizer so it is necessary to replace them with the max_vocab_size - 1 for those examples
            max_vocab_size = 256
            instance_inputs = self.build_input(instance["table"], instance["question"])
            instance_inputs["token_type_ids"] = torch.where( instance_inputs["token_type_ids"] >= max_vocab_size, max_vocab_size - 1,  instance_inputs["token_type_ids"])

            new_batch["inputs"].append(instance_inputs)

        # merge input tensors
        new_batch["inputs"] = {
            key: torch.cat([item[key] for item in new_batch["inputs"]], dim=0)
            for key in new_batch["inputs"][0]
        }

        return new_batch

    def build_input(self, item_table, question):
        table = table_to_df(item_table)
        return self.tokenizer(table=table, queries=[question], padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)


In [11]:
# Function that checks if all the values in a list are numeric
# It is used to check if the values of a cell are numeric in order to compute the sum or the average
def are_all_numeric(values):
    try:
        float_values = [float(value.replace(",", "")) for value in values]
        return all(isinstance(val, (int, float)) for val in float_values)
    except ValueError:
        return False

# Function that obtains the answer from the predicted coordinates and the predicted aggregation
# It somewhat takes into account the different formats of the numeric values inside the cells
def obtain_model_answer(coordinates, predicted_agg, table):

    cell_values = [table.iat[coordinate] for coordinate in coordinates]

    if predicted_agg == "NONE":
        tapas_answer = ", ".join(cell_values)
    elif predicted_agg in ["SUM", "AVERAGE"]:
        if are_all_numeric(cell_values) and len(cell_values) > 1:
            cell_values_float = [float(value.replace(",", "")) for value in cell_values]
            total = sum(cell_values_float)
            if predicted_agg == "SUM":
                tapas_answer = "{:,.0f}".format(total)
            else:  # AVERAGE
                average = total / len(cell_values_float)
                tapas_answer = "{:,.2f}".format(average)
        else:
            tapas_answer = ", ".join(cell_values)
    elif predicted_agg == "COUNT":
        tapas_answer = str(len(cell_values))

    return tapas_answer

# Function that obtains the real answer
def obtain_real_answer(real_answers):
    if len(real_answers) == 1:
        real_answer = real_answers[0]
    else:
        real_answer = ", ".join(real_answers)

    return real_answer


In [12]:
# Evaluation function that computes the accuracy of the model
def evaluate(model, dataloader):
    model.eval()
    total = 0
    correct = 0
    num_batch = 0
    with torch.no_grad():
      for test_batch in dataloader:
        test_batch["inputs"]["input_ids"] = test_batch["inputs"]["input_ids"].cpu()
        test_batch["inputs"]["attention_mask"] = test_batch["inputs"]["attention_mask"].cpu()
        test_batch["inputs"]["token_type_ids"] = test_batch["inputs"]["token_type_ids"].cpu()

        # Forward pass the whole batch
        predicted_answer_coordinates, predicted_aggregation_indices = model.forward(test_batch)

        for i in range(len(predicted_answer_coordinates)):

            id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
            predicted_agg = id2aggregation[predicted_aggregation_indices[i]]

            coordinates = predicted_answer_coordinates[i]
            table = table_to_df(test_batch["tables"][i])

            tapas_answer = obtain_model_answer(coordinates, predicted_agg, table)
            real_answer = obtain_real_answer(test_batch["answers"][i])

            # Checks the exact match between the predicted answer and the real answer
            if tapas_answer == real_answer:
                correct += 1
            total += 1
            """
            print("-------------------------")
            print("TAPAS answer: ", tapas_answer)
            print("Real answer: ", real_answer)
            print("predicted aggregation: ", predicted_agg)
            print("Correct: ", tapas_answer == real_answer)
            print("-------------------------")
            """
        """
        num_batch += 1
        print("###########################")
        print(f"batch: {num_batch}")
        print(f"correct: {correct}")
        print(f"total: {total}")
        print("accuracy: ", correct/total)
        print("##########################")
        """

    return correct/total

In [13]:
# Instantiate model
tapas = TAPASClassifier({"model_name": "google/tapas-base-finetuned-wtq"})
tapas.to(device)

# Instantiate collator
tapas_collator = TapasCollator(tapas.tokenizer)

# Create DataLoader with collator
test_dataloader = DataLoader(WTQ_test, batch_size=128, shuffle=False, collate_fn=tapas_collator)

In [14]:
accuracy = evaluate(tapas, test_dataloader)
print("Accuracy: ",accuracy)

Accuracy:  0.4631675874769797


|Model|Test Acc|
|--|--|
|TAPAS(trained on WTQ) Herzig et al.|48.8|
|TAPAS(trained on WTQ) Implementado|46.3|

# Script for Z3

A conversion script used to fill in the answer_coordinates, answer_text and float_answer fields

In [15]:
!pip install frozendict



In [16]:
# coding=utf-8
# Copyright 2019 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""This module implements a simple parser that can be used for TAPAS.

Given a table, a question and one or more answer_texts, it will parse the texts
to populate other fields (e.g. answer_coordinates, float_value) that are required
by TAPAS.

Please note that exceptions in this module are concise and not parameterized,
since they are used as counter names in a BEAM pipeline.
"""

import enum
from typing import Callable, List, Text, Optional

import six
import struct
import unicodedata
import re

import frozendict
import numpy as np
import scipy.optimize


class SupervisionMode(enum.Enum):
  # Don't filter out any supervised information.
  NONE = 0
  # Remove all the supervised signals and recompute them by parsing answer
  # texts.
  REMOVE_ALL = 2
  # Same as above but discard ambiguous examples
  # (where an answer matches multiple cells).
  REMOVE_ALL_STRICT = 3


def _find_matching_coordinates(table, answer_text,
                               normalize):
  normalized_text = normalize(answer_text)
  for row_index, row in table.iterrows():
    for column_index, cell in enumerate(row):
      if normalized_text == normalize(str(cell)):
        yield (row_index, column_index)


def _split_thousands(delimiter, value):
  split = value.split(delimiter)
  return len(split) > 1 and any(map(lambda x: len(x) == 3, split))

def _compute_cost_matrix_inner(
    table,
    answer_texts,
    normalize,
    discard_ambiguous_examples,
):
  """Returns a cost matrix M where the value M[i,j] contains a matching cost from answer i to cell j.

  The matrix is a binary matrix and -1 is used to indicate a possible match from
  a given answer_texts to a specific cell table. The cost matrix can then be
  usedto compute the optimal assignments that minimizes the cost using the
  hungarian algorithm (see scipy.optimize.linear_sum_assignment).

  Args:
    table: a Pandas dataframe.
    answer_texts: a list of strings.
    normalize: a function that normalizes a string.
    discard_ambiguous_examples: If true discard if answer has multiple matches.

  Raises:
    ValueError if:
      - we cannot correctly construct the cost matrix or the text-cell
      assignment is ambiguous.
      - we cannot find a matching cell for a given answer_text.

  Returns:
    A numpy matrix with shape (num_answer_texts, num_rows * num_columns).
  """
  max_candidates = 0
  n_rows, n_columns = table.shape[0], table.shape[1]
  num_cells = n_rows * n_columns
  num_candidates = np.zeros((n_rows, n_columns))
  cost_matrix = np.zeros((len(answer_texts), num_cells))

  for index, answer_text in enumerate(answer_texts):
    found = 0
    for row, column in _find_matching_coordinates(table, answer_text,
                                                  normalize):
      found += 1
      cost_matrix[index, (row * len(table.columns)) + column] = -1
      num_candidates[row, column] += 1
      max_candidates = max(max_candidates, num_candidates[row, column])
    if found == 0:
      return None
    if discard_ambiguous_examples and found > 1:
      raise ValueError("Found multiple cells for answers")

  # TODO(piccinno): Shall we allow ambiguous assignments?
  if max_candidates > 1:
    raise ValueError("Assignment is ambiguous")

  return cost_matrix


def _compute_cost_matrix(
    table,
    answer_texts,
    discard_ambiguous_examples,
):
  """Computes cost matrix."""
  for index, normalize_fn in enumerate(STRING_NORMALIZATIONS):
    try:
      result = _compute_cost_matrix_inner(
          table,
          answer_texts,
          normalize_fn,
          discard_ambiguous_examples,
      )
      if result is None:
        continue
      return result
    except ValueError:
      if index == len(STRING_NORMALIZATIONS) - 1:
        raise
  return None


def _parse_answer_coordinates(table,
                              answer_texts,
                              discard_ambiguous_examples):
  """Populates answer_coordinates using answer_texts.

  Args:
    table: a Table message, needed to compute the answer coordinates.
    answer_texts: a list of strings
    discard_ambiguous_examples: If true discard if answer has multiple matches.

  Raises:
    ValueError if the conversion fails.
  """

  cost_matrix = _compute_cost_matrix(
      table,
      answer_texts,
      discard_ambiguous_examples,
  )
  if cost_matrix is None:
    return
  row_indices, column_indices = scipy.optimize.linear_sum_assignment(
      cost_matrix)

  # create answer coordinates as list of tuples
  answer_coordinates = []
  for row_index in row_indices:
    flatten_position = column_indices[row_index]
    row_coordinate = flatten_position // len(table.columns)
    column_coordinate = flatten_position % len(table.columns)
    answer_coordinates.append((row_coordinate, column_coordinate))

  return [answer_coordinates]


### START OF UTILITIES FROM TEXT_UTILS.PY ###

def wtq_normalize(x):
  """Returns the normalized version of x.
  This normalization function is taken from WikiTableQuestions github, hence the
  wtq prefix. For more information, see
  https://github.com/ppasupat/WikiTableQuestions/blob/master/evaluator.py
  Args:
    x: the object (integer type or string) to normalize.
  Returns:
    A normalized string.
  """
  x = x if isinstance(x, six.text_type) else six.text_type(x)
  # Remove diacritics.
  x = "".join(
      c for c in unicodedata.normalize("NFKD", x)
      if unicodedata.category(c) != "Mn")
  # Normalize quotes and dashes.
  x = re.sub(u"[‘’´`]", "'", x)
  x = re.sub(u"[“”]", '"', x)
  x = re.sub(u"[‐‑‒–—−]", "-", x)
  x = re.sub(u"[‐]", "", x)
  while True:
    old_x = x
    # Remove citations.
    x = re.sub(u"((?<!^)\\[[^\\]]*\\]|\\[\\d+\\]|[•♦†‡*#+])*$", "",
               x.strip())
    # Remove details in parenthesis.
    x = re.sub(u"(?<!^)( \\([^)]*\\))*$", "", x.strip())
    # Remove outermost quotation mark.
    x = re.sub(u'^"([^"]*)"$', r"\1", x.strip())
    if x == old_x:
      break
  # Remove final '.'.
  if x and x[-1] == ".":
    x = x[:-1]
  # Collapse whitespaces and convert to lower case.
  x = re.sub(r"\s+", " ", x, flags=re.U).lower().strip()
  x = re.sub("<[^<]+?>", "", x)
  x = x.replace("\n", " ")
  return x


_TOKENIZER = re.compile(r"\w+|[^\w\s]+", re.UNICODE)


def tokenize_string(x):
  return list(_TOKENIZER.findall(x.lower()))


# List of string normalization functions to be applied in order. We go from
# simplest to more complex normalization procedures.
STRING_NORMALIZATIONS = (
    lambda x: x,
    lambda x: x.lower(),
    tokenize_string,
    wtq_normalize,
)


def to_float32(v):
  """If v is a float reduce precision to that of a 32 bit float."""
  if not isinstance(v, float):
    return v
  return struct.unpack("!f", struct.pack("!f", v))[0]


def convert_to_float(value):
  """Converts value to a float using a series of increasingly complex heuristics.
  Args:
    value: object that needs to be converted. Allowed types include
      float/int/strings.
  Returns:
    A float interpretation of value.
  Raises:
    ValueError if the float conversion of value fails.
  """
  if isinstance(value, float):
    return value
  if isinstance(value, int):
    return float(value)
  if not isinstance(value, six.string_types):
    raise ValueError("Argument value is not a string. Can't parse it as float")
  sanitized = value

  try:
    # Example: 1,000.7
    if "." in sanitized and "," in sanitized:
      return float(sanitized.replace(",", ""))
    # 1,000
    if "," in sanitized and _split_thousands(",", sanitized):
      return float(sanitized.replace(",", ""))
    # 5,5556
    if "," in sanitized and sanitized.count(",") == 1 and not _split_thousands(
        ",", sanitized):
      return float(sanitized.replace(",", "."))
    # 0.0.0.1
    if sanitized.count(".") > 1:
      return float(sanitized.replace(".", ""))
    # 0,0,0,1
    if sanitized.count(",") > 1:
      return float(sanitized.replace(",", ""))
    return float(sanitized)
  except ValueError:
    # Avoid adding the sanitized value in the error message.
    raise ValueError("Unable to convert value to float")

### END OF UTILITIES FROM TEXT_UTILS.PY ###

def _parse_answer_float(answer_texts, float_value):
  if len(answer_texts) > 1:
    raise ValueError("Cannot convert to multiple answers to single float")
  float_value = convert_to_float(answer_texts[0])
  float_value = float_value

  return answer_texts, float_value


def _has_single_float_answer_equal_to(question, answer_texts, target):
  """Returns true if the question has a single answer whose value equals to target."""
  if len(answer_texts) != 1:
    return False
  try:
    float_value = convert_to_float(answer_texts[0])
    # In general answer_float is derived by applying the same conver_to_float
    # function at interaction creation time, hence here we use exact match to
    # avoid any false positive.
    return to_float32(float_value) == to_float32(target)
  except ValueError:
    return False


def _parse_question(
    table,
    original_question,
    answer_texts,
    answer_coordinates,
    float_value,
    aggregation_function,
    clear_fields,
    discard_ambiguous_examples,
):
  """Parses question's answer_texts fields to possibly populate additional fields.

  Args:
    table: a Pandas dataframe, needed to compute the answer coordinates.
    original_question: a string.
    answer_texts: a list of strings, serving as the answer to the question.
    anser_coordinates:
    float_value: a float, serves as float value signal.
    aggregation_function:
    clear_fields: A list of strings indicating which fields need to be cleared
      and possibly repopulated.
    discard_ambiguous_examples: If true, discard ambiguous examples.

  Returns:
    A Question message with answer_coordinates or float_value field populated.

  Raises:
    ValueError if we cannot parse correctly the question message.
  """
  question = original_question

  # If we have a float value signal we just copy its string representation to
  # the answer text (if multiple answers texts are present OR the answer text
  # cannot be parsed to float OR the float value is different), after clearing
  # this field.
  if "float_value" in clear_fields and float_value is not None:
    if not _has_single_float_answer_equal_to(question, answer_texts, float_value):
      del answer_texts[:]
      float_value = float(float_value)
      if float_value.is_integer():
        number_str = str(int(float_value))
      else:
        number_str = str(float_value)
      answer_texts = []
      answer_texts.append(number_str)

  if not answer_texts:
    raise ValueError("No answer_texts provided")

  for field_name in clear_fields:
    if field_name == "answer_coordinates":
        answer_coordinates = None
    if field_name == "float_value":
        float_value = None
    if field_name == "aggregation_function":
        aggregation_function = None

  error_message = ""
  if not answer_coordinates:
    try:
      answer_coordinates = _parse_answer_coordinates(
          table,
          answer_texts,
          discard_ambiguous_examples,
      )
    except ValueError as exc:
      error_message += "[answer_coordinates: {}]".format(str(exc))
      if discard_ambiguous_examples:
        raise ValueError(f"Cannot parse answer: {error_message}")

  if not float_value:
    try:
      answer_texts, float_value = _parse_answer_float(answer_texts, float_value)
    except ValueError as exc:
      error_message += "[float_value: {}]".format(str(exc))

  # Raises an exception if we cannot set any of the two fields.
  """
  if not answer_coordinates and not float_value:
    raise ValueError("Cannot parse answer: {}".format(error_message))
  """

  return question, answer_texts, answer_coordinates, float_value, aggregation_function


# TODO(piccinno): Use some sort of introspection here to get the field names of
# the proto.
_CLEAR_FIELDS = frozendict.frozendict({
    SupervisionMode.REMOVE_ALL: [
        "answer_coordinates", "float_value", "aggregation_function"
    ],
    SupervisionMode.REMOVE_ALL_STRICT: [
        "answer_coordinates", "float_value", "aggregation_function"
    ]
})


def parse_question(table, question, answer_texts, answer_coordinates=None, float_value=None, aggregation_function=None,
                    mode=SupervisionMode.REMOVE_ALL):
    """Parses answer_text field of a question to populate additional fields required by TAPAS.

    Args:
        table: a Pandas dataframe, needed to compute the answer coordinates. Note that one should apply .astype(str)
        before supplying the table to this function.
        question: a string.
        answer_texts: a list of strings, containing one or more answer texts that serve as answer to the question.
        answer_coordinates: optional answer coordinates supervision signal, if you already have those.
        float_value: optional float supervision signal, if you already have this.
        aggregation_function: optional aggregation function supervised signal, if you already have this.
        mode: see SupervisionMode enum for more information.

    Returns:
        A list with the question, populated answer_coordinates or float_value.

    Raises:
        ValueError if we cannot parse correctly the question string.
    """
    if mode == SupervisionMode.NONE:
        return question, answer_texts

    clear_fields = _CLEAR_FIELDS.get(mode, None)
    if clear_fields is None:
        raise ValueError(f"Mode {mode.name} is not supported")

    return _parse_question(
        table,
        question,
        answer_texts,
        answer_coordinates,
        float_value,
        aggregation_function,
        clear_fields,
        discard_ambiguous_examples=mode == SupervisionMode.REMOVE_ALL_STRICT,
    )

# Z3

Escoger la versión vanilla (no entrenada con WikiTableQuestions) de uno de los modelos anteriores (se recomienda utilizar el mismo modelo que en Z1 y Z2 aunque no es obligatorio). Realizar un fine-tuning sobre el conjunto de datos de Train de WikiTableQuestions. Realizar la misma evaluación que en Z2 sobre este modelo y comparar resultados.

In [17]:
pip install datasets



In [18]:
pip install transformers



In [19]:
from transformers import TapasTokenizer, TapasForQuestionAnswering, DefaultDataCollator, TapasConfig, AdamW, get_linear_schedule_with_warmup
import pandas as pd
from datasets import load_dataset
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import time
import pdb

In [20]:
# Returns the table as a pandas dataframe
def table_to_df(table):
    return pd.DataFrame(table["rows"], columns=table["header"])

# Function to remove the instances of the trainig set whose answer_coordinates are not in the table
def remove_instance(instance):
    table = table_to_df(instance["table"])
    question = instance["question"]
    answer_text = instance["answers"]

    question, answer_texts, answer_coordinates, float_value, aggregation_function = parse_question(table=table, question=question, answer_texts=answer_text)
    if answer_coordinates is None:
        return False
    else:
        return True

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
# Load the training, validation and test sets
WTQ_train = load_dataset("wikitablequestions","random-split-1", split="train")
WTQ_dev = load_dataset("wikitablequestions","random-split-1", split="validation")
WTQ_test = load_dataset("wikitablequestions","random-split-1", split="test")

In [23]:
# Remove the instances whose answer_coordinates are not in the table
filtered_train=WTQ_train.filter(remove_instance)
filtered_train

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Filter:   0%|          | 0/11321 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'question', 'answers', 'table'],
    num_rows: 7683
})

In [24]:
# Returns the answer_coordinates of the first non-empty cell of the table
def new_tuple(table):
    k = 0
    j = 0
    while k < len(table) and j < len(table.columns):
        if table.iloc[k,j] != "":
            new_tuple = (k,j)
            return new_tuple
        else:
            j += 1
            if j == len(table.columns):
                j = 0
                k += 1
    return (0,0)

# If the rows of one of the answer coordinates is greater than max_rows, then it is replaced by the first non-empty cell
# This is done to avoid errors in the process of tokenization of large tables
# Max_rows is the greatest number of rows so the training process does not crash
def limit_answer_coordinates(table, answer_coordinates, max_rows):

    coordinates = answer_coordinates[0]
    for i in range(len(coordinates)):
        if coordinates[i][0] > max_rows:
            coordinates[i] = new_tuple(table)

    return answer_coordinates


# Function to fill in the float_answer and the answer_coordinates fields of the different datasets
# In case there are no answer_coordinates, the answer_coordinates field is filled with the coordinates of the first non-empty cell
# In case there are answer_coordinates, the answer_coordinates may be limited to a maximum number of rows
def formatDataSet(instance):

    pd_table = table_to_df(instance['table'])

    question, answer_texts, answer_coordinates, float_value, aggregation_function = parse_question(table=pd_table, question=instance['question'], answer_texts=instance['answers'])

    if float_value is None:
        instance["float_answer"] = np.nan
    else:
        instance["float_answer"] = float_value
    if answer_coordinates is None: # Sólo para los ejemplos del conjunto de test ya que para train y dev nunca devolverá None porque hemos filtrado esos ejemplos.
      instance["answer_coordinates"] = [[new_tuple(pd_table)]]
    else:
      instance["answer_coordinates"] = limit_answer_coordinates(pd_table, answer_coordinates, max_rows=20)

    return instance


In [25]:
# Fill in the float_answer and the answer_coordinates fields of the different datasets
train = [formatDataSet(example) for example in filtered_train]
dev = [formatDataSet(example) for example in WTQ_dev]
test = [formatDataSet(example) for example in WTQ_test]

In [26]:
# Function that checks if all the values in a list are numeric
# It is used to check if the values of a cell are numeric in order to compute the sum or the average
def are_all_numeric(values):
    try:
        float_values = [float(value.replace(",", "")) for value in values]
        return all(isinstance(val, (int, float)) for val in float_values)
    except ValueError:
        return False

# Function that obtains the answer from the predicted coordinates and the predicted aggregation
# It somewhat takes into account the different formats of the numeric values inside the cells
def obtain_model_answer(coordinates, predicted_agg, table):

    cell_values = [table.iat[coordinate] for coordinate in coordinates]

    if predicted_agg == "NONE":
        tapas_answer = ", ".join(cell_values)
    elif predicted_agg in ["SUM", "AVERAGE"]:
        if are_all_numeric(cell_values) and len(cell_values) > 1:
            cell_values_float = [float(value.replace(",", "")) for value in cell_values]
            total = sum(cell_values_float)
            if predicted_agg == "SUM":
                tapas_answer = "{:,.0f}".format(total)
            else:  # AVERAGE
                average = total / len(cell_values_float)
                tapas_answer = "{:,.2f}".format(average)
        else:
            tapas_answer = ", ".join(cell_values)
    elif predicted_agg == "COUNT":
        tapas_answer = str(len(cell_values))

    return tapas_answer


# Function that obtains the real answer
def obtain_real_answer(real_answers):
    if len(real_answers) == 1:
        real_answer = real_answers[0]
    else:
        real_answer = ", ".join(real_answers)

    return real_answer


In [27]:
# Evaluation function that computes the accuracy of the model
def evaluate(model, dataloader):
    model.eval()
    total = 0
    correct = 0
    num_batch = 0
    with torch.no_grad():
      for batch in dataloader:
        batch["inputs"]["input_ids"] = batch["inputs"]["input_ids"].cpu()
        batch["inputs"]["attention_mask"] = batch["inputs"]["attention_mask"].cpu()
        batch["inputs"]["token_type_ids"] = batch["inputs"]["token_type_ids"].cpu()
        batch["inputs"]["labels"] = batch["inputs"]["labels"].cpu()
        batch["inputs"]["numeric_values"] = batch["inputs"]["numeric_values"].cpu()
        batch["inputs"]["numeric_values_scale"] = batch["inputs"]["numeric_values_scale"].cpu()
        batch["float_answer"] = batch["float_answer"].cpu()

        # Forward pass the whole batch
        model_outputs = model.forward(batch)

        # Inference of the model
        predicted_answer_coordinates, predicted_aggregation_indices = model.tokenizer.convert_logits_to_predictions(
          batch["inputs"],
          model_outputs.logits.detach().cpu(),
          model_outputs.logits_aggregation.detach().cpu(),
        )

        for i in range(len(predicted_answer_coordinates)):

            id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
            predicted_agg = id2aggregation[predicted_aggregation_indices[i]]

            coordinates = predicted_answer_coordinates[i]
            table = table_to_df(batch["tables"][i])

            tapas_answer = obtain_model_answer(coordinates, predicted_agg, table)
            real_answer = obtain_real_answer(batch["answers"][i])

            if tapas_answer == real_answer:
                correct += 1
            total += 1
            """
            print("-------------------------")
            print("TAPAS answer: ", tapas_answer)
            print("Real answer: ", real_answer)
            print("predicted aggregation: ", predicted_agg)
            print("Correct: ", tapas_answer == real_answer)
            print("-------------------------")
            """

        """
        num_batch += 1
        print("###########################")
        print(f"batch: {num_batch}")
        print(f"correct: {correct}")
        print(f"total: {total}")
        print("accuracy: ", correct/total)
        print("##########################")
        """

    return correct/total

In [28]:
# Definition of the TAPAS classifier model y and the forward pass
class TAPASTrainingClassifier(nn.Module):
    def __init__(self, config, params):
        super().__init__()

        self.model_name = params["model_name"]
        self.tokenizer = TapasTokenizer.from_pretrained(self.model_name)
        self.tapas = TapasForQuestionAnswering.from_pretrained(self.model_name, config=config)

    def forward(self, batch):
        tapas_outputs = self.tapas(input_ids=batch["inputs"]["input_ids"].to(device),
                                   attention_mask=batch["inputs"]["attention_mask"].to(device),
                                   token_type_ids=batch["inputs"]["token_type_ids"].to(device),
                                   labels=batch["inputs"]["labels"].to(device),
                                   numeric_values=batch["inputs"]["numeric_values"].to(device),
                                   numeric_values_scale=batch["inputs"]["numeric_values_scale"].to(device),
                                   float_answer=batch["float_answer"].to(device))

        return tapas_outputs

# Definition of the TAPAS collator in order to build the batches that will be extracted from the dataloader and fed to the model
class TapasTrainingCollator(DefaultDataCollator):

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        new_batch = {"ids": [], "questions": [], "tables": [], "answers": [], "inputs": [], "float_answer": []}
        for instance in args[0]:
            new_batch["ids"].append(instance["id"])
            new_batch["questions"].append(instance["question"])
            new_batch["tables"].append(instance["table"])
            new_batch["answers"].append(instance["answers"])

            # For some instances, the token_type_ids are larger than the max_vocab_size of the tokenizer so it is necessary to replace them with the max_vocab_size - 1 for those examples
            max_vocab_size = 256
            instance_inputs = self.build_input(instance["table"], instance["question"], instance['answer_coordinates'], instance["answers"])
            instance_inputs["token_type_ids"] = torch.where( instance_inputs["token_type_ids"] >= max_vocab_size, max_vocab_size - 1,  instance_inputs["token_type_ids"])

            new_batch["inputs"].append(instance_inputs)
            new_batch["float_answer"].append((torch.tensor(instance["float_answer"]).reshape(1)).to(device))

        # merge input tensors
        new_batch["inputs"] = {
            key: torch.cat([item[key] for item in new_batch["inputs"]], dim=0)
            for key in new_batch["inputs"][0]
        }

        # Merge float_answer tensors
        new_batch["float_answer"] = torch.cat(new_batch["float_answer"], dim=0)

        return new_batch


    def build_input(self, item_table, question, answer_coordinates, answers):
      table = table_to_df(item_table)
      return self.tokenizer(table=table, queries=[question], answer_coordinates=answer_coordinates, answer_text=answers, truncation=True, padding="max_length", max_length=512, return_tensors="pt").to(device)

In [29]:
# Training and evaluation function
def train_and_evaluate(tapas_model_name, config, train, dev, num_epochs, batch_size):

  start_time=time.time()
  # Instantiate the TAPAS model
  tapas = TAPASTrainingClassifier(config=config, params={"model_name": tapas_model_name})
  tapas.to(device)


  # Instantiate the TAPAS train and dev dataloaders
  tapas_train_collator = TapasTrainingCollator(tokenizer=tapas.tokenizer)
  tapas_dev_collator = TapasTrainingCollator(tokenizer=tapas.tokenizer)

  train_dataloader = DataLoader(train, batch_size=batch_size, collate_fn=tapas_train_collator)
  dev_dataloader = DataLoader(dev, batch_size=batch_size, collate_fn=tapas_dev_collator)

  # Learning rate and optimizer
  learning_rate = 5e-5
  optimizer = torch.optim.Adam(tapas.parameters(), lr=learning_rate)

  # Warmup scheduler
  # Ajustar el cálculo de num_training_steps y num_warmup_steps
  warmup_ratio = 0.128960
  num_training_steps = len(train_dataloader) * num_epochs
  num_warmup_steps = int(warmup_ratio * num_training_steps)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

  best_dev_acc = 0.0
  tapas.train()
  for epoch in range(num_epochs):  # loop over the dataset multiple times
      for batch in train_dataloader:
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward
        outputs = tapas.forward(batch)

        # backward
        loss = outputs.loss
        loss.backward()

        # optimize
        optimizer.step()
        scheduler.step()

        print("loss: %3f" % (loss))

      dev_acc = evaluate(tapas, dev_dataloader)
      if epoch % 1 == 0:
        print("Epoch %s, dev accuracy: %.3f" % (epoch, dev_acc))
        if dev_acc > best_dev_acc:
          best_dev_acc = dev_acc
          torch.save(tapas.state_dict(), "tapas_base_bestModel")

  print("\nBest Performing Model achieves dev accuracy of : %.3f" % (best_dev_acc))
  print("Time: %.3f seconds ---" % (time.time() - start_time))

In [30]:
# default WTQ configuration for TAPAS hyperparameters
config = TapasConfig(
    num_aggregation_labels=4,
    use_answer_as_supervision=True,
    answer_loss_cutoff=0.664694,
    cell_selection_preference=0.207951,
    huber_loss_delta=0.121194,
    init_cell_selection_weights_to_zero=True,
    select_one_column=True,
    allow_empty_column_selection=False,
    temperature=0.0352513,
)

In [None]:
# Training the model
train_and_evaluate("google/tapas-base", config, train, dev, num_epochs=6, batch_size=16)

In [31]:
tapas = TAPASTrainingClassifier(config = config, params={"model_name": "google/tapas-base"})

tapas.to(device)
tapas.load_state_dict(torch.load("tapas_base_bestModel"))

tapas_train_collator = TapasTrainingCollator(tokenizer=tapas.tokenizer)
tapas_test_collator = TapasTrainingCollator(tokenizer=tapas.tokenizer)
tapas_dev_collator = TapasTrainingCollator(tokenizer=tapas.tokenizer)

train_dataloader = DataLoader(train, batch_size=16, collate_fn=tapas_train_collator)
test_dataloader = DataLoader(test, batch_size=16, collate_fn=tapas_test_collator)
dev_dataloader = DataLoader(dev, batch_size=16, collate_fn=tapas_dev_collator)

train_accuracy = evaluate(tapas, train_dataloader)
test_accuracy = evaluate(tapas, test_dataloader)
dev_accuracy = evaluate(tapas, dev_dataloader)
print("Train accuracy: ", train_accuracy)
print("Dev accuracy: ", test_accuracy)
print("Test accuracy: ", test_accuracy)

tokenizer_config.json:   0%|          | 0.00/490 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/262k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of TapasForQuestionAnswering were not initialized from the model checkpoint at google/tapas-base and are newly initialized: ['output_bias', 'column_output_bias', 'output_weights', 'column_output_weights', 'aggregation_classifier.weight', 'aggregation_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train accuracy:  0.6643238318365222
Dev accuracy:  0.214548802946593
Test accuracy:  0.214548802946593


|Model|Test Acc|
|--|--|
|Base TAPAS(trained on WTQ) Herzig et al.|48.8|
|Base TAPAS(trained on WTQ) Implementado|46.3|
|Base vanilla TAPAS(finetuned on WTQ)|21.4|