In [1]:
import pandas as pd
df=pd.read_csv("/kaggle/input/table-to-text-generation-dataset-google-totto/totto_data/tablesWithTagDev.csv")

In [2]:
len(df)

7700

In [3]:
MAXLENI=400
MAXLENO=200

In [4]:
from transformers import T5Tokenizer
from transformers import T5ForConditionalGeneration
from transformers import AdamW, WarmUp, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import os
import time
import copy
import numpy
import matplotlib.pyplot as plt

In [5]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model=torch.load("/kaggle/input/table-to-text-generation-utils/T5Epoch 7", map_location=device)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [7]:
special_tokens_dict = {'pad_token': '<pad>', 'bos_token': '<bos>', 'eos_token': '<eos>', 
                       'additional_special_tokens': ['<PAGESTART>', '<PAGEEND>', '<SECTIONSTART>', '<SECTIONEND>',
                                                     '<TABLESTART>','<TABLEEND>','<CELLSTART>','<CELLEND>','<COLHEADERSTART>',
                                                     '<COLHEADEREND>','<ROWHEADERSTART>','<ROWHEADEREND>']}

num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

print('We have added', num_added_toks, 'tokens')
model.encoder.resize_token_embeddings(len(tokenizer))
model.decoder.resize_token_embeddings(len(tokenizer))

We have added 14 tokens


Embedding(32114, 768)

In [8]:
class tottodataset(Dataset):
  def __init__(self,df,tokenizer):
    self.sentence=df['sentence']
    self.table=df['table']
    self.tokenizer=tokenizer

  def __len__(self):
    return len(self.sentence)
  
  def __getitem__(self,idx):
    inp=(self.table[idx]+'</s>').replace("<page_title>", "<PAGESTART>").replace("</page_title>", "<PAGEEND>") \
                                    .replace("<section_title>", "<SECTIONSTART>").replace("</section_title>", "<SECTIONEND>") \
                                    .replace("<table>", "<TABLESTART>").replace("</table>", "<TABLEEND>") \
                                    .replace("<cell>", "<CELLSTART>").replace("</cell>", "<CELLEND>") \
                                    .replace("<col_header>", "<COLHEADERSTART>").replace("</col_header>", "<COLHEADEREND>") \
                                    .replace("<row_header>", "<ROWHEADERSTART>").replace("</row_header>", "<ROWHEADEREND>")
    out=self.sentence[idx]+'</s>'
    inp_tokens=self.tokenizer.encode_plus(inp, padding="max_length", max_length=MAXLENI, truncation=True)
    out_tokens=self.tokenizer.encode_plus(out, padding="max_length", max_length=MAXLENO, truncation=True)
    inp_id=inp_tokens.input_ids
    out_id=out_tokens.input_ids
    inp_mask=inp_tokens.attention_mask
    out_mask=out_tokens.attention_mask
    labels=out_tokens.input_ids.copy()
    labels=[-100  if x==self.tokenizer.pad_token_id else x for x in labels]

    return {
        "table_text":inp,
        "sentence":out,
        "input_ids":torch.tensor(inp_id, dtype=torch.long),
        "input_attention_mask":torch.tensor(inp_mask, dtype=torch.long),
        "decoder_input_ids":torch.tensor(out_id, dtype=torch.long),
        "decoder_attention_mask":torch.tensor(out_mask, dtype=torch.long),
        "labels":torch.tensor(labels, dtype=torch.long)
    }

In [9]:
test_dataset=tottodataset(df,tokenizer)

test_dataloader=DataLoader(test_dataset,
                            batch_size=64,
                            num_workers=2,
                            shuffle=False)

In [10]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32114, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [11]:
!nvidia-smi

Mon Aug 23 15:38:59 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    32W / 250W |   1981MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [12]:
test_iterator=iter(test_dataloader)

In [13]:
test_batch=next(test_iterator)

In [14]:
generation_output = model.generate(test_batch['input_ids'].to(device), return_dict_in_generate=True, output_scores=True)

In [15]:
generation_output["sequences"]

tensor([[    0,  4173,  7780,  ...,   552,   507,  4581],
        [    0,    86,  5123,  ...,  2129,     6,    11],
        [    0, 12833,   141,  ...,     0,     0,     0],
        ...,
        [    0,  1184,    51,  ...,    52,     7,    10],
        [    0,  3037,  3233,  ...,     6, 12035,     6],
        [    0,    37,  1348,  ...,    16,  6156,     9]], device='cuda:0')

In [16]:
expected=test_batch["sentence"][0]
predicted=tokenizer.decode(generation_output["sequences"][0],skip_special_tokens=True)

In [17]:
print(f"Expected Sentence: {expected}")
print(f"Predicted Sentence: {predicted}")

Expected Sentence: Daniel Henry Chamberlain was the 76th Governor of South Carolina from 1874.</s>
Predicted Sentence: Daniel Henry Chamberlain was the 76th Governor of South Carolina, serving until 1874


In [18]:
!nvidia-smi

Mon Aug 23 15:39:03 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    38W / 250W |   4419MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [19]:
# bleurt_dataloader=DataLoader(test_dataset,
#                             batch_size=1,
#                             num_workers=2,
#                             shuffle=False)

In [20]:
from tqdm.notebook import tqdm

fref = open('./corrects.txt', 'w+')
fout = open('./outputs.txt', 'w+')

output=[]
corrects=[]

for batch in tqdm(test_dataloader):
    input_ids=batch["input_ids"].to(device)
    
    outputs = model.generate(input_ids, return_dict_in_generate=True, output_scores=True)
    for i in range(64):
        try:
            outputstring = tokenizer.decode(outputs["sequences"][i], skip_special_tokens=True)

            refstring=batch["sentence"][i].replace('</s>',"")
            output.append(outputstring)
            corrects.append(refstring)

            fref.write(refstring+'\n')
            fout.write(outputstring+'.\n')
        except:
            pass

fref.close()
fout.close()

  0%|          | 0/121 [00:00<?, ?it/s]

In [21]:
%%bash
git clone https://github.com/google-research/language.git language_repo
cd language_repo
pip3 install -r language/totto/eval_requirements.txt

Collecting sacrebleu
  Downloading sacrebleu-2.0.0-py3-none-any.whl (90 kB)
Installing collected packages: sacrebleu
Successfully installed sacrebleu-2.0.0


Cloning into 'language_repo'...


In [22]:
!ls

__notebook__.ipynb  corrects.txt  language_repo  outputs.txt


In [23]:
%cd language_repo
!bash language/totto/totto_eval.sh --prediction_path language/totto/sample/output_sample.txt --target_path language/totto/sample/dev_sample.jsonl

/kaggle/working/language_repo
Running with the following variables:
PREDICTION_PATH   : language/totto/sample/output_sample.txt
TARGET_PATH       : language/totto/sample/dev_sample.jsonl 
BLEURT_CKPT       : unset 
OUTPUT_DIR        : temp
MODE              : test
Creating Output directory.
Cloning moses for BLEU script.
Cloning into 'temp/mosesdecoder'...
remote: Enumerating objects: 148070, done.[K
remote: Counting objects: 100% (498/498), done.[K
remote: Compressing objects: 100% (206/206), done.[K
remote: Total 148070 (delta 315), reused 433 (delta 289), pack-reused 147572[K
Receiving objects: 100% (148070/148070), 129.86 MiB | 19.84 MiB/s, done.
Resolving deltas: 100% (114341/114341), done.
Writing references.
Writing tables in PARENT format.
Preparing predictions.
Writing predictions.
Running detokenizers.
Computing BLEU (overall)
{
 "name": "BLEU",
 "score": 45.5,
 "signature": "nrefs:3|case:mixed|eff:no|tok:13a|smooth:exp|version:2.0.0",
 "verbose_s

In [24]:
len(output)

7700

In [25]:
!bash language/totto/totto_eval.sh --prediction_path /kaggle/working/outputs.txt --target_path /kaggle/input/table-to-text-generation-dataset-google-totto/totto_data/totto_dev_data.jsonl --output_dir /kaggle/working/

Running with the following variables:
PREDICTION_PATH   : /kaggle/working/outputs.txt
TARGET_PATH       : /kaggle/input/table-to-text-generation-dataset-google-totto/totto_data/totto_dev_data.jsonl 
BLEURT_CKPT       : unset 
OUTPUT_DIR        : /kaggle/working
MODE              : test
Cloning moses for BLEU script.
Cloning into '/kaggle/working/mosesdecoder'...
remote: Enumerating objects: 148070, done.[K
remote: Counting objects: 100% (498/498), done.[K
remote: Compressing objects: 100% (206/206), done.[K
remote: Total 148070 (delta 315), reused 433 (delta 289), pack-reused 147572[K
Receiving objects: 100% (148070/148070), 129.86 MiB | 22.12 MiB/s, done.
Resolving deltas: 100% (114341/114341), done.
Writing references.
Writing tables in PARENT format.
Preparing predictions.
Writing predictions.
Running detokenizers.
Computing BLEU (overall)
{
 "name": "BLEU",
 "score": 32.6,
 "signature": "nrefs:3|case:mixed|eff:no|tok:13a|smooth:exp|version:2.0.0",
 "verb