# Imports

In [1]:
%load_ext lab_black

In [2]:
import os

# Switch the working directory to the repository checkout, presumably so that
# the `src.*` imports used later in the notebook resolve.
# The location can be overridden with the DOCUMENT_INFORMATION_EXTRACTION_ROOT
# environment variable; the original hard-coded path is kept as the default so
# the notebook behaves as before on the author's machine.
os.chdir(
    os.environ.get(
        "DOCUMENT_INFORMATION_EXTRACTION_ROOT",
        "/home/ivanr/git/document_information_extraction/",
    )
)

In [23]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import (
    BertTokenizerFast as BertTokenizer,
    BertModel,
    AdamW,
    get_linear_schedule_with_warmup,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Seed handed to `pl.seed_everything` at the end of this cell.
# NOTE(review): the data-loading cell further down reassigns RANDOM_SEED to 0,
# so any cell reading the name after that point sees 0 rather than 42.
RANDOM_SEED = 42

# Plot styling: seaborn "whitegrid" theme with an enlarged font scale.
sns.set(style="whitegrid", palette="muted", font_scale=1.2)
# Custom six-colour palette (hex codes); applied right below, replacing the
# "muted" palette requested in the `sns.set` call above.
HAPPY_COLORS_PALETTE = [
    "#01BEFE",
    "#FFDD00",
    "#FF7D00",
    "#FF006D",
    "#ADFF02",
    "#8F00FF",
]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
# Default matplotlib figure size, stored as the tuple (12, 8).
rcParams["figure.figsize"] = 12, 8

# Seed through pytorch-lightning; the cell output reports "Global seed set to 42".
pl.seed_everything(RANDOM_SEED)

Global seed set to 42


42

# Statics

In [20]:
# Checkpoint identifiers passed to `from_pretrained` in the model-loading cells.
BART_MODEL_NAME = "facebook/bart-large-cnn"
BERT_MODEL_NAME = "bert-base-cased"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
TORCH_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device: ", TORCH_DEVICE)
# NOTE(review): `language` is not read by any other cell visible in this
# notebook — confirm whether it is still needed.
language = "english"

Device:  cuda


# Load model

In [21]:
# Load the BERT tokenizer and encoder for BERT_MODEL_NAME.
# `BertTokenizer` is the import alias of `BertTokenizerFast` (see the imports cell).
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
# The model is moved to TORCH_DEVICE right after loading.
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True).to(
    TORCH_DEVICE
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
# Load the tokenizer and the seq2seq model for BART_MODEL_NAME; the model is
# moved to TORCH_DEVICE right after loading.
tokenizer_bart = AutoTokenizer.from_pretrained(BART_MODEL_NAME)
bart_model = AutoModelForSeq2SeqLM.from_pretrained(BART_MODEL_NAME).to(TORCH_DEVICE)

In [26]:
# Dump the BART configuration (output below) and set the sequence-length limits.
print(bart_model.config)
# Presumably the maximum encoder input length for BART. The printed config is
# cut off before any positional-limit field, so this value is not confirmed by
# the output — TODO confirm.
bart_encoder_max_length = 1024
# Presumably the maximum length of a generated/target summary — confirm.
# The BERT config cell assigns the same name and value again.
decoder_max_length = 64

BartConfig {
  "_name_or_path": "facebook/bart-large-cnn",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "L

In [27]:
# Dump the BERT configuration (output below) and set the sequence-length limits.
print(bert_model.config)
# Matches "max_position_embeddings": 512 in the config printed below.
encoder_max_length = 512
# Same name and value as assigned in the BART config cell; this reassignment
# does not change anything.
decoder_max_length = 64

BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.6.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



# Load data

In [28]:
from src.data.data_statics import (
    MIN_SEMANTIC_SIMILARITY,
    MIN_NOVELTY,
    MAX_NOVELTY,
    MAX_TOKENS_BODY,
)
import pandas as pd
from src.data.wikipedia.wiki_data_base import (
    retrieve_query,
    retrive_observations_from_ids,
)

# Seed for the page-id sampling below.
# NOTE(review): this rebinds the RANDOM_SEED defined as 42 in the imports cell;
# `pl.seed_everything` was already called with 42 before this line runs.
RANDOM_SEED = 0
# Number of page ids drawn from the suitable articles.
N_SAMPLE_TEXTS = 1000

# Select article-level rows joined with their novelty and semantic-similarity
# scores, keeping only articles that satisfy the four thresholds imported from
# `src.data.data_statics` (minimum similarity, novelty range, maximum body size).
# The thresholds are interpolated into the SQL text with an f-string; they come
# from that project module, not from user input.
QUERY_SUITABLE_ARTICLES = f"""
SELECT ar.*,
       nv.novelty_tokens,
       nv.novelty_bigrams,
       nv.novelty_trigrams,
       cs.semantic_similarity
       
FROM article_level_info ar
INNER JOIN wiki_article_novelty nv
    ON ar.pageid = nv.pageid
INNER JOIN wiki_article_cosine_similarity cs
    ON ar.pageid = cs.pageid
WHERE cs.semantic_similarity>={MIN_SEMANTIC_SIMILARITY}
    AND nv.novelty_tokens<={MAX_NOVELTY}
    AND nv.novelty_tokens>={MIN_NOVELTY}
    AND ar.body_word_count<={MAX_TOKENS_BODY}
"""
import pickle


# Run the query and wrap the result in a DataFrame.
# Column names are assigned positionally; presumably the first four correspond
# to `ar.*` and the last four to the explicitly selected score columns — verify
# against the `article_level_info` schema.
characterisation_df = pd.DataFrame(
    retrieve_query(QUERY_SUITABLE_ARTICLES),
    columns=[
        "pageid",
        "title",
        "summary_word_count",
        "body_word_count",
        "novelty_tokens",
        "novelty_bigrams",
        "novelty_trigrams",
        "semantic_similarity",
    ],
)


# Draw N_SAMPLE_TEXTS page ids, reproducibly via RANDOM_SEED.
pageids_to_evaluate = list(
    characterisation_df["pageid"].sample(n=N_SAMPLE_TEXTS, random_state=RANDOM_SEED)
)

# Fetch the sampled articles. Despite the name, the result is sliced below, so
# it is presumably a list (or another sliceable sequence), not a lazy generator.
ARTICLE_GENERATOR = retrive_observations_from_ids(pageids_to_evaluate)
# First 900 retrieved rows are used for training.
ARTICLE_GENERATOR_TRAIN = ARTICLE_GENERATOR[:900]
# NOTE(review): the test split starts at index 998, so with 1000 sampled ids it
# holds at most 2 rows (the validation output further down shows 2 summaries)
# and rows 900-997 are used by neither split. Confirm whether `[900:]` was
# intended.
ARTICLE_GENERATOR_TEST = ARTICLE_GENERATOR[998:]


def decode_row(article):
    """Split one retrieved article row into its summary and body text.

    Args:
        article: indexable row; position 2 holds the summary and position 3
            holds a pickled iterable of body text pieces.

    Returns:
        Tuple ``(summary, body)`` where ``body`` is the unpickled pieces
        concatenated without a separator.
    """
    # NOTE(review): pickle.loads executes arbitrary code on malicious payloads;
    # only safe while the rows come from a trusted database.
    body_pieces = pickle.loads(article[3])
    return article[2], "".join(body_pieces)


def convert_to_features(example_batch):
    """Collect the decoded bodies and summaries of a batch of article rows.

    No tokenization happens here: each row is passed through ``decode_row`` and
    the resulting texts are gathered into two parallel lists.

    Args:
        example_batch: iterable of article rows accepted by ``decode_row``.

    Returns:
        Dict with key ``"document"`` (list of body texts) and key ``"summary"``
        (list of summary texts), both in input order.
    """
    decoded_pairs = [decode_row(article) for article in example_batch]
    return {
        "document": [body for _, body in decoded_pairs],
        "summary": [summary for summary, _ in decoded_pairs],
    }

In [29]:
# Build {"document": [...], "summary": [...]} dicts for the two splits.
# Note the naming: the "validation" data is built from ARTICLE_GENERATOR_TEST.
train_data = convert_to_features(ARTICLE_GENERATOR_TRAIN)
validation_data = convert_to_features(ARTICLE_GENERATOR_TEST)

In [30]:
# Inspect the summary texts of the held-out rows (two entries in the output below).
validation_data["summary"]

[' \n\nVikram Sampath is an Indian columnist and author of four books.\n\nHe has authored Savarkar: Echoes From A Forgotten Past (Part 1 and 2) and My Name Is Gauhar Jaan: The Life and Times of a Musician. He is a columnist for The Print.\n\n',
 ' William Roe, DL (1748 – 6 March 1826) was an English civil servant. He was a Commissioner for Auditing Public Accounts from 1783 to 1788, and then a Commissioner for Customs until 1819; he was chairman of the Board of Customs for England and Wales from 1805 to 1819.\n\n']

In [19]:
# Encode the held-out summaries with the BERT tokenizer, padded to 512 tokens.
# NOTE(review): `validation_data["summary"]` is a list of two strings, yet the
# resulting tensors have a single row (see the shape cell below). Presumably
# `encode_plus` treats the two-element list as one text pair rather than as a
# batch. If one row per summary is wanted, call `tokenizer(list_of_texts, ...)`
# instead — confirm the intent.
encoding = tokenizer.encode_plus(
    validation_data["summary"],
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids=False,
    padding="max_length",
    # Without truncation, `max_length` only pads: longer inputs would exceed the
    # 512-token limit. This matches the settings used in ToxicCommentsDataset.
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)

encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [33]:
# Check the tensor shapes of the BERT encoding; the recorded output is a single
# row of 512 positions for both tensors.
encoding["input_ids"].shape, encoding["attention_mask"].shape

(torch.Size([1, 512]), torch.Size([1, 512]))

In [32]:
# Encode the held-out summaries with the BART tokenizer, padded to 1024 tokens.
# NOTE(review): `validation_data["summary"]` is a list of two strings, yet the
# resulting tensors have a single row (see the shape cell below). Presumably
# `encode_plus` treats the two-element list as one text pair rather than as a
# batch. If one row per summary is wanted, call `tokenizer_bart(list_of_texts, ...)`
# instead — confirm the intent.
encoding_bart = tokenizer_bart.encode_plus(
    validation_data["summary"],
    add_special_tokens=True,
    max_length=1024,
    return_token_type_ids=False,
    padding="max_length",
    # Without truncation, `max_length` only pads: longer inputs would exceed the
    # 1024-token limit.
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)

# Inspect the BART encoding; the original cell looked at the BERT `encoding`
# here, a copy-paste slip.
encoding_bart.keys()

dict_keys(['input_ids', 'attention_mask'])

In [34]:
# Check the tensor shapes of the BART encoding; the recorded output is a single
# row of 1024 positions for both tensors.
encoding_bart["input_ids"].shape, encoding_bart["attention_mask"].shape

(torch.Size([1, 1024]), torch.Size([1, 1024]))

# Metrics

In [42]:
# nltk.download("punkt", quiet=True)

# metric = datasets.load_metric("rouge")


# def postprocess_text(preds, labels):
#     preds = [pred.strip() for pred in preds]
#     labels = [label.strip() for label in labels]

#     # rougeLSum expects newline after each sentence
#     preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
#     labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

#     return preds, labels


# def compute_metrics(eval_preds):
#     preds, labels = eval_preds
#     if isinstance(preds, tuple):
#         preds = preds[0]
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     # Replace -100 in the labels as we can't decode them.
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     # Some simple post-processing
#     decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

#     result = metric.compute(
#         predictions=decoded_preds, references=decoded_labels, use_stemmer=True
#     )
#     # Extract a few results from ROUGE
#     result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

#     prediction_lens = [
#         np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
#     ]
#     result["gen_len"] = np.mean(prediction_lens)
#     result = {k: round(v, 4) for k, v in result.items()}
#     return result

# Training

In [35]:
class ToxicCommentsDataset(Dataset):
    """Dataset that tokenizes one text row at a time and returns it with its labels.

    NOTE(review): the class name and the ``comment_text`` column look carried
    over from a comment-classification example, while the rest of this notebook
    handles article summaries — confirm that this class is still wanted.

    Args:
        data: frame with a ``comment_text`` column and one numeric column per
            label.
        tokenizer: object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used in ``__getitem__``.
        max_token_len: length every encoded text is padded/truncated to.
        label_columns: names of the label columns in ``data``. When omitted,
            the module-level ``LABEL_COLUMNS`` is looked up at item-access
            time, which is what the original implementation always did.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: "BertTokenizer",
        max_token_len: int = 128,
        label_columns=None,
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len
        # Stored as a list so the row lookup below is always a multi-label
        # selection, even when a tuple is passed in.
        self.label_columns = list(label_columns) if label_columns is not None else None

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Return the raw text, token ids, attention mask and labels of one row.

        Returns:
            Dict with ``comment_text`` (the raw text), ``input_ids`` and
            ``attention_mask`` (flattened tensors from the tokenizer) and
            ``labels`` (float32 tensor, one entry per label column).

        Raises:
            NameError: if no ``label_columns`` were given and no module-level
                ``LABEL_COLUMNS`` exists.
        """
        data_row = self.data.iloc[index]

        comment_text = data_row.comment_text
        # Fall back to the global only when no columns were given, keeping the
        # original (global-based) usage working.
        label_columns = (
            self.label_columns if self.label_columns is not None else LABEL_COLUMNS
        )
        labels = data_row[label_columns]

        encoding = self.tokenizer.encode_plus(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return dict(
            comment_text=comment_text,
            # The tokenizer returns tensors with a leading batch axis of size 1
            # (return_tensors="pt"); flatten drops it.
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            # Convert through NumPy explicitly instead of letting torch iterate
            # over a label-indexed Series; the result is still a float32 tensor.
            labels=torch.tensor(labels.to_numpy(dtype="float32")),
        )

In [12]:
# NOTE(review): `trainer` is not defined in any cell visible in this notebook,
# and the recorded run ends in `KeyError: 2`. Presumably the cell was run
# against kernel state from since-removed cells — it will not survive
# Restart & Run All as it stands.
trainer.train()

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ivanr/.netrc


KeyError: 2