In [1]:
import numpy as np
import pandas as pd

In [None]:
# Column names are supplied explicitly, so pandas treats the first row of the
# CSV as data rather than as a header.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"

# Read the whole tweet corpus from the bucket into a DataFrame.
data = pd.read_csv(
    "gs://data-reasearch/data.csv",
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

In [None]:
# Map label 4 onto 1; every other target value is left untouched.
data["target"] = data["target"].replace(4, 1)

# Total number of whitespace-separated words across all tweets (cell output).
data["text"].map(lambda tweet: len(tweet.split())).sum()

In [None]:
from datasets import ClassLabel, Dataset

# Convert the frame to a `datasets.Dataset`, cast `target` to a two-class label
# so the split can stratify on it, then hold out 20% with a fixed seed.
data_set = (
    Dataset.from_pandas(data)
    .cast_column("target", ClassLabel(num_classes=2, names=[0, 1]))
    .train_test_split(test_size=0.2, stratify_by_column="target", seed=1234)
)

In [25]:
# Write the held-out test split to a local CSV; a later cell reads "test.csv" back.
data_set["test"].to_csv("test.csv")

Creating CSV from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

45968943

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [5]:
# Show the untruncated text of the first rows as a NumPy array.
data.head().text.values

array(["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
       "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
       '@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds',
       'my whole body feels itchy and like its on fire ',
       "@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. "],
      dtype=object)

In [6]:
from collections import Counter

# 50 most frequent whitespace-separated tokens in the corpus (case-sensitive).
# Tokens are streamed tweet by tweet instead of first building one giant string.
Counter(word for tweet in data.text.to_list() for word in tweet.split()).most_common(50)

[('to', 552962),
 ('I', 496619),
 ('the', 487501),
 ('a', 366212),
 ('my', 280025),
 ('and', 275263),
 ('i', 250016),
 ('is', 217693),
 ('you', 213871),
 ('for', 209801),
 ('in', 202294),
 ('of', 179554),
 ('it', 171812),
 ('on', 154365),
 ('have', 132249),
 ('so', 125155),
 ('me', 122509),
 ('that', 118685),
 ('with', 110843),
 ('be', 108069),
 ('but', 106272),
 ('at', 102196),
 ("I'm", 99559),
 ('was', 99140),
 ('just', 96284),
 ('not', 88110),
 ('this', 77810),
 ('get', 76734),
 ('like', 73302),
 ('are', 72568),
 ('up', 70007),
 ('all', 67901),
 ('-', 67079),
 ('out', 67030),
 ('go', 62969),
 ('your', 60854),
 ('good', 59775),
 ('day', 55748),
 ('do', 54628),
 ('from', 54182),
 ('got', 53871),
 ('now', 53591),
 ('going', 53236),
 ('love', 50051),
 ('no', 49622),
 ('about', 46708),
 ('work', 45913),
 ('will', 45898),
 ('back', 44033),
 ('u', 43568)]

# Tokenizer

In [165]:
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers

# Word-level model: tokens outside the trained vocabulary map to "<unk>".
tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))

# Normalizers run in the order listed, before pre-tokenization.
text_normalizers = [
    normalizers.NFD(),
    normalizers.StripAccents(),
    normalizers.Strip(),
    normalizers.Lowercase(),
]
tokenizer.normalizer = normalizers.Sequence(text_normalizers)

# Split on whitespace first, then further on punctuation and digits.
text_splitters = [
    pre_tokenizers.Whitespace(),
    pre_tokenizers.Punctuation(),
    pre_tokenizers.Digits(),
]
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(text_splitters)

# Learn a vocabulary of at most 10000 entries from every tweet; the two special
# tokens are listed first in `special_tokens`.
trainer = trainers.WordLevelTrainer(
    vocab_size=10000,
    special_tokens=["<unk>", "<pad>"],
    show_progress=True,
)
tokenizer.train_from_iterator(data.text.to_list(), trainer=trainer)

# Pad batches with "<pad>" and cap every encoding at 64 tokens.
pad_token_id = tokenizer.token_to_id("<pad>")
tokenizer.enable_padding(pad_id=pad_token_id, pad_token="<pad>")
tokenizer.enable_truncation(max_length=64)

In [None]:
# Display the learned token -> id mapping.
# NOTE(review): this prints the whole vocabulary (up to 10000 entries).
tokenizer.get_vocab()

In [38]:
# Encode one sample sentence and show the resulting token ids.
tokenizer.encode("Hello how are you r u fine").ids

[0, 0, 0, 0, 0]

In [39]:
# Show the padding configuration currently enabled on the tokenizer.
tokenizer.padding

{'length': None,
 'pad_to_multiple_of': None,
 'pad_id': 1,
 'pad_token': '<pad>',
 'pad_type_id': 0,
 'direction': 'right'}

In [46]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained `tokenizers.Tokenizer` so it can be called like any
# Hugging Face tokenizer. `model_max_length` is the documented name for the
# limit applied when `truncation=True` (`max_len` is the legacy spelling).
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer, model_max_length=5, pad_token="<pad>"
)
# The call accepts a string or a list of strings, not a DataFrame, so try it
# on the text of the first few rows.
fast_tokenizer(data.text.head().to_list(), truncation=True, padding=True)

{'input_ids': [[0, 0, 1, 1, 1], [6, 0, 0, 1, 1], [0, 0, 5, 7, 4], [0, 8, 0, 0, 0], [0, 0, 0, 9, 2]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 0, 0, 0], [1, 1, 1, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

In [9]:
# Re-check the tokenizer's padding configuration.
tokenizer.padding

# DataLoader

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset


class TweetDataSet(Dataset):
    """Dataset subclass whose body currently wires up CIFAR-10 image loading.

    NOTE(review): despite the name, nothing in this class touches tweets. It
    uses `transforms`, `CIFAR10`, `random_split` and `BATCH_SIZE`, none of
    which are imported or defined in the cells visible here, so instantiating
    it will presumably raise `NameError` — confirm and replace with a
    tweet-specific implementation. The `prepare_data` / `setup` /
    `*_dataloader` methods look like a data-module layout rather than a
    `Dataset` (`__len__` / `__getitem__` are not defined) — TODO confirm the
    intended base class.
    """

    def __init__(self, data_dir: str = "./"):
        """Store `data_dir` and build the image transform pipeline.

        Args:
            data_dir: Directory passed to `CIFAR10` by the other methods.
        """
        super(TweetDataSet, self).__init__()
        self.data_dir = data_dir
        # NOTE(review): `transforms` is not imported in any visible cell.
        self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
            ]
        )

        # Fixed metadata; presumably (channels, height, width) and class count
        # of the image dataset — TODO confirm, unrelated to the tweet data.
        self.dims = (3, 32, 32)
        self.num_classes = 10

    def prepare_data(self):
        """Instantiate `CIFAR10` for both `train` flags with `download=True`."""
        # download
        CIFAR10(self.data_dir, train=True, download=True)
        CIFAR10(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        """Create the train/val and/or test datasets for the given stage.

        Args:
            stage: "fit", "test", or None; None runs both branches.
        """

        # Assign train/val datasets for use in dataloaders
        if stage == "fit" or stage is None:
            cifar_full = CIFAR10(self.data_dir, train=True, transform=self.transform)
            # Hard-coded 45000 / 5000 split sizes passed to `random_split`.
            self.cifar_train, self.cifar_val = random_split(cifar_full, [45000, 5000])

        # Assign test dataset for use in dataloader(s)
        if stage == "test" or stage is None:
            self.cifar_test = CIFAR10(
                self.data_dir, train=False, transform=self.transform
            )

    def train_dataloader(self):
        """Return a DataLoader over `self.cifar_train` (set by `setup`)."""
        # NOTE(review): `BATCH_SIZE` is not defined in any visible cell.
        return DataLoader(self.cifar_train, batch_size=BATCH_SIZE)

    def val_dataloader(self):
        """Return a DataLoader over `self.cifar_val` (set by `setup`)."""
        return DataLoader(self.cifar_val, batch_size=BATCH_SIZE)

    def test_dataloader(self):
        """Return a DataLoader over `self.cifar_test` (set by `setup`)."""
        return DataLoader(self.cifar_test, batch_size=BATCH_SIZE)


def collator(batch, tokenizer):
    """Tokenize a batch of raw strings and return the fields as tensors.

    The previous version ignored `tokenizer` (it reached for a global
    instead) and built tensors it never returned.

    Args:
        batch: Iterable of text strings for one mini-batch.
        tokenizer: Callable taking `(texts, truncation=..., padding=...)` and
            returning a mapping of field name to equal-length lists of ints
            (for example `input_ids` and `attention_mask`).

    Returns:
        dict mapping every field returned by `tokenizer` to a
        `torch.LongTensor` with one row per input string.
    """
    texts = list(batch)
    encoded = tokenizer(texts, truncation=True, padding=True)
    # padding=True makes every field rectangular, so each converts directly.
    return {field: torch.LongTensor(values) for field, values in encoded.items()}


# Smoke-test the collator on one shuffled mini-batch.
# The DataLoader indexes its dataset by integer position, which a DataFrame
# does not support, so hand it the plain list of tweet texts. The tokenizer
# passed along must be the callable `fast_tokenizer` wrapper, not the raw
# `tokenizers.Tokenizer`.
next(
    iter(
        DataLoader(
            data.text.to_list(),
            batch_size=2,
            shuffle=True,
            collate_fn=lambda batch: collator(batch, fast_tokenizer),
        )
    )
)

# Model

In [3]:
import torch
import torch.nn as nn
from pytorch_lightning.core.module import LightningModule
from torchmetrics import Accuracy
from transformers import AdamW, AutoModel, get_linear_schedule_with_warmup


class TweetCatModel(LightningModule):
    """Tweet classifier: a pretrained language model plus a small MLP head.

    The language model's pooled output feeds
    `Linear -> Dropout -> ReLU -> Linear`, producing `output_dim` logits
    trained with cross-entropy.

    Args:
        hidden_dim: Width of the hidden linear layer in the classifier head.
        dropout_clf: Dropout probability used inside the classifier head.
        output_dim: Number of logits (classes) produced by the head.
        learning_rate: Learning rate given to the optimizer.
        max_epochs: Used as the total step count of the linear LR schedule.
        lm_path: Name or path handed to `AutoModel.from_pretrained`.
    """

    def __init__(
        self, hidden_dim, dropout_clf, output_dim, learning_rate, max_epochs, lm_path
    ):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout_clf = dropout_clf
        self.output_dim = output_dim
        self.loss_fn = nn.CrossEntropyLoss()
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.lm = AutoModel.from_pretrained(lm_path)
        # The head's input width is read off the LM's pooler projection.
        lm_output = self.lm.pooler.dense.out_features

        self.classifier = nn.Sequential(
            nn.Linear(lm_output, hidden_dim),
            nn.Dropout(dropout_clf),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )
        # One accuracy tracker per phase so their running states never mix.
        self.metric = Accuracy()
        self.eval_metric = Accuracy()
        self.test_metric = Accuracy()

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token ids and mask."""
        pooled = self.lm(input_ids, attention_mask).pooler_output
        return self.classifier(pooled)

    def compute_loss(self, logits, labels):
        """Cross-entropy between `logits` and integer class `labels`."""
        return self.loss_fn(logits, labels)

    def configure_optimizers(self):
        """AdamW over trainable parameters with a linear warmup/decay schedule.

        The schedule lengths are expressed in epochs (`max_epochs`), which
        presumes the scheduler is stepped once per epoch — confirm against the
        trainer configuration.
        """
        trainable_params = filter(lambda p: p.requires_grad, self.parameters())
        optimizer = AdamW(trainable_params, lr=self.learning_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(self.max_epochs / 10),
            num_training_steps=self.max_epochs,
        )
        return [optimizer], [scheduler]

    def decode(self, scores):
        """Return the arg-max class index per row of `scores`.

        Args:
            scores: Float tensor of shape [batch_size, output_dim].
        """
        return torch.argmax(scores, dim=1)

    def _shared_step(self, batch, metric):
        """Run forward + loss on `batch` and update `metric`; return the loss.

        `batch` is unpacked as `(input_ids, attention_mask, label_ids)`.
        """
        input_ids, attention_mask, label_ids = batch
        logits = self(input_ids, attention_mask)
        loss = self.compute_loss(logits, label_ids)
        metric(self.decode(logits), label_ids)
        return loss

    def training_step(self, batch, batch_idx):
        """Compute the training loss; log epoch-level loss and accuracy."""
        loss = self._shared_step(batch, self.metric)
        self.log("train_loss", loss, on_epoch=True, on_step=False)
        # Logging the metric object lets Lightning compute and reset it each
        # epoch; previously the accuracy was accumulated but never reported.
        self.log("train_acc", self.metric, on_epoch=True, on_step=False)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and update the validation accuracy."""
        loss = self._shared_step(batch, self.eval_metric)
        self.log("val_loss", loss, on_epoch=True, on_step=False)
        return loss

    def validation_epoch_end(self, outs):
        """Log the accuracy accumulated over the validation epoch."""
        self.log("val_acc", self.eval_metric.compute())
        # A manual compute() does not clear the state; without this reset the
        # accuracy would keep accumulating across epochs.
        self.eval_metric.reset()

    def test_step(self, batch, batch_idx):
        """Compute the test loss; log epoch-level loss and accuracy."""
        loss = self._shared_step(batch, self.test_metric)
        self.log("test_loss", loss, on_epoch=True, on_step=False)
        self.log("test_acc", self.test_metric, on_epoch=True, on_step=False)
        return loss

In [4]:
import os

import yaml


def load_checkpoint_model(path, model_name):
    """Restore a `TweetCatModel` from a checkpoint directory.

    Args:
        path: Directory holding the checkpoint file and its `hparams.yaml`.
        model_name: File name of the checkpoint inside `path`.

    Returns:
        The model returned by `TweetCatModel.load_from_checkpoint`.

    Raises:
        FileNotFoundError: If `hparams.yaml` is missing from `path`.
    """
    path_params = os.path.join(path, "hparams.yaml")
    path_model = os.path.join(path, model_name)
    # The YAML used to be parsed here into a variable that was never used;
    # `load_from_checkpoint` is given the file path and reads it itself, so
    # only fail fast with a clear message when the file is absent.
    if not os.path.isfile(path_params):
        raise FileNotFoundError(f"hyper-parameter file not found: {path_params}")

    return TweetCatModel.load_from_checkpoint(
        checkpoint_path=path_model, hparams_file=path_params
    )


# Restore the fine-tuned classifier and put it in evaluation mode
# (`eval()` returns the module itself), then clear any stored gradients.
retrained_model = load_checkpoint_model("./model", "epoch=2-step=34000.ckpt").eval()
retrained_model.zero_grad()

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base-sentiment were not used when initializing XLMRobertaModel: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to 

In [None]:
# Row-wise softmax over the output of `model` on a random (100, 96) input.
# NOTE(review): `model` is not bound by any cell visible above (only
# `retrained_model` is), so this presumably relies on leftover kernel state —
# confirm which model was meant or delete the cell.
torch.nn.Softmax(dim=1)(model(torch.randn(100, 96)))

In [6]:
import numpy as np
from scipy.special import softmax
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)


def preprocess(text):
    """Replace user mentions and links in `text` with fixed placeholders.

    The text is split on single spaces. A token starting with "@" and longer
    than one character becomes "@user"; a token starting with "http" becomes
    "http". The tokens are re-joined with single spaces, so runs of spaces
    are preserved.
    """

    def _placeholder(token):
        if token.startswith("@") and len(token) > 1:
            token = "@user"
        if token.startswith("http"):
            token = "http"
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))


# Hub identifier of the pretrained sentiment model whose tokenizer and config
# are loaded below.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# NOTE: from here on `tokenizer` refers to the pretrained tokenizer.
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL, padding="max_length")

# The sequence-classification model itself is intentionally not loaded here:
# model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# The string below is a disabled usage example; being the last expression of
# the cell, it is echoed as the cell output.
"""
model.save_pretrained(MODEL)

text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)"""

'\nmodel.save_pretrained(MODEL)\n\ntext = "Good night 😊"\ntext = preprocess(text)\nencoded_input = tokenizer(text, return_tensors=\'pt\')\noutput = model(**encoded_input)\nscores = output[0][0].detach().numpy()\nscores = softmax(scores)'

In [17]:
# Sanity check: classify one negative and one positive sentence and show the
# arg-max class index for each.
sanity_batch = tokenizer(
    ["I hate you", "I love you"],
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
retrained_model(**sanity_batch).argmax(dim=1)

tensor([0, 1])

# Explain

In [93]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
from captum.attr import (
    InternalInfluence,
    LayerActivation,
    LayerConductance,
    LayerDeepLift,
    LayerDeepLiftShap,
    LayerFeatureAblation,
    LayerGradCam,
    LayerGradientShap,
    LayerGradientXActivation,
    LayerIntegratedGradients,
    LayerLRP,
    TokenReferenceBase,
)
from captum.attr import visualization as viz

# Id of the padding token, used as the reference token for captum baselines.
PAD_IND = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)

# Gradient x activation attribution taken at the language model's embedding layer.
lig = LayerGradientXActivation(
    forward_func=retrained_model,
    layer=retrained_model.lm.embeddings,
    multiply_by_inputs=True,
)


def get_tokens_from_offsets(text, offsets):
    """Slice `text` into one substring per `(start, end)` pair in `offsets`."""
    pieces = []
    for start, end in offsets:
        pieces.append(text[start:end])
    return pieces


# Example sentences and the class index to attribute for each one.
sentence_list = [
    "It was a fantastic performance !",
    "This is a bad movie",
    "I wake up late",
    "This guy is not nice",
    "This woman is beautiful"
]
target = torch.LongTensor([1, 0, 0, 0, 1])

# Tokenize; the offset mapping recovers the surface text of every token
# (padding positions map to empty strings).
text = [preprocess(s) for s in sentence_list]
tokenized = tokenizer(text, truncation=True, padding=True, return_offsets_mapping=True)
input_indices = torch.LongTensor(tokenized.input_ids)
attention_mask = torch.LongTensor(tokenized.attention_mask)
tokens = list(map(get_tokens_from_offsets, text, tokenized.offset_mapping))

# predict
pred = retrained_model(input_indices, attention_mask)
pred_prob = torch.softmax(pred, dim=1)
pred_ind = pred.argmax(dim=1)
print(pred_ind)

# compute attributions with gradient x activation at the embedding layer
print(target)
attributions = lig.attribute(
    inputs=input_indices,
    target=target,
    additional_forward_args=attention_mask,
)
# Collapse the embedding dimension to one score per token, then L2-normalise
# each sentence's scores.
attributions = attributions.sum(dim=-1)
attributions = attributions / torch.norm(attributions, dim=1).unsqueeze(1)
# Per-sentence total attribution, computed once instead of once per record.
attribution_totals = attributions.sum(dim=1)

vis = []
for i in range(len(sentence_list)):
    vis.append(
        viz.VisualizationDataRecord(
            word_attributions=attributions[i],
            pred_prob=pred_prob[i].max(),
            pred_class=pred_ind[i],
            true_class=target[i],
            attr_class=str(target[i].item()),
            attr_score=attribution_totals[i],
            raw_input_ids=tokens[i],
            # This attribution call returns no convergence delta; the old
            # `delta[i]` referenced a name that is never defined in this notebook.
            convergence_score=None,
        )
    )
print("Visualize attributions based on Gradients X Input")
_ = viz.visualize_text(vis)

tensor([1, 0, 0, 0, 1])
tensor([1, 0, 0, 0, 1])
Visualize attributions based on Gradients X Input


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,1 (0.99),1.0,-1.27,It was a fantastic performance !
,,,,
0.0,0 (0.98),0.0,-2.01,This is a bad movie
,,,,
0.0,0 (0.79),0.0,-1.7,I wake up late
,,,,
0.0,0 (0.99),0.0,-1.48,This guy is not nice
,,,,
1.0,1 (0.98),1.0,-1.65,This woman is beautiful
,,,,


In [43]:
from tqdm.notebook import tqdm


def chunks(data, chunk_size):
    """Lazily yield consecutive slices of `data`, each at most `chunk_size` long."""
    yield from (
        data[start : start + chunk_size]
        for start in range(0, len(data), chunk_size)
    )


def predict(data, model, tokenizer, chunk_size=64, return_confidence=False):
    """Classify every row of `data.text` in chunks.

    Args:
        data: Object with a `text` attribute that is iterable over strings
            (e.g. a DataFrame with a `text` column).
        model: Callable `model(input_ids, attention_mask)` returning logits
            with one row per example.
        tokenizer: Callable `tokenizer(texts, truncation=..., padding=...)`
            returning `input_ids` and `attention_mask` lists.
        chunk_size: Number of texts scored per forward pass.
        return_confidence: When True, also return the softmax probability of
            each predicted class (this flag used to be ignored).

    Returns:
        A list with one 0-d tensor (predicted class index) per text; when
        `return_confidence` is True, a `(predictions, confidences)` tuple of
        two such lists.
    """
    texts = list(data.text)
    # Ceiling division so the progress bar also counts a final partial chunk.
    n_chunks = (len(texts) + chunk_size - 1) // chunk_size
    predictions = []
    confidences = []
    # Inference only: skip autograd bookkeeping for every chunk.
    with torch.no_grad():
        for chunk in tqdm(chunks(texts, chunk_size), total=n_chunks):
            cleaned = [preprocess(example) for example in chunk]
            encoded = tokenizer(cleaned, truncation=True, padding=True)
            input_ids = torch.LongTensor(encoded["input_ids"])
            attention_mask = torch.LongTensor(encoded["attention_mask"])
            logits = model(input_ids, attention_mask)
            predictions.extend(logits.argmax(dim=1))
            if return_confidence:
                confidences.extend(torch.softmax(logits, dim=1).max(dim=1).values)
    if return_confidence:
        return predictions, confidences
    return predictions

In [None]:
# Reload the exported test split and classify every row with the fine-tuned model.
test = pd.read_csv("test.csv")
predict(data=test, model=retrained_model, tokenizer=tokenizer)

In [None]:
# Score two sample sentences with `model` and keep logit columns 0 and 2
# (presumably the negative and positive classes of a three-way sentiment
# head — TODO confirm).
# NOTE(review): the only visible assignment to `model` is commented out, so
# this cell presumably depends on leftover kernel state.
# encoded_input = tokenizer(["I hate you", "I love you"], return_tensors='pt', truncation=True, padding=True, max_length=128)
model(
    **tokenizer(
        ["I hate you", "I love you"],
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
).logits[:, [0, 2]]

In [292]:
from tqdm.notebook import tqdm

# Register `progress_apply` on pandas objects, then replace mentions and links
# in every tweet with placeholders (overwrites the `text` column in place).
tqdm.pandas()
data["text"] = data["text"].progress_apply(preprocess)

  0%|          | 0/1600000 [00:00<?, ?it/s]

In [304]:
# Tokenize the entire corpus in a single call: padded to the longest tweet,
# truncated at 128 tokens, returned as PyTorch tensors.
all_texts = data.text.to_list()
encoded_input = tokenizer(
    all_texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

In [None]:
# Score the first ten encoded tweets and keep logit columns 0 and 2.
# Both tensors must cover the same rows: the mask was previously cut to 2 rows
# while the ids kept 10, so the batch dimensions did not match.
model(
    input_ids=encoded_input["input_ids"][:10, :],
    attention_mask=encoded_input["attention_mask"][:10, :],
).logits[:, [0, 2]]

In [272]:
from datasets import ClassLabel, Dataset

# Inspect one collated batch built from the first rows of the frame.
# The batch is a dict of raw columns (strings included), which cannot be passed
# to the model as a positional argument, so display it instead of calling `model`.
next(
    iter(
        DataLoader(Dataset.from_pandas(data.head()), batch_size=256, shuffle=False)
    )
)

{'target': tensor([0, 0, 0, 0, 0]),
 'ids': tensor([1467810369, 1467810672, 1467810917, 1467811184, 1467811193]),
 'date': ['Mon Apr 06 22:19:45 PDT 2009',
  'Mon Apr 06 22:19:49 PDT 2009',
  'Mon Apr 06 22:19:53 PDT 2009',
  'Mon Apr 06 22:19:57 PDT 2009',
  'Mon Apr 06 22:19:57 PDT 2009'],
 'flag': ['NO_QUERY', 'NO_QUERY', 'NO_QUERY', 'NO_QUERY', 'NO_QUERY'],
 'user': ['_TheSpecialOne_', 'scotthamilton', 'mattycus', 'ElleCTF', 'Karoli'],
 'text': ["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
  "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
  '@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds',
  'my whole body feels itchy and like its on fire ',
  "@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. "]}