In [1]:
import tempfile
from typing import Dict, Iterable, List, Tuple

import allennlp
import torch
from allennlp.data import DataLoader, DatasetReader, Instance, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import TokenIndexer, PretrainedTransformerIndexer
from allennlp.data.tokenizers import Token, Tokenizer, PretrainedTransformerTokenizer
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder, Seq2VecEncoder
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.seq2vec_encoders import BertPooler
from allennlp.nn import util
from allennlp.training.trainer import GradientDescentTrainer, Trainer
from allennlp.training.optimizers import AdamOptimizer
from allennlp.training.metrics import CategoricalAccuracy

from os.path import join as pathjoin
import pandas as pd
from allennlp.predictors import TextClassifierPredictor
from allennlp.training.metrics import CategoricalAccuracy
import numpy as np

In [2]:
DATA_DIR = '/home/mlepekhin/data'
MODELS_DIR = '/home/mlepekhin/models'
MODEL_ID = 'allennlp_rubert'
CHECKPOINTS_DIR = pathjoin(MODELS_DIR, MODEL_ID, 'checkpoints')
BEST_MODEL = pathjoin(CHECKPOINTS_DIR, 'best.th')

In [3]:
transformer_model = 'DeepPavlov/rubert-base-cased'
MAX_TOKENS = 510

In [4]:
class SimpleClassifier(Model):
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder):
        super().__init__(vocab)
        self.embedder = embedder 
        num_labels = vocab.get_vocab_size("labels")
        self.encoder = encoder
        self.classifier = torch.nn.Linear(encoder.get_output_dim(), num_labels)
        self.accuracy = CategoricalAccuracy()
        

    def forward(self,
                text: Dict[str, torch.Tensor],
                label: torch.Tensor=None) -> Dict[str, torch.Tensor]:
        # Shape: (batch_size, num_tokens, embedding_dim)
        embedded_text = self.embedder(text)
        #print("embed shape", embedded_text.shape)
        # Shape: (batch_size, num_tokens)
        mask = util.get_text_field_mask(text)
        #print("mask shape", mask.shape)
        # Shape: (batch_size, encoding_dim)
        encoded_text = self.encoder(embedded_text, mask)
        # Shape: (batch_size, num_labels)
        logits = self.classifier(encoded_text)
        # Shape: (batch_size, num_labels)
        probs = torch.nn.functional.softmax(logits, dim=-1)
        if label is not None:
            loss = torch.nn.functional.cross_entropy(logits, label)
            self.accuracy(logits, label)
            return {'loss': loss, 'probs': probs}
        else:
            return {'probs': probs}
    
    def get_metrics(self, reset: bool = True) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}


def read_data(reader: DatasetReader, train_path: str, val_path: str) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read(train_path)
    validation_data = reader.read(val_path)
    return training_data, validation_data


def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    print("Building the vocabulary")
    return Vocabulary.from_instances(instances)


def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    #embedder = BasicTextFieldEmbedder(
    #    {"tokens": Embedding(embedding_dim=10, num_embeddings=vocab_size)})
    embedding = PretrainedTransformerEmbedder(model_name=transformer_model)
    embedder = BasicTextFieldEmbedder(token_embedders={'bert_tokens': embedding})
    encoder = BertPooler(transformer_model)
    return SimpleClassifier(vocab, embedder, encoder)

def build_dataset_reader() -> DatasetReader:
    return ClassificationDatasetReader()

def run_training_loop():
    dataset_reader = build_dataset_reader()

    # These are a subclass of pytorch Datasets, with some allennlp-specific
    # functionality added.
    train_data, dev_data = read_data(dataset_reader)

    vocab = build_vocab(train_data + dev_data)
    model = build_model(vocab)

    # This is the allennlp-specific functionality in the Dataset object;
    # we need to be able convert strings in the data to integers, and this
    # is how we do it.
    train_data.index_with(vocab)
    dev_data.index_with(vocab)

    # These are again a subclass of pytorch DataLoaders, with an
    # allennlp-specific collate function, that runs our indexing and
    # batching code.
    train_loader, dev_loader = build_data_loaders(train_data, dev_data)

    # You obviously won't want to create a temporary file for your training
    # results, but for execution in binder for this course, we need to do this.
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = build_trainer(
            model,
            serialization_dir,
            train_loader,
            dev_loader
        )
        print("Starting training")
        trainer.train()
        print("Finished training")
    return trainer


# The other `build_*` methods are things we've seen before, so they are
# in the setup section above.
def build_data_loaders(
    train_data: torch.utils.data.Dataset,
    dev_data: torch.utils.data.Dataset,
) -> Tuple[allennlp.data.DataLoader, allennlp.data.DataLoader]:
    # Note that DataLoader is imported from allennlp above, *not* torch.
    # We need to get the allennlp-specific collate function, which is
    # what actually does indexing and batching.
    train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
    dev_loader = DataLoader(dev_data, batch_size=16, shuffle=False)
    return train_loader, dev_loader


def build_trainer(
    model: Model,
    serialization_dir: str,
    train_loader: DataLoader,
    dev_loader: DataLoader,
    num_epochs: int = 1,
    cuda_device: int = -1
) -> Trainer:
    parameters = [
        [n, p]
        for n, p in model.named_parameters() if p.requires_grad
    ]
    optimizer = AdamOptimizer(parameters, lr=0.000025)
    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=num_epochs,
        optimizer=optimizer,
        cuda_device=cuda_device,
    )
    return trainer

In [5]:
tokenizer = PretrainedTransformerTokenizer(model_name=transformer_model)
token_indexer = PretrainedTransformerIndexer(model_name=transformer_model)
text = "Hello! I's no more than just my opinion. But I wanted to say you that I don't care"
tokens = tokenizer.tokenize(text)
print("Transformer tokens:", tokens)

Transformer tokens: [[CLS], hel, ##lo, !, i, ', s, no, more, than, just, my, op, ##ini, ##on, ., but, i, want, ##ed, to, say, you, that, i, don, ', t, car, ##e, [SEP]]


In [6]:
from allennlp.interpret.saliency_interpreters import SmoothGradient

In [7]:
class ClassificationDatasetReader(DatasetReader):
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_tokens: int = None):
        super().__init__(lazy)
        self.tokenizer = tokenizer or PretrainedTransformerTokenizer(transformer_model, max_length=MAX_TOKENS)
        self.token_indexers = token_indexers or {'bert_tokens': PretrainedTransformerIndexer(transformer_model)}
        self.max_tokens = max_tokens
        
    def text_to_instance(self, string: str, label: str = None) -> Instance:
        tokens = self.tokenizer.tokenize(string)
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"text": sentence_field}
        if label is not None:
            fields["label"] = LabelField(label)
        return Instance(fields)

    def _read(self, file_path: str) -> Iterable[Instance]:
        dataset_df = pd.read_csv(file_path)
        for text, label in zip(dataset_df['text'], dataset_df['target']):
            yield self.text_to_instance(text, label)

In [8]:
class SimpleClassifier(Model):
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder):
        super().__init__(vocab)
        self.embedder = embedder 
        #num_labels = vocab.get_vocab_size("labels")
        num_labels=10
        self.encoder = encoder
        self.classifier = torch.nn.Linear(encoder.get_output_dim(), num_labels)
        print(encoder.get_output_dim(), num_labels)
        self.accuracy = CategoricalAccuracy()
        

    def forward(self,
                text: Dict[str, torch.Tensor],
                label: torch.Tensor=None) -> Dict[str, torch.Tensor]:
        # Shape: (batch_size, num_tokens, embedding_dim)
        embedded_text = self.embedder(text)
        #print("embed shape", embedded_text.shape)
        # Shape: (batch_size, num_tokens)
        mask = util.get_text_field_mask(text)
        #print("mask shape", mask.shape)
        # Shape: (batch_size, encoding_dim)
        encoded_text = self.encoder(embedded_text, mask)
        # Shape: (batch_size, num_labels)
        logits = self.classifier(encoded_text)
        # Shape: (batch_size, num_labels)
        probs = torch.nn.functional.softmax(logits, dim=-1)
        if label is not None:
            loss = torch.nn.functional.cross_entropy(logits, label)
            self.accuracy(logits, label)
            return {'loss': loss, 'probs': probs}
        else:
            return {'probs': probs}
    
    def get_metrics(self, reset: bool = True) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

### Loading of the pretrained model

In [9]:
vocab = Vocabulary()
vocab.from_files(pathjoin(MODELS_DIR, MODEL_ID, 'vocab'))

Vocabulary with namespaces:  tags, Size: 119547 || labels, Size: 10 || Non Padded Namespaces: {'*labels', '*tags'}

In [31]:
print(vocab.get_token_to_index_vocabulary('labels'))

{}


In [11]:
if torch.cuda.is_available():
    cuda_device = 0
else:
    cuda_device = -1
print(cuda_device)

0


In [12]:
model = build_model(vocab).cuda(cuda_device)

Building the model
768 10


In [13]:
model.load_state_dict(torch.load(BEST_MODEL, map_location=f'cuda:{cuda_device}'))

<All keys matched successfully>

### Data loading

In [14]:
ru_test_df = pd.read_csv(pathjoin(DATA_DIR, 'ru_test'))
ru_test_df.head()

Unnamed: 0.1,Unnamed: 0,target,text
0,726,A7,Глава 1 Приступая к работе 1.1 Знакомство с те...
1,1871,A17,Kawasaki D-Tracker С недавних пор Kawasaki d-t...
2,1265,A17,"По моему , вполне достойные книги , может и не..."
3,205,A11,Тест-драйв Lada Granta : новая надежда автогра...
4,141,A8,"среда , 2 декабря 2009 года , 12.33 Бумага всё..."


In [16]:
true_classes = ru_test_df.target.values
sentences = ru_test_df.text.values

### Preliminary analysis

In [32]:
label_description = {
    'A1': 'argum',
    'A3': 'emotive',
    'A4': 'fictive',
    'A5': 'flippant',
    'A6': 'informal',
    'A7': 'instruct',
    'A8': 'reporting',
    'A9': 'legal',
    'A11': 'personal',
    'A12': 'commercial',
    'A13': 'propaganda',
    'A14': 'research',
    'A15': 'specialist',
    'A16': 'info',
    'A17': 'eval',
    'A19': 'poetic',
    'A20': 'appeal',
    'A22': 'stuff'
}

In [18]:
predictor = TextClassifierPredictor(model, dataset_reader=build_dataset_reader())

In [19]:
def predict_classes(sentence_list):
    return [id_to_label[np.argmax(predictor.predict(sentence)['probs'])] for sentence in sentence_list]

In [78]:
predicted_classes = np.array(predict_classes(sentences))

In [79]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [80]:
for label, description in label_description.items():
    true_binary = true_classes == label
    if np.sum(true_binary) == 0:
        continue
    predicted_binary = predicted_classes == label
    print(
        f"label ({description})", 
        f"f1_score {f1_score(predicted_binary, true_binary)}", 
        f"precision {precision_score(predicted_binary, true_binary)}", 
        f"recall {recall_score(predicted_binary, true_binary)}", 
    )
print(f"accuracy", accuracy_score(predicted_classes, true_classes))

label (argum) f1_score 0.09411764705882353 precision 0.05194805194805195 recall 0.5
label (fictive) f1_score 0.6 precision 0.42857142857142855 recall 1.0
label (instruct) f1_score 0.3614457831325301 precision 0.3 recall 0.45454545454545453
label (reporting) f1_score 0.1 precision 0.05405405405405406 recall 0.6666666666666666
label (legal) f1_score 0.0 precision 0.0 recall 0.0
label (personal) f1_score 0.3076923076923077 precision 0.42105263157894735 recall 0.24242424242424243
label (commercial) f1_score 0.11267605633802819 precision 0.06060606060606061 recall 0.8
label (research) f1_score 0.13256484149855907 precision 1.0 recall 0.07098765432098765
label (info) f1_score 0.0 precision 0.0 recall 0.0
label (eval) f1_score 0.0 precision 0.0 recall 0.0
accuracy 0.16113744075829384


  _warn_prf(average, modifier, msg_start, len(result))


### Smooth Gradient Interpretation

In [30]:
smooth_grad_interpr = SmoothGradient(predictor)