# "Simple spam classifier"
> "Simple spam classifier using allennlp"

- toc: true
- badges: False
- comments: true
- categories: [allennlp]
- hide: false

This is an as-simple-as-it-gets spam classifier, implemented using AllenNLP 2.4.0.

Dataset: https://www.kaggle.com/team-ai/spam-text-message-classification

In [1]:
#hide
!pip install allennlp



In [2]:
# DatasetReader
from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import LabelField, TextField


class ClassificationTsvReader(DatasetReader):
    """Dataset reader for a two-column TSV file: ``label<TAB>text`` per line.

    Each non-blank line becomes one ``Instance`` with a ``"tokens"`` text
    field and a ``"label"`` label field.

    Parameters
    ----------
    tokenizer:
        Object with a ``tokenize(text)`` method. When ``None``, a
        ``WhitespaceTokenizer`` is created.
    token_indexers:
        Dict of token indexers handed to every ``TextField``. When ``None``,
        ``{"tokens": SingleIdTokenIndexer()}`` is used.
    **kwargs:
        Forwarded unchanged to ``DatasetReader.__init__``.
    """

    def __init__(
        self,
        tokenizer=None,
        token_indexers=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        # Fall back to usable defaults so the reader works when constructed
        # without arguments instead of failing later on ``None.tokenize``.
        if tokenizer is None:
            from allennlp.data.tokenizers import WhitespaceTokenizer
            tokenizer = WhitespaceTokenizer()
        if token_indexers is None:
            from allennlp.data.token_indexers import SingleIdTokenIndexer
            token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers

    def _read(self, file_path):
        """Yield one labelled ``Instance`` per non-blank line of ``file_path``.

        Raises
        ------
        ValueError
            If a non-blank line contains no tab separator.
        """
        with open(file_path, "r", encoding="utf-8") as lines:
            for line in lines:
                line = line.strip()
                # Skip blank lines (e.g. a trailing newline at end of file).
                if not line:
                    continue
                # Split on the first tab only, so a tab inside the message
                # text does not break the two-way unpacking.
                label, text = line.split("\t", 1)
                # Single code path for building instances, shared with
                # prediction-time calls to text_to_instance.
                yield self.text_to_instance(text, label)

    def text_to_instance(self, text: str, label: str = None) -> Instance:
        """Build an ``Instance`` from raw text and an optional label.

        The ``"label"`` field is only added when ``label`` is truthy, so the
        same method serves both training (labelled) and prediction
        (unlabelled) inputs.
        """
        tokens = self.tokenizer.tokenize(text)
        text_field = TextField(tokens, self.token_indexers)
        fields = {'tokens': text_field}
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)

In [3]:
# Tokenizer
# Passed to the dataset reader, which calls tokenizer.tokenize(text) on every
# message to turn the raw string into a list of tokens.
from allennlp.data.tokenizers import WhitespaceTokenizer
tokenizer = WhitespaceTokenizer()

In [4]:
# Indexer
# Dict of token indexers handed to every TextField built by the dataset reader.
# The key "tokens" is the same key used for the token embedder in the
# BasicTextFieldEmbedder further down.
from allennlp.data.token_indexers import SingleIdTokenIndexer
token_indexers = {"tokens": SingleIdTokenIndexer()}

In [6]:
# Reading instances
# Build the reader from the tokenizer and indexers defined above, then
# materialise both splits into in-memory lists of instances.

dataset_reader = ClassificationTsvReader(tokenizer=tokenizer, token_indexers=token_indexers)

train_file, dev_file = 'data/spam-train.tsv', 'data/spam-dev.tsv'

train_instances = [instance for instance in dataset_reader.read(train_file)]
dev_instances = [instance for instance in dataset_reader.read(dev_file)]

In [7]:
# Vocabulary
from allennlp.data.vocabulary import Vocabulary

# The vocabulary is built from the training AND dev instances together.
# NOTE(review): this gives dev-only tokens their own ids; building from
# train_instances alone is the more common setup — confirm this is intended.
vocab = Vocabulary.from_instances(train_instances + dev_instances)
# Size of the "tokens" namespace; used below as the number of embedding rows.
vocab_size = vocab.get_vocab_size("tokens")

HBox(children=(FloatProgress(value=0.0, description='building vocab', max=5577.0, style=ProgressStyle(descript…




In [8]:
# Token embedder
# Embedding table with vocab_size rows (one per entry in the "tokens"
# vocabulary namespace) and 10 dimensions per row.
from allennlp.modules.token_embedders import Embedding
token_embedder=Embedding(embedding_dim=10, num_embeddings=vocab_size)

In [9]:
# Text-field embedder
# Registers the token embedder under the key "tokens", the same key used in
# the token_indexers dict given to the dataset reader.
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
embedder = BasicTextFieldEmbedder({"tokens": token_embedder})

In [10]:
# Encoder
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder

# Take the input size from the text-field embedder rather than repeating the
# literal embedding dimension, so the two cannot drift apart when the
# embedding size is changed.
encoder = BagOfEmbeddingsEncoder(embedding_dim=embedder.get_output_dim())

In [11]:
# Model
from allennlp.models import BasicClassifier

# Wire the vocabulary, the text-field embedder and the seq2vec encoder into a
# BasicClassifier; tokens use the "tokens" namespace, labels use "labels".
model = BasicClassifier(
    vocab=vocab,
    namespace="tokens",
    label_namespace="labels",
    text_field_embedder=embedder,
    seq2vec_encoder=encoder,
)

In [12]:
# DataLoaders
from allennlp.data.data_loaders import SimpleDataLoader

# Training batches are reshuffled so the model does not see the examples in
# file order.
train_data_loader = SimpleDataLoader(
    instances=train_instances,
    batch_size=16,
    shuffle=True,
    vocab=vocab,
)

# Validation only measures the model, so shuffling is unnecessary; keeping a
# fixed order makes evaluation batches deterministic between runs.
dev_data_loader = SimpleDataLoader(
    instances=dev_instances,
    batch_size=16,
    shuffle=False,
    vocab=vocab,
)

In [13]:
# Trainer
from torch.optim import Adam
from allennlp.training import GradientDescentTrainer

# Adam with its default hyper-parameters over all model parameters; training
# uses the train loader and validates on the dev loader.
# num_epochs=20 is the upper bound; patience=3 is presumably the early-stopping
# window on the validation metric — confirm against the trainer documentation.
trainer = GradientDescentTrainer(
    model=model,
    optimizer=Adam(model.parameters()),
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    num_epochs=20,
    patience=3,
)

In [14]:
# Preparing GPU training
import torch

# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# The model and both data loaders are pointed at the same device.
model.to(device)
train_data_loader.set_target_device(device)
dev_data_loader.set_target_device(device)

In [18]:
# Train the model
# %time reports the CPU and wall-clock time of the call; the value returned by
# trainer.train() is displayed as the cell output.
%time trainer.train()

HBox(children=(FloatProgress(value=0.0, max=279.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))

CPU times: user 1.98 s, sys: 105 ms, total: 2.09 s
Wall time: 2.8 s





{'best_epoch': 4,
 'best_validation_accuracy': 0.9928443649373881,
 'best_validation_loss': 0.0313081862130535,
 'peak_worker_0_memory_MB': 256.30078125,
 'training_duration': '0:00:02.795020',
 'training_start_epoch': 0,
 'training_epochs': 0,
 'epoch': 0,
 'training_accuracy': 1.0,
 'training_loss': 0.001972834501566956,
 'training_worker_0_memory_MB': 256.30078125,
 'validation_accuracy': 0.9928443649373881,
 'validation_loss': 0.03381697619708055}

In [17]:
# Predictor
# Wraps the model together with the dataset reader defined above so that
# predictions can be requested directly from raw text.
from allennlp.predictors.text_classifier import TextClassifierPredictor
predictor = TextClassifierPredictor(model=model, dataset_reader=dataset_reader)

In [19]:
# Test predictor
# Smoke test on a short message; the result is displayed as the cell output.
predictor.predict("hello world")

{'logits': [1.1957952976226807, -1.0361652374267578],
 'probs': [0.903083086013794, 0.09691690653562546],
 'token_ids': [3614, 618],
 'label': 'ham',
 'tokens': ['hello', 'world']}

In [51]:
# Save model
import os

model_dir = 'simple-classifier-spam/'

# Create the output directory up front: open() does not create missing parent
# directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(model_dir, exist_ok=True)

# Model weights only (state dict).
with open(os.path.join(model_dir, "model.th"), 'wb') as f:
    torch.save(model.state_dict(), f)

# The vocabulary is saved next to the weights.
vocab.save_to_files(os.path.join(model_dir, "vocabulary"))