# "Spam classifier!"
> "Predict spam with a simple classifier built using AllenNLP"

- toc: true
- badges: False
- comments: true
- categories: [allennlp]
- hide: false

This is a spam classifier based on the simple spam classifier from an earlier post.

In [6]:
#hide
# Define DatasetReader
# DatasetReader
from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import LabelField, TextField


class ClassificationTsvReader(DatasetReader):
    """Read ``label<TAB>text`` lines into classification instances.

    Every instance carries a ``tokens`` TextField and, when a label is
    supplied, a ``label`` LabelField.
    """

    def __init__(
        self,
        tokenizer=None,
        token_indexers=None,
        **kwargs
    ):
        """Store the tokenizer and token indexers used to build instances.

        Args:
            tokenizer: Object exposing ``tokenize(text)``. Falls back to a
                ``WhitespaceTokenizer`` when omitted, so the reader is usable
                without arguments instead of failing on ``None.tokenize``.
            token_indexers: Mapping handed to every ``TextField``. Falls back
                to ``{"tokens": SingleIdTokenIndexer()}`` when omitted.
            **kwargs: Forwarded unchanged to ``DatasetReader.__init__``.
        """
        super().__init__(**kwargs)
        if tokenizer is None:
            # Local import: only needed for the fallback, and keeps the class
            # independent of where the rest of the file places its imports.
            from allennlp.data.tokenizers import WhitespaceTokenizer

            tokenizer = WhitespaceTokenizer()
        if token_indexers is None:
            from allennlp.data.token_indexers import SingleIdTokenIndexer

            token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers

    def _read(self, file_path):
        """Yield one Instance per non-blank ``label<TAB>text`` line.

        Args:
            file_path: Path of the UTF-8 encoded TSV file to read.

        Raises:
            ValueError: If a non-blank line contains no tab separator.
        """
        # Explicit encoding so the result does not depend on the platform's
        # default locale.
        with open(file_path, "r", encoding="utf-8") as lines:
            for line_number, raw_line in enumerate(lines, start=1):
                line = raw_line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline at end of file)
                    # carry no example; skip them instead of crashing.
                    continue

                # Split on the first tab only, so tabs inside the message
                # text do not break the two-way unpacking.
                label, separator, text = line.partition("\t")
                if not separator:
                    raise ValueError(
                        f"{file_path}:{line_number}: expected "
                        f"'label<TAB>text', got {line!r}"
                    )

                # Single construction path shared with prediction time.
                yield self.text_to_instance(text, label)

    def text_to_instance(self, text: str, label: str = None) -> Instance:
        """Build an Instance from raw text and an optional label.

        Args:
            text: The message to tokenize.
            label: Gold label; when missing or empty, the instance has no
                ``label`` field.

        Returns:
            An Instance with a ``tokens`` field and, if a label was given,
            a ``label`` field.
        """
        tokens = self.tokenizer.tokenize(text)
        fields = {'tokens': TextField(tokens, self.token_indexers)}
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)

# Reader components: tokenizer, token indexers, then the reader itself.
from allennlp.data.tokenizers import WhitespaceTokenizer
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Tokenizer
tokenizer = WhitespaceTokenizer()

# Indexer (registered under the "tokens" key)
token_indexers = {"tokens": SingleIdTokenIndexer()}

# Create DatasetReader from the two components above
dataset_reader = ClassificationTsvReader(
    token_indexers=token_indexers,
    tokenizer=tokenizer,
)

# Load model vocab
from allennlp.data.vocabulary import Vocabulary

# Directory with the trained model's files; the trailing slash matters
# because file names are appended by plain string concatenation.
model_dir = '../spam-classifier/simple-classifier-spam/'

vocab_dir = model_dir + "vocabulary"
vocab = Vocabulary.from_files(vocab_dir)

# Size of the "tokens" namespace, used to size the embedding table.
vocab_size = vocab.get_vocab_size("tokens")

# Define model class

# Token embedder
# The embedding width (10) is stated only here; the encoder below derives its
# input width from the embedder so the two cannot drift apart.
from allennlp.modules.token_embedders import Embedding
token_embedder = Embedding(embedding_dim=10, num_embeddings=vocab_size)

# Text-field embedder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
embedder = BasicTextFieldEmbedder({"tokens": token_embedder})

# Encoder
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
encoder = BagOfEmbeddingsEncoder(embedding_dim=embedder.get_output_dim())

# Model
from allennlp.models import BasicClassifier
model = BasicClassifier(
    vocab=vocab,
    text_field_embedder=embedder,
    seq2vec_encoder=encoder,
    namespace="tokens",
    label_namespace="labels",
)

# Load model weights
import torch

# map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
# machine; without it torch.load tries to restore tensors to their original
# device and fails when that device is unavailable.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
with open(model_dir + "model.th", 'rb') as weights_file:
    state_dict = torch.load(weights_file, map_location="cpu")
model.load_state_dict(state_dict)

# Compile predictor

# Predictor
from allennlp.predictors.text_classifier import TextClassifierPredictor
predictor = TextClassifierPredictor(model=model, dataset_reader=dataset_reader)

# Prepare user interface
import ipywidgets as widgets

# NOTE: this name shadows the builtin `input`; it is kept because the
# `interact(...)` call in the following cell refers to it.
input = widgets.Textarea(disabled=False, placeholder='Type something')
def f(text):
    """Return the ``'label'`` entry of the predictor's output for ``text``."""
    prediction = predictor.predict(text)
    return prediction['label']

from ipywidgets import interact

In [7]:
# Hook the text area up to f through ipywidgets' interact helper, passing
# the widget as f's `text` argument.
interact(f, text=input);

interactive(children=(Textarea(value='', description='text', placeholder='Type something'), Output()), _dom_cl…