In [6]:
import pandas as pd 
import spacy

In [4]:
# Load the labelled SMS dataset. A forward slash is used as the path separator:
# the previous "data\spam.csv" relied on "\s" being an unrecognised escape
# sequence (a warning on modern Python) and only resolved on Windows, whereas
# "data/spam.csv" works on every platform.
data = pd.read_csv("data/spam.csv")
# Preview the first three rows.
data.head(3)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


## Bag of Words

In [7]:
# Create a blank English pipeline (no components have been added to it yet).
nlp = spacy.blank("en")

# Add a text categorizer ("textcat") component to the pipeline and keep a
# reference to it so that the class labels can be registered on it.
textcat = nlp.add_pipe("textcat")

In [8]:
# Register the two target classes on the text categorizer.
# The value displayed below the cell is the return value of the last call.
textcat.add_label("ham")
textcat.add_label("spam")

1

## Training the text categorizer model

In [9]:
# Raw message texts, taken straight from the "text" column.
train_text = data["text"].values

# One annotation dict per row: under "cats", each class name maps to a boolean
# saying whether the row carries that label.
train_labels = [
    {"cats": {"ham": tag == "ham", "spam": tag == "spam"}}
    for tag in data["label"]
]

In [10]:
# Pair every text with its annotation dict as (text, annotations) tuples.
train_data = [
    (message, annotations)
    for message, annotations in zip(train_text, train_labels)
]
# Show the first three pairs as a sanity check.
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [13]:
from spacy.util import minibatch
from spacy.training.example import Example

# Fix spaCy's random seed so the run is reproducible.
spacy.util.fix_random_seed(1)
# spaCy v3: nlp.initialize() supersedes the deprecated nlp.begin_training().
# It initialises the pipeline and returns the optimizer used for updates.
optimizer = nlp.initialize()

# Generator yielding the training data in mini-batches of 8.
batches = minibatch(train_data, size = 8)

for batch in batches:
    # Each batch is a list of (text, annotations) pairs. Convert the whole
    # batch to Example objects and update once per batch; calling nlp.update
    # once per example would reduce the effective batch size to 1 and make the
    # mini-batching above pointless.
    examples = [Example.from_dict(nlp.make_doc(text), label)
                for text, label in batch]
    nlp.update(examples, sgd = optimizer)

In [None]:
import random

# Seed both Python's RNG (used for shuffling) and spaCy's RNG so the run is
# reproducible.
random.seed(1)
spacy.util.fix_random_seed(1)
# spaCy v3: nlp.initialize() supersedes the deprecated nlp.begin_training().
# NOTE: initialising again restarts training from fresh weights, so any
# updates applied to the pipeline before this cell are discarded.
optimizer = nlp.initialize()

# nlp.update adds each batch's loss into this dict, so after the loop it holds
# the running total over all epochs rather than the loss of the last epoch.
losses = {}
for epoch in range(20):
    # Present the examples in a different order on every epoch.
    random.shuffle(train_data)
    batches = minibatch(train_data, size = 8)
    for batch in batches:
        # One update per mini-batch (not per example) so the batch size of 8
        # is actually used.
        examples = [Example.from_dict(nlp.make_doc(text), label)
                    for text, label in batch]
        nlp.update(examples, sgd = optimizer, losses = losses)

In [None]:
# Show the training loss collected by nlp.update. print must be called as a
# function: the bare `print losses` statement is a SyntaxError on Python 3.
print(losses)

## Making predictions

In [None]:
# Two unseen messages to classify.
texts = [
    "Are you ready for the tea party????? It's gonna be wild",
    "URGENT Reply to this message for GUARANTEED FREE TEA",
]

# Tokenize each message into a Doc using the pipeline's tokenizer.
docs = list(map(nlp.tokenizer, texts))

# Fetch the text categorizer from the pipeline and score every Doc.
textcat = nlp.get_pipe('textcat')
scores = textcat.predict(docs)
print(scores)

In [None]:
# For every row of scores, take the index of the highest-scoring column ...
predicted_labels = scores.argmax(axis=1)

# ... and translate each index into the corresponding label name.
label_names = [textcat.labels[idx] for idx in predicted_labels]
print(label_names)