A common task in NLP is text classification. This is "classification" in the conventional machine learning sense, and it is applied to text. Examples include spam detection, sentiment analysis, and tagging customer queries.

In [1]:
pip install spacy




In [2]:
import pandas as pd 
import spacy

In [3]:
txt_class = pd.read_csv('spam.csv')
txt_class.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
nlp = spacy.blank('en')# extract info
txt_cat = nlp.add_pipe("textcat")


In [5]:
# labels
txt_cat.add_label('ham')


1

In [6]:
txt_cat.add_label('spam')

1

## TRAINING TEXT MODEL 

In [7]:
train_txt = txt_class['text'].values
train_txt

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       ..., 'Pity, * was in mood for that. So...any other suggestions?',
       "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
       'Rofl. Its true to its name'], dtype=object)

In [8]:
train_lbl = [{'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}} 
                for label in txt_class['label']]

In [9]:
train_lbl

[{'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': False, 'spam': True}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': False, 'spam': True}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': False, 'spam': True}},
 {'cats': {'ham': False, 'spam': True}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': False, 'spam': True}},
 {'cats': {'ham': False, 'spam': True}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': False, 'spam': True}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': False, 'spam': True}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham': True, 'spam': False}},
 {'cats': {'ham'

In [10]:
# combine text and label in single list 
train_list = list(zip(train_txt, train_lbl))
train_list[:5]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}}),
 ('U dun say so early hor... U c already then say...',
  {'cats': {'ham': True, 'spam': False}}),
 ("Nah I don't think he goes to usf, he lives around here though",
  {'cats': {'ham': True, 'spam': False}})]

### Now lets train the model 
- Its more efficient to train model in batches
- Spacy gives us a func minibatch to train model in batches 
- minibatch splits into texts and labels

In [11]:
from spacy.util import minibatch
from spacy .training.example import Example

In [12]:
spacy.util.fix_random_seed(1)
opt = nlp.begin_training

In [13]:
from spacy.util import minibatch
from spacy.training.example import Example

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

# Create the batch generator with batch size = 8
batches = minibatch(train_list, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) 
    for text, labels in batch:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, labels)
        nlp.update([example], sgd=optimizer)

The model will typically need multiple epochs. Use another loop for more epochs

In [14]:
import random 
random.seed(1)
spacy.util.fix_random_seed(1)
opt = nlp.begin_training()

In [15]:
losses = {}

for epoch in range(10):
    random.shuffle(train_list)
    batches = minibatch(train_list, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) 
    for text, labels in batch:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, labels)
        nlp.update([example], sgd=optimizer, losses=losses)


## PREDICTION

Now model is trained  and its time to predict.

In [17]:
txt_class['text'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [27]:
txt = ['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']
docs = [nlp.tokenizer(text) for text in txt]
txt_cat = nlp.get_pipe('textcat')
scor = txt_cat.predict(docs)

In [28]:
scor

array([[9.9998748e-01, 1.2460619e-05]], dtype=float32)

In [31]:
# From the scores, find the label with the highest score/probability
predicted_labels = scor.argmax(axis=1)
print([txt_cat.labels[label] for label in predicted_labels])

['ham']
