<a href="https://colab.research.google.com/github/JohnnySunkel/BlueSky/blob/master/Text_Classification_with_spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import random
import spacy
from spacy.util import minibatch

In [2]:
# Load the spam data ('ham'is the label for non-spam messages)
spam = pd.read_csv('drive/My Drive/spam.csv', encoding = 'latin-1')
spam.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [0]:
# Create an empty model
nlp = spacy.blank('en')

In [0]:
# Create a TextCategorizer with exclusive classes and 'bow' architecture
textcat = nlp.create_pipe('textcat',
                          config = {'exclusive_classes': True,
                                    'architecture': 'bow'})

In [0]:
# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

In [8]:
# Add labels to the text classifier
textcat.add_label('ham')
textcat.add_label('spam')

1

In [0]:
# Convert the labels in the data
train_texts = spam['text'].values
train_labels = [{'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}}
                for label in spam['label']]

In [10]:
# Combine the texts and labels in a single list
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [13]:
# Train the model
random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
  random.shuffle(train_data)
  # Create the batch generator with batch size = 8
  batches = minibatch(train_data, size = 8)
  # Iterate through minibatches
  for batch in batches:
    # Each batch is a list of (text, label) but we need to
    # send separate lists for texts and labels to update().
    # This is a quick way to split a list of tuples into lists
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd = optimizer, losses = losses)
  print(losses)

{'textcat': 1.3378591984510422}
{'textcat': 1.6737876627902324}
{'textcat': 1.862642121138876}
{'textcat': 1.9832827621648583}
{'textcat': 2.0642554799028936}
{'textcat': 2.116313516886164}
{'textcat': 2.152848785518172}
{'textcat': 2.177383209271007}
{'textcat': 2.1960814647508977}
{'textcat': 2.2100412751516685}


In [0]:
# Make predictions
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA"]
docs = [nlp.tokenizer(text) for text in texts] 

In [17]:
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)
print(scores)

[[9.9993277e-01 6.7249312e-05]
 [1.8681869e-02 9.8131818e-01]]


In [20]:
# From the scores, find the label with the highest probability
predicted_labels = scores.argmax(axis = 1)
print([textcat.labels[label] for label in predicted_labels])

['ham', 'spam']
