https://www.machinelearningplus.com/nlp/custom-text-classification-spacy/

https://www.kaggle.com/poonaml/text-classification-using-spacy#SpaCy-Text-Categorizer

In [1]:
# !pip3 install spacy

In [2]:
import json
import random
import spacy
from spacy.util import minibatch, compounding

Import spaCy and load the model

In [3]:
#!python -m spacy download fr_core_news_sm

In [4]:
# Load the `fr_core_news_sm` spaCy model; the package must already be installed
# (see the commented-out download command in the previous cell).
nlp = spacy.load('fr_core_news_sm')
# Show the names of the pipeline components shipped with the loaded model.
nlp.pipe_names

['tagger', 'parser', 'ner']

Adding the built-in textcat component to the pipeline.


In [5]:
# Attach spaCy's built-in text categorizer as the last pipeline component.
# `add_pipe` raises if a component named "textcat" is already registered, so
# reuse the existing component when this cell is re-run on a live kernel.
if "textcat" in nlp.pipe_names:
    textcat = nlp.get_pipe("textcat")
else:
    # exclusive_classes=True: labels are mutually exclusive (one intent per text).
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "simple_cnn"},
    )
    nlp.add_pipe(textcat, last=True)
nlp.pipe_names

['tagger', 'parser', 'ner', 'textcat']

Adding the labels to textcat

In [6]:
# Intent labels the categorizer must be able to predict, registered in this order.
INTENT_LABELS = (
    "find-train",
    "irrelevant",
    "find-flight",
    "find-restaurant",
    "purchase",
    "find-around-me",
    "provide-showtimes",
    "find-hotel",
)

# Register every label on the textcat component (one call per label).
for intent_label in INTENT_LABELS:
    textcat.add_label(intent_label)

1

In [9]:
def load_data(root='data/training_set.json', limit=0, split=0.8, seed=None):
    """Load intent-labelled sentences from a JSON file and split them into train/dev sets.

    The file must contain a JSON list of objects, each with an "intent" key
    and a "sentence" key.

    Args:
        root: Path of the JSON file to read (decoded as UTF-8).
        limit: Maximum number of shuffled samples to keep; 0 (the default)
            keeps every sample.
        split: Fraction of the kept samples assigned to the training partition.
        seed: Optional seed for a private random generator so that the shuffle,
            and therefore the split, is reproducible. None shuffles with the
            global `random` state.

    Returns:
        ((train_texts, train_cats), (dev_texts, dev_cats)). Texts are tuples of
        sentences; cats are lists of {intent: bool} dicts in which only the
        sample's own intent is True. Both partitions are empty when the file
        holds no samples.
    """
    with open(root, encoding="utf8") as json_file:
        samples = json.load(json_file)

    # Collect the label set from the whole file (before any limit is applied)
    # so every sample's dict has the same keys; sorted for a stable key order.
    list_intent = sorted({sample["intent"] for sample in samples})
    train_data = [(sample["intent"], sample["sentence"]) for sample in samples]

    # Shuffle the data
    if seed is None:
        random.shuffle(train_data)
    else:
        random.Random(seed).shuffle(train_data)

    # Optionally keep only the first `limit` shuffled samples
    if limit > 0:
        train_data = train_data[:limit]

    # zip(*[]) cannot be unpacked into two names, so handle "no data" explicitly
    if not train_data:
        return ((), []), ((), [])

    labels, texts = zip(*train_data)

    # Build the one-hot category dict for each sentence
    cats = [
        {intent_label: intent_label == true_intent for intent_label in list_intent}
        for true_intent in labels
    ]

    # Splitting the training and evaluation data
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

In [10]:
# Fetch the shuffled corpus, already partitioned into training and dev sets
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()

# Pair every training sentence with its gold annotation dict: (text, {'cats': {...}})
train_data = [
    (text, {'cats': gold_cats})
    for text, gold_cats in zip(train_texts, train_cats)
]
train_data[:5]

[("Le Clos d'André, Montpezat",
  {'cats': {'purchase': False,
    'find-around-me': False,
    'find-restaurant': False,
    'irrelevant': True,
    'find-train': False,
    'provide-showtimes': False,
    'find-flight': False,
    'find-hotel': False}}),
 ('Non et pour le resto?',
  {'cats': {'purchase': False,
    'find-around-me': False,
    'find-restaurant': False,
    'irrelevant': True,
    'find-train': False,
    'provide-showtimes': False,
    'find-flight': False,
    'find-hotel': False}}),
 ("Nous devons faire la carte d'identité de notre petite fille ce sera sa première carte d'identité elle n'a que neuf mois",
  {'cats': {'purchase': False,
    'find-around-me': False,
    'find-restaurant': False,
    'irrelevant': True,
    'find-train': False,
    'provide-showtimes': False,
    'find-flight': False,
    'find-hotel': False}}),
 ('Je peux payer immédiatement',
  {'cats': {'purchase': False,
    'find-around-me': False,
    'find-restaurant': False,
    'irrelevant': 

In [11]:
def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F-score of a text categorizer.

    Each text is passed through `tokenizer`, the resulting docs are streamed
    through `textcat.pipe`, and every (label, score) pair in `doc.cats` is
    compared against the gold value at a 0.5 threshold. Labels missing from the
    gold dict are skipped.

    Args:
        tokenizer: Callable turning one text into a doc.
        textcat: Object whose `pipe(docs)` yields docs exposing a `cats` dict.
        texts: Iterable of raw texts.
        cats: Sequence of gold {label: value} dicts, indexed in step with `texts`.

    Returns:
        Dict with the keys "textcat_p", "textcat_r" and "textcat_f".
    """
    # False positives/negatives start at a tiny epsilon so the precision and
    # recall divisions below never divide by zero.
    true_pos, true_neg = 0.0, 0.0
    false_pos, false_neg = 1e-8, 1e-8

    tokenized = (tokenizer(text) for text in texts)
    for index, doc in enumerate(textcat.pipe(tokenized)):
        gold = cats[index]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            truth = gold[label]
            if score >= 0.5:
                if truth >= 0.5:
                    true_pos += 1.0
                elif truth < 0.5:
                    false_pos += 1.0
            elif score < 0.5:
                if truth < 0.5:
                    true_neg += 1
                elif truth >= 0.5:
                    false_neg += 1

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [13]:
# Number of full passes over the training data
epochs = 12

# Disabling other components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()

    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

    # Performing training
    for i in range(epochs):
        losses = {}  # nlp.update accumulates the per-component loss here
        # Batch size is driven by the compounding(4., 32., 1.001) schedule
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

        # Calling the evaluate() function and printing the scores,
        # with the optimizer's averaged parameters swapped in
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

# Save with the same averaged parameters that were used for evaluation, so the
# model on disk matches the scores printed above.
with nlp.use_params(optimizer.averages):
    nlp.to_disk('model_save')

Training the model...
LOSS 	  P  	  R  	  F  
14.307	0.851	0.660	0.743
8.988	0.861	0.771	0.814
5.247	0.874	0.828	0.850
3.243	0.859	0.836	0.848
2.584	0.852	0.831	0.841
1.624	0.851	0.835	0.843
1.433	0.855	0.838	0.847
1.317	0.859	0.846	0.852
1.414	0.852	0.842	0.847
0.926	0.852	0.843	0.847
0.843	0.848	0.838	0.843
0.735	0.842	0.833	0.837


KeyboardInterrupt: 

In [15]:
# Run one sample sentence through the pipeline and display the score of each label
test_text = "J'ai besoin d'un restaurant italien pour ce soir, nous serons 4 convives"
doc = nlp(test_text)
doc.cats

{'find-train': 5.353676876818092e-11,
 'irrelevant': 0.0007451469427905977,
 'find-flight': 2.1033903507827745e-08,
 'find-restaurant': 0.9991434812545776,
 'purchase': 2.068640725383375e-08,
 'find-around-me': 3.1434039815536607e-09,
 'provide-showtimes': 1.4067862430522382e-08,
 'find-hotel': 0.00011135380918858573}