https://www.machinelearningplus.com/nlp/custom-text-classification-spacy/

https://www.kaggle.com/poonaml/text-classification-using-spacy#SpaCy-Text-Categorizer

In [12]:
!pip3 install spacy

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import json
import random
import spacy
from spacy.util import minibatch, compounding


Import spaCy ,load model

In [13]:
!python3 -m spacy download fr_core_news_sm

Defaulting to user installation because normal site-packages is not writeable
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')


In [2]:
nlp = spacy.load('fr_core_news_sm')
nlp.pipe_names

['tagger', 'parser', 'ner']

Adding the built-in textcat component to the pipeline.


In [3]:
textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"})
nlp.add_pipe(textcat, last=True)
nlp.pipe_names

['tagger', 'parser', 'ner', 'textcat']

Adding the labels to textcat

In [4]:
textcat.add_label("find-train")
textcat.add_label("irrelevant")
textcat.add_label("find-flight")
textcat.add_label("find-restaurant")
textcat.add_label("purchase")
textcat.add_label("find-around-me")
textcat.add_label("provide-showtimes")
textcat.add_label("find-hotel")

1

In [5]:
def load_data(limit=0, split=0.8):
    with open("training_set.json") as json_file:
        train_data = json.load(json_file)
    # Shuffle the data
    list_intent = list({sample["intent"] for sample in train_data})
    train_data = [(sample["intent"], sample["sentence"]) for sample in train_data]
    random.shuffle(train_data)
    labels, texts = zip(*train_data)
    # get the categories for each review
    cats = []
    for true_intent in labels:
        tmp_cat = {intent_label : False for intent_label in list_intent}
        tmp_cat[true_intent] = True
        cats.append(tmp_cat)
    # Splitting the training and evaluation data
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

In [10]:
# Calling the load_data() function 
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()

# Processing the final format of training data
train_data = list(zip(train_texts,[{'cats': cats} for cats in train_cats]))
train_data[:5]

[("J'ai les 30 ans de mon copain ce week-end et j'ai complètement oublié d'acheter de la déco. Tu peux me trouver un kit anniversaire 30 ans sur Amazon. Merciii",
  {'cats': {'find-around-me': False,
    'find-flight': False,
    'irrelevant': False,
    'find-restaurant': False,
    'find-train': False,
    'purchase': True,
    'provide-showtimes': False,
    'find-hotel': False}}),
 ('Et pendant les semaines 27 et 28 tu peux chercher',
  {'cats': {'find-around-me': False,
    'find-flight': False,
    'irrelevant': True,
    'find-restaurant': False,
    'find-train': False,
    'purchase': False,
    'provide-showtimes': False,
    'find-hotel': False}}),
 ('Le lundi quel est le programme',
  {'cats': {'find-around-me': False,
    'find-flight': False,
    'irrelevant': True,
    'find-restaurant': False,
    'find-train': False,
    'purchase': False,
    'provide-showtimes': False,
    'find-hotel': False}}),
 ('Comme je m’y prend au dernier moment je vais devoir aller en boutiqu

In [7]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [8]:
n_iter = 10

In [9]:

# Disabling other components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()

    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

    # Performing training
    for i in range(n_iter):
        losses = {}
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

      # Calling the evaluate() function and printing the scores
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

Training the model...
LOSS 	  P  	  R  	  F  
14.180	0.851	0.694	0.765
8.929	0.845	0.780	0.811
5.242	0.841	0.809	0.825
3.176	0.844	0.817	0.830
2.620	0.838	0.819	0.829
1.668	0.833	0.821	0.827
1.528	0.838	0.821	0.829
1.101	0.830	0.819	0.825
1.027	0.835	0.826	0.830
1.123	0.836	0.827	0.831


In [11]:
# Testing the model
test_text="J'ai besoin d'un restaurant italien pour ce soir, nous serons 4 convives"
doc=nlp(test_text)
doc.cats 

{'find-train': 1.2674707117188433e-10,
 'irrelevant': 9.379066341352882e-07,
 'find-flight': 1.9730082012614503e-09,
 'find-restaurant': 0.999987006187439,
 'purchase': 3.284701961092651e-06,
 'find-around-me': 1.4785069835099307e-09,
 'provide-showtimes': 1.1706024594104747e-08,
 'find-hotel': 8.780206371739041e-06}