In [8]:
!pip3 install spacy

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [9]:
import json
import random
import spacy
from spacy.util import minibatch, compounding

Import spaCy, then download and load the French model.

In [10]:
# Fetch the small French pipeline package that spacy.load('fr_core_news_sm') needs.
!python3 -m spacy download fr_core_news_sm

Defaulting to user installation because normal site-packages is not writeable
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')


In [11]:
# Load the French pipeline and display the names of its components.
nlp = spacy.load("fr_core_news_sm")
nlp.pipe_names

['tagger', 'parser', 'ner']

Adding the built-in textcat component to the pipeline.


In [12]:
# Keep this cell re-runnable: adding a second component named "textcat" is an
# error, so create it only once and otherwise reuse the one already installed.
if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "simple_cnn"},
    )
    nlp.add_pipe(textcat, last=True)
else:
    textcat = nlp.get_pipe("textcat")
nlp.pipe_names

['tagger', 'parser', 'ner', 'textcat']

Adding the intent labels to the textcat component.

In [13]:
# Intent labels the classifier can predict, registered in this order.
INTENT_LABELS = (
    "find-train",
    "irrelevant",
    "find-flight",
    "find-restaurant",
    "purchase",
    "find-around-me",
    "provide-showtimes",
    "find-hotel",
)
for intent_label in INTENT_LABELS:
    textcat.add_label(intent_label)

1

In [15]:
def load_data(root='data/training_set.json', limit=0, split=0.8):
    """Load the intent dataset, shuffle it and split it into train/dev parts.

    The JSON file at ``root`` must hold a list of objects that each carry an
    ``"intent"`` and a ``"sentence"`` key.

    Args:
        root: Path of the JSON training file.
        limit: Maximum number of shuffled samples to keep. ``0`` (the default),
            ``None`` or a negative value keeps every sample.
        split: Fraction of the kept samples assigned to the training part.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))``. The texts are
        tuples of sentences; the cats are lists of ``{intent: bool}`` dicts in
        which only the sample's own intent is ``True``.
    """
    with open(root, encoding="utf8") as json_file:
        samples = json.load(json_file)
    # Collect the label set from the whole file so every category dict has the
    # same keys, even if `limit` drops all samples of some intent.
    list_intent = list({sample["intent"] for sample in samples})
    pairs = [(sample["intent"], sample["sentence"]) for sample in samples]
    # Shuffle the data, then optionally keep only the first `limit` samples
    random.shuffle(pairs)
    if limit and limit > 0:
        pairs = pairs[:limit]
    if not pairs:
        # zip(*[]) yields nothing to unpack, so return empty parts explicitly.
        return ((), []), ((), [])
    labels, texts = zip(*pairs)
    # Build the one-hot category dict of each sentence
    cats = []
    for true_intent in labels:
        one_hot = dict.fromkeys(list_intent, False)
        one_hot[true_intent] = True
        cats.append(one_hot)
    # Splitting the training and evaluation data
    cut = int(len(pairs) * split)
    return (texts[:cut], cats[:cut]), (texts[cut:], cats[cut:])

In [16]:
# Load the shuffled corpus, already split into training and evaluation parts
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()

# Pair every training sentence with its annotation dict {'cats': {...}},
# which is the (text, annotations) shape the training loop feeds to nlp.update
train_data = [
    (sentence, {'cats': sentence_cats})
    for sentence, sentence_cats in zip(train_texts, train_cats)
]
train_data[:5]

[("j'aimerais acheter des places pour le Stade Velodrome pour aller voir Marseille Lyon",
  {'cats': {'irrelevant': False,
    'purchase': True,
    'find-flight': False,
    'find-train': False,
    'provide-showtimes': False,
    'find-hotel': False,
    'find-around-me': False,
    'find-restaurant': False}}),
 ('Bonjour Idiwii, tu aurais des idées pour un anniversaire ?',
  {'cats': {'irrelevant': True,
    'purchase': False,
    'find-flight': False,
    'find-train': False,
    'provide-showtimes': False,
    'find-hotel': False,
    'find-around-me': False,
    'find-restaurant': False}}),
 ('Rappele le service de livraison stp',
  {'cats': {'irrelevant': True,
    'purchase': False,
    'find-flight': False,
    'find-train': False,
    'provide-showtimes': False,
    'find-hotel': False,
    'find-around-me': False,
    'find-restaurant': False}}),
 ('Je recherche un hôtel dans le 14ème pour 30e la nuit',
  {'cats': {'irrelevant': False,
    'purchase': False,
    'find-flight

In [17]:
def evaluate(tokenizer, textcat, texts, cats):
    """Score the text categorizer on labelled texts.

    Every text is tokenized, run through ``textcat.pipe`` and each predicted
    label score is compared with the gold value at a 0.5 threshold. Labels
    missing from a gold dict are ignored.

    Args:
        tokenizer: Callable turning one text into a doc.
        textcat: Object whose ``pipe(docs)`` yields docs exposing ``.cats``.
        texts: Iterable of raw texts.
        cats: Sequence of gold ``{label: value}`` dicts, aligned with ``texts``.

    Returns:
        Dict with the keys ``"textcat_p"``, ``"textcat_r"`` and ``"textcat_f"``
        (precision, recall and F-score).
    """
    # False positives/negatives start at a tiny epsilon so the precision and
    # recall denominators can never be zero.
    tally = {"tp": 0.0, "fp": 1e-8, "fn": 1e-8, "tn": 0.0}
    docs = (tokenizer(text) for text in texts)
    for position, doc in enumerate(textcat.pipe(docs)):
        gold = cats[position]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            truth = gold[label]
            if score >= 0.5:
                if truth >= 0.5:
                    tally["tp"] += 1.0
                elif truth < 0.5:
                    tally["fp"] += 1.0
            elif score < 0.5:
                if truth < 0.5:
                    tally["tn"] += 1
                elif truth >= 0.5:
                    tally["fn"] += 1
    precision = tally["tp"] / (tally["tp"] + tally["fp"])
    recall = tally["tp"] / (tally["tp"] + tally["fn"])
    total = precision + recall
    f_score = 0.0 if total == 0 else 2 * (precision * recall) / total
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [23]:
epochs = 100

# Disabling other components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()

    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

    # Performing training
    for i in range(epochs):
        losses = {}
        # Visit the examples in a fresh random order on every epoch so the
        # minibatches differ between passes; random.sample returns a shuffled
        # copy and leaves train_data itself untouched.
        epoch_data = random.sample(train_data, len(train_data))
        batches = minibatch(epoch_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

        # Calling the evaluate() function and printing the scores,
        # with the optimizer's averaged parameters swapped in
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))


Training the model...
LOSS 	  P  	  R  	  F  
0.790	0.841	0.838	0.839
0.360	0.841	0.838	0.839
0.421	0.849	0.841	0.845
0.378	0.852	0.847	0.850
0.369	0.849	0.845	0.847
0.303	0.852	0.848	0.850
0.362	0.851	0.849	0.850
0.364	0.848	0.844	0.846
0.255	0.857	0.852	0.854
0.313	0.854	0.848	0.851
0.213	0.848	0.846	0.847
0.298	0.854	0.850	0.852
0.141	0.860	0.853	0.856
0.209	0.862	0.856	0.859
0.222	0.858	0.855	0.856
0.154	0.856	0.854	0.855
0.085	0.860	0.858	0.859
0.157	0.855	0.853	0.854
0.046	0.854	0.852	0.853
0.305	0.857	0.852	0.854
0.241	0.853	0.853	0.853
0.179	0.851	0.849	0.850
0.252	0.850	0.848	0.849
0.136	0.849	0.846	0.847
0.197	0.848	0.845	0.846
0.114	0.848	0.848	0.848
0.137	0.847	0.847	0.847
0.164	0.849	0.848	0.848
0.112	0.850	0.848	0.849
0.077	0.855	0.852	0.853
0.189	0.853	0.849	0.851
0.090	0.854	0.853	0.853
0.091	0.854	0.851	0.852
0.238	0.859	0.855	0.857
0.142	0.856	0.855	0.856
0.098	0.856	0.854	0.855
0.089	0.860	0.857	0.858
0.133	0.858	0.855	0.856
0.089	0.857	0.856	0.857
0.222	0.858	0.856	

In [24]:
# Save the model with the optimizer's averaged parameters, i.e. the same
# weights that produced the P/R/F scores printed during training.
with nlp.use_params(optimizer.averages):
    nlp.to_disk('model_save')

In [25]:
# Testing the model: classify one sample sentence and display the score
# assigned to every intent label
test_text = "J'ai besoin d'un restaurant italien pour ce soir, nous serons 4 convives"
doc = nlp(test_text)
doc.cats

{'find-train': 3.590507865390514e-15,
 'irrelevant': 4.926199639182505e-12,
 'find-flight': 2.5922667455303287e-17,
 'find-restaurant': 1.0,
 'purchase': 3.831307233276293e-12,
 'find-around-me': 5.606108737226268e-17,
 'provide-showtimes': 3.2892091404479766e-15,
 'find-hotel': 1.8262026277044568e-13}