In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pickle

from data_processing import *

from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.metrics import multilabel_confusion_matrix, balanced_accuracy_score
from sklearn.utils import class_weight

from keras.models import Sequential, load_model
from keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout, BatchNormalization, LSTM, InputLayer, Flatten
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MathildeElimas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MathildeElimas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MathildeElimas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using TensorFlow backend.


In [2]:
PATH = os.getcwd() + '\\data'
method = 'count'

In [3]:
# intent, unique_intent, sentences = load_data(PATH + '\\training_recast2.csv', 
#                                              sep = ';', header = 1, encoding = "cp1252", 
#                                              names = ["Sentence", "Intent", "Description"], index_col= False)

intent, unique_intent, sentences = load_data(PATH + '\\intent_dialogues.csv')

                                            Sentence           Intent
0  Bonjour j'ai rendu mon équipement box est je n...  probleme_box_tv
1                                            Bonjour  probleme_box_tv
2                              Box internet en panne  probleme_box_tv
3  Plus de phone ni internet depuis plus ou moins...  probleme_box_tv
4  Bonjour j'ai la télécommande de ma box qui ne ...  probleme_box_tv


#### Text embedding

In [4]:
idx, cleaned_words = clean_sentences(sentences)
intent = list(map(intent.__getitem__, idx))

2020-10-27 15:21:04,874 - spacy_lefff.lefff - INFO - New LefffLemmatizer instantiated.
2020-10-27 15:21:04,875 - spacy_lefff.lefff - INFO - Reading lefff data...
2020-10-27 15:21:05,273 - spacy_lefff.lefff - INFO - Successfully loaded lefff lemmatizer


In [5]:
vocab = create_vocabulary(cleaned_words)

2020-10-27 15:21:31,687 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.


In [6]:
encoded_doc = text_embedding(cleaned_words, vocab = list(vocab.keys()), encoding = 'word2vec')

In [7]:
max_length = len(max(cleaned_words, key = len))
# vocab_size = encoded_doc.shape[1]
# print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

In [8]:
encoded_doc.shape

(2855, 34, 300)

#### Intent embedding

In [9]:
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')
output_tokenizer.word_index

{'probleme_box_tv': 1, 'reseau_mobile': 2, 'explication_facture': 3}

In [10]:
intent_encoder, encoded_output = intent_embedding(intent)

In [11]:
encoded_output

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [12]:
intents = intent_encoder.categories_

In [13]:
intents

[array(['explication_facture', 'probleme_box_tv', 'reseau_mobile'],
       dtype='<U19')]

### Neural Network

#### Train/test

In [14]:
train_X, val_X, train_Y, val_Y = train_test_split(encoded_doc, encoded_output, test_size = 0.2)

In [15]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (2284, 34, 300) and train_Y = (2284, 3)
Shape of val_X = (571, 34, 300) and val_Y = (571, 3)


#### Model definition and training

In [16]:
def create_model():
    model = Sequential()

    model.add(InputLayer(input_shape = encoded_doc.shape[1:]))
#     model.add(Embedding(vocab_size, 64))
#    model.add(Bidirectional(LSTM(128))) 
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.2))
#     model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(len(unique_intent), activation = "softmax")) 

    return model

In [17]:
model = create_model()

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["categorical_accuracy"])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 34, 64)            19264     
_________________________________________________________________
dropout_1 (Dropout)          (None, 34, 64)            0         
_________________________________________________________________
dense_2 (Dense)              (None, 34, 64)            4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 34, 64)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2176)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 6531      
Total params: 29,955
Trainable params: 29,955
Non-trainable params: 0
__________________________________________________

In [18]:
train_X.shape

(2284, 34, 300)

In [19]:
labels = [np.argmax(x) for x in train_Y]
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(labels),
                                                 labels)

In [20]:
filename = os.getcwd() + '/model.h5'
checkpoint = ModelCheckpoint(filename, monitor='categorical_accuracy', 
                             verbose=1, save_best_only=True, 
                             mode='max')

hist = model.fit(train_X, train_Y, #class_weight = class_weights,
                 epochs = 50, batch_size = 32, 
                 callbacks = [checkpoint])

Epoch 1/50

Epoch 00001: categorical_accuracy improved from -inf to 0.48030, saving model to C:\Users\MathildeElimas\OneDrive - Datatorii\Documents\BYTEL\code\intent_classification/model.h5
Epoch 2/50

Epoch 00002: categorical_accuracy improved from 0.48030 to 0.66287, saving model to C:\Users\MathildeElimas\OneDrive - Datatorii\Documents\BYTEL\code\intent_classification/model.h5
Epoch 3/50

Epoch 00003: categorical_accuracy improved from 0.66287 to 0.72723, saving model to C:\Users\MathildeElimas\OneDrive - Datatorii\Documents\BYTEL\code\intent_classification/model.h5
Epoch 4/50

Epoch 00004: categorical_accuracy improved from 0.72723 to 0.74869, saving model to C:\Users\MathildeElimas\OneDrive - Datatorii\Documents\BYTEL\code\intent_classification/model.h5
Epoch 5/50

Epoch 00005: categorical_accuracy improved from 0.74869 to 0.76970, saving model to C:\Users\MathildeElimas\OneDrive - Datatorii\Documents\BYTEL\code\intent_classification/model.h5
Epoch 6/50

Epoch 00006: categorical_a


Epoch 00031: categorical_accuracy improved from 0.88004 to 0.88398, saving model to C:\Users\MathildeElimas\OneDrive - Datatorii\Documents\BYTEL\code\intent_classification/model.h5
Epoch 32/50

Epoch 00032: categorical_accuracy did not improve from 0.88398
Epoch 33/50

Epoch 00033: categorical_accuracy did not improve from 0.88398
Epoch 34/50

Epoch 00034: categorical_accuracy improved from 0.88398 to 0.88573, saving model to C:\Users\MathildeElimas\OneDrive - Datatorii\Documents\BYTEL\code\intent_classification/model.h5
Epoch 35/50

Epoch 00035: categorical_accuracy improved from 0.88573 to 0.89317, saving model to C:\Users\MathildeElimas\OneDrive - Datatorii\Documents\BYTEL\code\intent_classification/model.h5
Epoch 36/50

Epoch 00036: categorical_accuracy did not improve from 0.89317
Epoch 37/50

Epoch 00037: categorical_accuracy did not improve from 0.89317
Epoch 38/50

Epoch 00038: categorical_accuracy did not improve from 0.89317
Epoch 39/50

Epoch 00039: categorical_accuracy did

#### Model evaluation

Test with test dataset

In [22]:
# model = load_model(filename)

In [23]:
pred_proba = model.predict(val_X)
pred_class = [np.argmax(x) for x in pred_proba]
val_Y_class = [np.argmax(x) for x in val_Y]
balanced_accuracy_score(val_Y_class, pred_class)

0.6491697731253284

In [24]:
def multiclass_conf_mat(true, pred_proba, seuil):
    pred = [[np.argmax(x), max(x)] for x in pred_proba]
    VP, FP, VN, FN = 0, 0, 0, 0
    for i in range(len(true)):
        if pred[i][1] >= seuil:
            if pred[i][0] == true[i]:
                VP += 1
            else :
                FP += 1
        else : 
            if pred[i][0] == true[i]:
                VN += 1
            else :
                FN += 1 
    return np.array([[VP,FP],[VN,FN]])
    

In [25]:
multiclass_conf_mat(val_Y, pred_proba, 0.33)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Test with a sentence

In [None]:
def predictions(text, vocab = vocab, encoding = 'count'):
    clean = clean_sentences(text)
    _, test_ls = text_embedding(clean, vocab, encoding)
    
    #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))

#     test_ls = np.array(test_ls).reshape(1, len(test_ls))

    pred = model.predict_proba(test_ls)
    return pred

In [None]:
text = "Je ne comprends pas ma dernière facture. Il y a une augmentation de 2euros."
pred_text = predictions([text])
intent_encoder.inverse_transform(pred_text)