In [1]:
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense, Dropout, Activation, Input, MaxPooling1D, Conv1D, Flatten
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras.utils.np_utils import to_categorical
from keras import layers
from keras.optimizers import Adam, rmsprop

import matplotlib.pyplot as plt

import pickle

Using TensorFlow backend.


In [2]:
plt.style.use('ggplot')

# Function to plot results of a model
def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [3]:
# Load data
with open('DATA.pkl', 'rb') as handle:
    X, Y, vocabs = pickle.load(handle)
    
X = np.array(X)

In [4]:
# Transform multilablels Y in sparse matrix for sklearn
mlb = MultiLabelBinarizer(sparse_output=False)
Y_mlb = mlb.fit_transform(Y)

In [53]:
# Parameters
MAX_SEQUENCE_LENGTH = 2000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2

In [6]:
# Tokenization keras : map chaque mot présent dans les textes à un nombre
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index))

Number of Unique Tokens 88397


In [7]:
# Nb de mots par documents
len_X = [len(x) for x in sequences]

In [8]:
np.histogram(len_X, bins=100)

(array([465,  94,  34,  15,  14,   5,   3,   5,   7,   0,   0,   2,   1,
          0,   2,   0,   0,   1,   2,   0,   1,   2,   0,   1,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   0,   0,   0,   0,   0,   1]),
 array([     0.  ,   1452.55,   2905.1 ,   4357.65,   5810.2 ,   7262.75,
          8715.3 ,  10167.85,  11620.4 ,  13072.95,  14525.5 ,  15978.05,
         17430.6 ,  18883.15,  20335.7 ,  21788.25,  23240.8 ,  24693.35,
         26145.9 ,  27598.45,  29051.  ,  30503.55,  31956.1 ,  33408.65,
         34861.2 ,  36313.75,  37766.3 ,  39218.85,  40671.4 ,  42123.95,
         43576.5 ,  45029.05,  46481.6 ,  47934.15,  49386.7 ,  

In [9]:
# On pad les documents pour qu'ils aient tous la même taille de MAX_SEQUENCE_LENGTH
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of Data Tensor:', data.shape)

Shape of Data Tensor: (656, 2000)


In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(data, Y_mlb, test_size=0.25, random_state=1)

In [54]:
# Load pre-trained GLOVE embeddings
embeddings_index = {}
f = open('glove.6B.300d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 6B 300d.' % len(embeddings_index))

Total 400000 word vectors in Glove 6B 300d.


In [55]:
# Map the embeddings with the words indexes and load an embedding layer
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,trainable=True)

In [56]:
# Creation du modèle CNN
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(len(Y_mlb[0]), activation='sigmoid')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy', metrics=["accuracy"],
              optimizer=rmsprop())

print("Simplified convolutional neural network")
model.summary()
#cp=ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)

Simplified convolutional neural network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 2000)              0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 2000, 300)         26519400  
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 1996, 128)         192128    
_________________________________________________________________
max_pooling1d_19 (MaxPooling (None, 399, 128)          0         
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 395, 128)          82048     
_________________________________________________________________
max_pooling1d_20 (MaxPooling (None, 79, 128)           0         
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 

In [57]:
history=model.fit(X_train, y_train, validation_data=(X_valid, y_valid),epochs=10, batch_size=5)

Train on 492 samples, validate on 164 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [58]:
model.evaluate(X_valid, y_valid)



[0.15852259717336514, 0.9644072462872761]

In [59]:
preds = model.predict(X_valid)

In [60]:
preds[20]

array([4.63691354e-03, 1.58578157e-03, 1.57746673e-02, 7.53998756e-05,
       3.38536799e-02, 6.94960356e-04, 2.18749046e-04, 6.31242990e-04,
       1.31171942e-03, 2.92062759e-06, 1.31130219e-06, 1.04227662e-03,
       4.23312187e-04, 4.76837158e-05, 4.83870506e-04, 1.71446800e-03,
       4.94956970e-04, 4.29153442e-05, 3.60012054e-05, 7.90933669e-02,
       5.21773100e-03, 2.02158093e-03, 1.21682882e-04, 1.29103661e-04,
       5.96046448e-07, 1.31657720e-03, 5.12927771e-04, 1.59133703e-01,
       1.23244226e-02, 1.60932541e-06, 1.34261787e-01, 1.25467777e-05,
       7.68899918e-06, 1.01077557e-03, 8.18440318e-03, 5.25400043e-03,
       4.07648087e-03, 1.48117542e-05, 2.98023224e-08, 4.02331352e-06,
       1.04883015e-02, 1.74617171e-02, 5.02824783e-04], dtype=float32)

In [61]:
preds_int = np.zeros_like(preds)
preds_int[preds>=0.5] = 1
preds_int[preds<0.5] = 0

In [62]:
preds_int

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [63]:
from sklearn.metrics import f1_score

f1_score(y_valid, preds_int, average="micro")

0.059925093632958795

In [64]:
def cust_metric(y_true, y_pred):
    row_maxs = y_pred.max(axis=1, keepdims=True)
    maxis = np.where(y_pred == row_maxs, 1, 0)

In [65]:
row_maxs = preds.max(axis=1, keepdims=True)
maxis = np.where(preds == row_maxs, 1, 0)
check = y_valid[maxis == 1]
np.mean(check)
#np.where(a == row_maxes).astype(int)

0.09146341463414634

In [241]:
print(preds[6])
#print(maxis[0])
print(preds.max(axis=1, keepdims=True))

[0.01333001 0.00752717 0.01215923 0.00122723 0.02035478 0.0052467
 0.03618518 0.0140928  0.00968164 0.00032535 0.00137419 0.02849156
 0.07415041 0.02470866 0.01308507 0.00240651 0.00574085 0.03909755
 0.01878512 0.04676759 0.09884825 0.00591114 0.00367641 0.00859213
 0.00017795 0.04916164 0.00138214 0.03059036 0.01334745 0.00021717
 0.01634333 0.0010317  0.00446758 0.02561405 0.04847297 0.02461442
 0.00649095 0.01161435 0.00094911 0.00284097 0.03039992 0.01263708
 0.00903228]
[0.11438799 0.13647974 0.12322527 0.14172405 0.11731407 0.11743912
 0.09884825 0.13318169 0.11369902 0.14408836 0.12593278 0.10090151
 0.11628634 0.12908185 0.12012365 0.12707254 0.12643197 0.11636412
 0.1157597  0.11926961 0.10910758 0.11495519 0.11769584 0.11668715
 0.13547331 0.14047033 0.12012365 0.11812368 0.11942065 0.11653465
 0.11468142 0.11641157 0.11286855 0.13846344 0.12726185 0.11819544
 0.11703414 0.11418089 0.11628655 0.11277321 0.11781695 0.13115063
 0.11593491 0.11848027 0.10343263 0.11087468 0.115