In [50]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Embedding, LSTM
import os


# LSTM with embedding layer

In [90]:
X = pd.read_csv('mails_ohe.csv', index_col = 0)

In [None]:
X = X[X.demande_de_support != 2]
y = X.demande_de_support
X = X.drop('demande_de_support', axis = 1)

In [93]:
print(X.shape, y.shape)

(3210, 10826) (3210,)


In [94]:
# fix random seed for reproducibility
np.random.seed(777)

voc_size = X.shape[1]
embedding_vector_features=100

model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features))
model.add(LSTM(128,activation='relu',return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, None, 300)         3247800   
_________________________________________________________________
lstm_18 (LSTM)               (None, None, 128)         219648    
_________________________________________________________________
dropout_19 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
lstm_19 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_20 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 32)                4128      
_________________________________________________________________
dropout_21 (Dropout)         (None, 32)               

In [95]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)

In [None]:
model.fit(X_train,y_train,validation_data=(X_val, y_val), epochs=1, batch_size=128)

In [32]:
model.evaluate(X_test,y_test)



[0.3905010223388672, 0.8642172813415527]

# LSTM on sequences

In [None]:
X_seq = [ [k for k in range(len(mail)) if mail[k]>0] for mail in X.values ]
MAX_LENGTH = max([ len(seq) for seq in X_seq])
X_seq = pad_sequences(X_seq, maxlen = MAX_LENGTH)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_seq, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)

In [10]:
model.fit(X_train,y_train,validation_data=(X_val, y_val), epochs=1, batch_size=128)



<keras.callbacks.History at 0x1ba0e62f520>

In [11]:
model.evaluate(X_test,y_test)



[nan, 0.8642172813415527]

In [12]:
input_mail = """
    DRS - 
    Bonjour
    j'ai un souci avec le décisionnel SYNERGIE, il y a un problème de référenciel.

    Cordialement,
    Martin
    """

In [42]:
lem_mail = ['drs', 'bonjour', 'souci', 'decisionnel', 'decibel', 'probleme', 'ref']
seq = [[word_to_idx[word] for word in lem_mail]]
input_seq = pad_sequences(seq)
input_seq

array([[3934,   50,  237,   18,    9,    9,  222, 2181]])

In [43]:
model.predict(input_seq)

array([[0.6208053, 0.3791947]], dtype=float32)