In [11]:
import joblib
from gensim.models.keyedvectors import KeyedVectors
import pandas as pd
from keras.models import load_model
from preprocessing import preprocessing_df
import numpy as np
import keras.backend as K
import warnings
warnings.simplefilter('ignore')

In [2]:
def custom_f1(y_true, y_pred):    
    def recall_m(y_true, y_pred):
        TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        Positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        
        recall = TP / (Positives+K.epsilon())    
        return recall 
    
    
    def precision_m(y_true, y_pred):
        TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        Pred_Positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    
        precision = TP / (Pred_Positives+K.epsilon())
        return precision 
    
    precision, recall = precision_m(y_true, y_pred), recall_m(y_true, y_pred)
    
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [33]:
class Predicteur:
    def __init__(self):
        self.data = pd.read_json('data/mails_preprocessing.json')
        # self.embedding = KeyedVectors.load('models/MELUSINE_EMBEDDING')
        self.features = [self.data.columns[3]] + list(self.data.columns[5:])

        # self.loaded_cnn = joblib.load('models/CNN_518features_input_text_lem.pkl')
        # self.loaded_cnn.load_nn_model('models/CNN_518features_input_text_lem')

        self.loaded_bigru = joblib.load('models/RNN_518features_input_text_lem.pkl')
        self.loaded_bigru.load_nn_model('models/RNN_518features_input_text_lem')

        self.loaded_transf = joblib.load('models/TRANSF_518features_input_text_lem.pkl')
        self.loaded_transf.load_nn_model('models/TRANSF_518features_input_text_lem')

        self.loaded_lstm = load_model('models/CNN_LSTM_518features_input_emb_meta', custom_objects={'custom_f1':custom_f1})

        self.loaded_bilstm = load_model('models/BILSTM_518features_input_emb_meta', custom_objects={'custom_f1':custom_f1})

        self.d2v_wv = KeyedVectors.load_word2vec_format('models/d2v_model.bin', binary=True)

        self.features_scaler = joblib.load('models/features_scaler')
        self.embeddings_scaler = joblib.load('models/embedding_scaler')
    
    def FunctionText2Vec(self, input_df, word2vec_wv, input_col='text_lem'):
        # Converting the text to numeric data
        X = [str(sentence).split(' ') for sentence in input_df[input_col]]
        
        # Creating empty dataframe to hold sentences
        W2Vec_Data=pd.DataFrame()
        
        # Looping through each row for the data
        for sentence in X:
    
            # initiating a sentence with all zeros
            Embedded_sentence = np.zeros(word2vec_wv.vector_size)
    
            for word in sentence:
                if word in word2vec_wv.key_to_index.keys():    
                    Embedded_sentence = Embedded_sentence + word2vec_wv[word]
            Embedded_sentence.reshape(1, word2vec_wv.vector_size)

            W2Vec_Data=W2Vec_Data.append(pd.DataFrame([Embedded_sentence]))

        return W2Vec_Data
    
    def predict(self, INPUT_MAIL_DF):
        df = preprocessing_df(INPUT_MAIL_DF)
        df_features = df.iloc[:,2::].drop('text_lem', axis = 1)

        df_embeddings = self.FunctionText2Vec(df, self.d2v_wv, "text_lem")
        df_embeddings.reset_index(inplace=True, drop=True)
        
        X_embeddings = self.embeddings_scaler.transform(df_embeddings)
        X_features = self.features_scaler.transform(df_features)

        X_embeddings = np.reshape(X_embeddings, (X_embeddings.shape[0], 1, X_embeddings.shape[1]))
        X_features = np.reshape(X_features, (X_features.shape[0], 1, X_features.shape[1]))

        #pred_melusine_cnn = self.loaded_cnn.predict_proba(df)
        pred_melusine_bigru = self.loaded_bigru.predict_proba(df)
        pred_melusine_transf = self.loaded_transf.predict_proba(df)
        pred_keras_lstm = self.loaded_lstm.predict([X_embeddings, X_features])[:,0]
        pred_keras_bilstm = self.loaded_bilstm.predict([X_embeddings, X_features])[:,0]

        # probas de la classe 1 que l'on moyennise
        probas = np.array([pred_melusine_bigru[:,1], pred_melusine_transf[:,1], pred_keras_lstm[:,1], pred_keras_bilstm[:,1]])
        probas = np.mean(probas, axis = 0)
        print(probas)
        classes = pred_melusine_bigru.argmax(axis = -1)
    
        output = pd.DataFrame({
            'classe' : classes,
            'confiance' : [round(pred_melusine_bigru[k,classes[k]], 2) for k in range(len(classes))]
        })
        return pd.concat([INPUT_MAIL_DF, output], axis = 1)

In [34]:
model = Predicteur()

In [45]:
INPUT_MAIL_DF = pd.DataFrame({
    'header':[
        'ami',
        'ano cr',
        'ano synergie'
        ],
    'body':[
        "bonjour n'oubliez pas de renseigner votre mot de passe",
        "bonjour j'ai un souci au moment de générer le cr",
        "bonjour j'ai un souci au moment de générer la table"
        ]
})
y_pred = model.predict(INPUT_MAIL_DF)
y_pred

[0.7063404  0.99983585 0.500094  ]


Unnamed: 0,header,body,classe,confiance
0,ami,bonjour n'oubliez pas de renseigner votre mot ...,0,1.0
1,ano cr,bonjour j'ai un souci au moment de générer le cr,1,1.0
2,ano synergie,bonjour j'ai un souci au moment de générer la ...,0,1.0
